Case Study: Social Media Feed (như Facebook News Feed)

Tổng Quan

Social Media Feed là hệ thống hiển thị nội dung cá nhân hóa cho users dựa trên hoạt động của friends và interests. Đây là một trong những system design phức tạp nhất.

Requirements

Functional Requirements

1. Users có thể đăng posts (text, images, videos)
2. Users có thể follow/unfollow other users
3. Generate news feed cho users
4. Support likes, comments, shares
5. Real-time notifications

Non-Functional Requirements

- Scale: 1B users, 100M daily active users
- Posts: 100M posts/day
- Feed generation: < 100ms
- Availability: 99.99%
- Consistency: Eventual consistency acceptable

Capacity Estimation

Storage Requirements

class SocialMediaCapacity:
    """Back-of-the-envelope daily storage estimate for the feed service.

    Every workload assumption is a constructor parameter; the defaults are
    the figures used in this case study, so ``SocialMediaCapacity()`` behaves
    exactly as before.

    Args:
        daily_active_users: Number of users active per day.
        posts_per_user_per_day: Average posts created per active user per day.
        avg_post_size: Average size of a text-only post, in bytes.
        media_posts_ratio: Fraction of posts (0.0 to 1.0) that carry media.
        avg_media_size: Average size of one media post, in bytes.

    Raises:
        ValueError: If ``media_posts_ratio`` is outside the range [0, 1].
    """

    def __init__(
        self,
        daily_active_users=100_000_000,
        posts_per_user_per_day=1,
        avg_post_size=1024,  # bytes
        media_posts_ratio=0.3,
        avg_media_size=5 * 1024 * 1024,  # 5MB
    ):
        if not 0 <= media_posts_ratio <= 1:
            raise ValueError("media_posts_ratio must be between 0 and 1")

        self.daily_active_users = daily_active_users
        self.posts_per_user_per_day = posts_per_user_per_day
        self.avg_post_size = avg_post_size
        self.media_posts_ratio = media_posts_ratio
        self.avg_media_size = avg_media_size

    def daily_storage_requirement(self):
        """Return the estimated number of bytes of new content stored per day.

        Posts are split into text-only and media posts by
        ``media_posts_ratio``. A media post is counted at ``avg_media_size``
        only; its text part is not added on top.
        """
        total_posts = self.daily_active_users * self.posts_per_user_per_day
        text_posts = total_posts * (1 - self.media_posts_ratio)
        media_posts = total_posts * self.media_posts_ratio

        text_storage = text_posts * self.avg_post_size
        media_storage = media_posts * self.avg_media_size

        return text_storage + media_storage

Traffic Estimation

Feed Reads: 100M users * 20 feed refreshes/day = 2B reads/day
Feed Writes: 100M users * 1 post/day = 100M writes/day

Peak QPS (giả định peak gấp 2 lần mức trung bình):
- Read QPS: 2B / (24 * 3600) * 2 = ~46K QPS
- Write QPS: 100M / (24 * 3600) * 2 = ~2.3K QPS

Database Design

Core Tables

-- Users table
-- One row per account, keyed by user_id; username must be unique.
-- follower_count / following_count are stored counters that default to 0.
-- NOTE(review): presumably kept in sync with the followers table by
-- application code; no trigger is defined here. Confirm.
CREATE TABLE users (
    user_id BIGINT PRIMARY KEY,
    username VARCHAR(50) UNIQUE,
    email VARCHAR(100),
    profile_image_url TEXT,
    created_at TIMESTAMP,
    follower_count INT DEFAULT 0,
    following_count INT DEFAULT 0
);

-- Posts table
-- One row per post, keyed by post_id; user_id references the author in users.
-- media_urls is a JSON column; post_type is one of 'text', 'image', 'video'.
-- like_count / comment_count / share_count are stored counters defaulting to 0.
-- NOTE(review): inline ENUM(...) is MySQL-style syntax; other engines may need
-- a separate type definition. Confirm the target dialect.
CREATE TABLE posts (
    post_id BIGINT PRIMARY KEY,
    user_id BIGINT REFERENCES users(user_id),
    content TEXT,
    media_urls JSON,
    post_type ENUM('text', 'image', 'video'),
    created_at TIMESTAMP,
    like_count INT DEFAULT 0,
    comment_count INT DEFAULT 0,
    share_count INT DEFAULT 0
);

-- Followers relationship
-- One row per (follower_id, followee_id) pair; both columns reference users.
-- The composite primary key prevents duplicate follow edges.
-- NOTE(review): the primary key leads with follower_id, so a lookup by
-- followee_id alone is not covered by it; consider a secondary index on
-- followee_id if followers of a given user are queried.
CREATE TABLE followers (
    follower_id BIGINT REFERENCES users(user_id),
    followee_id BIGINT REFERENCES users(user_id),
    created_at TIMESTAMP,
    PRIMARY KEY (follower_id, followee_id)
);

-- Likes table
-- One row per (post_id, user_id) pair; the composite primary key means a
-- user can have at most one like row per post.
CREATE TABLE likes (
    post_id BIGINT REFERENCES posts(post_id),
    user_id BIGINT REFERENCES users(user_id),
    created_at TIMESTAMP,
    PRIMARY KEY (post_id, user_id)
);

Feed Generation Strategies

1. Pull Model (Lazy Loading)

class PullBasedFeed:
    """Pull model: the feed is assembled at the moment it is requested.

    Relies on ``get_following_users``, ``get_recent_posts`` and
    ``time_decay_score`` being available on the instance; they are not
    defined in this class.
    """

    def generate_feed(self, user_id, limit=20):
        """Return the ``limit`` highest-scoring recent posts for ``user_id``.

        Up to 10 recent posts are requested from each followed user, all of
        them are scored with ``ranking_score`` and the best ``limit`` are
        returned, highest score first.
        """
        candidates = [
            post
            for followee in self.get_following_users(user_id)
            for post in self.get_recent_posts(followee, limit=10)
        ]

        # In-place sort; a stable sort keeps tied posts in fetch order.
        candidates.sort(key=self.ranking_score, reverse=True)
        return candidates[:limit]

    def ranking_score(self, post):
        """Score a post as time-decay score plus weighted engagement.

        A comment counts twice as much as a like.
        """
        recency = self.time_decay_score(post.created_at)
        engagement = post.like_count + post.comment_count * 2
        return recency + engagement

2. Push Model (Pre-computed)

class PushBasedFeed:
    """Push model: every new post is written into each follower's cached feed.

    Feeds live in ``self.feed_cache`` under the key ``feed:<user_id>`` and are
    accessed with sorted-set style calls (``zadd``, ``zremrangebyrank``,
    ``zrevrange``), scored by the post's creation timestamp.
    ``get_followers`` and ``get_posts_by_ids`` are expected on the instance;
    they are not defined in this class.
    """

    def __init__(self):
        self.feed_cache = Redis()

    def on_new_post(self, post):
        """Add ``post`` to the feed of every follower of its author."""
        for recipient_id in self.get_followers(post.user_id):
            self.add_to_feed(recipient_id, post)

    def add_to_feed(self, user_id, post):
        """Insert ``post`` into ``user_id``'s feed, then trim the feed."""
        cache = self.feed_cache
        key = f"feed:{user_id}"
        entry = {f"post:{post.id}": post.created_at.timestamp()}

        cache.zadd(key, entry)
        # Drop ranks 0..-1001 so that at most the last 1000 ranks remain.
        cache.zremrangebyrank(key, 0, -1001)

    def get_feed(self, user_id, limit=20):
        """Return posts for the first ``limit`` entries of the reversed feed."""
        newest_first = self.feed_cache.zrevrange(f"feed:{user_id}", 0, limit - 1)
        return self.get_posts_by_ids(newest_first)

3. Hybrid Model

class HybridFeed:
    """Hybrid model: pre-computed feed merged with celebrity posts fetched on read.

    ``get_precomputed_feed``, ``get_celebrity_posts`` and ``merge_and_rank``
    are expected on the instance; they are not defined in this class.
    """

    def __init__(self):
        # Follower count threshold (1M); not read by any method in this class.
        self.celebrity_threshold = 1000000

    def generate_feed(self, user_id):
        """Return the user's pre-computed feed merged and ranked with celebrity posts."""
        return self.merge_and_rank(
            self.get_precomputed_feed(user_id),
            self.get_celebrity_posts(user_id),
        )

Ranking Algorithm

Content Scoring

class FeedRanking:
    """Hand-tuned scoring of a post for a given viewer.

    ``get_relationship_strength`` and ``get_content_preference`` are expected
    on the instance; they are not defined in this class.
    """

    def calculate_score(self, post, user_id):
        """Return the weighted ranking score of ``post`` for ``user_id``.

        The score is ``0.3 * recency + 0.3 * engagement + 0.2 * relationship
        + 0.2 * content_preference``.

        Args:
            post: Object exposing ``created_at`` (a datetime comparable with
                ``datetime.now()``), ``like_count``, ``comment_count``,
                ``share_count``, ``user_id`` and ``post_type``.
            user_id: The viewer the score is computed for.

        Returns:
            The combined score as a float.
        """
        # timedelta has no ``.hours`` attribute; derive hours from the total
        # number of seconds so multi-day ages are counted as well.
        age = datetime.now() - post.created_at
        hours_since_post = age.total_seconds() / 3600

        # Recency score (exponential decay): 1.0 for a brand-new post,
        # 1/e after 24 hours.
        recency_score = math.exp(-hours_since_post / 24)

        # Engagement score: shares weigh more than comments, comments more
        # than likes.
        # NOTE(review): this is a raw weighted count with no upper bound,
        # while recency is at most 1 for posts in the past, so engagement can
        # dominate the total. Consider normalizing.
        engagement_score = (
            post.like_count * 1 +
            post.comment_count * 3 +
            post.share_count * 5
        )

        # User relationship score
        relationship_score = self.get_relationship_strength(user_id, post.user_id)

        # Content type preference
        content_preference = self.get_content_preference(user_id, post.post_type)

        return (
            recency_score * 0.3 +
            engagement_score * 0.3 +
            relationship_score * 0.2 +
            content_preference * 0.2
        )

Machine Learning Integration

class MLRanking:
    """Ranks posts with a model that scores one feature set per post.

    ``load_ranking_model``, ``get_interaction_history`` and
    ``get_user_preferences`` are expected on the instance; they are not
    defined in this class.
    """

    def __init__(self):
        self.model = self.load_ranking_model()

    def rank_posts(self, user_id, posts):
        """Return ``posts`` ordered by model score, highest first.

        Args:
            user_id: The viewer the ranking is computed for.
            posts: Iterable of post objects accepted by ``extract_features``.

        Returns:
            A new list of the same posts; an empty list if there are none.
        """
        # Materialize once: the posts are walked twice (feature extraction,
        # then pairing with scores), which would exhaust a generator.
        posts = list(posts)
        if not posts:
            # Nothing to score; avoid calling the model on an empty batch.
            return []

        features = [self.extract_features(user_id, post) for post in posts]
        scores = self.model.predict(features)

        # Sort on the score only, so post objects never need to be comparable.
        ranked_posts = sorted(
            zip(posts, scores),
            key=lambda x: x[1],
            reverse=True
        )

        return [post for post, score in ranked_posts]

    def extract_features(self, user_id, post):
        """Build the feature dict the model consumes for one (viewer, post) pair."""
        # timedelta has no ``.hours`` attribute; convert total seconds to
        # hours so ages longer than a day are represented correctly.
        age = datetime.now() - post.created_at
        return {
            'post_age_hours': age.total_seconds() / 3600,
            'author_follower_count': post.author.follower_count,
            'user_author_interaction_history': self.get_interaction_history(user_id, post.user_id),
            'post_engagement_rate': post.engagement_rate,
            'post_type': post.post_type,
            'user_content_preferences': self.get_user_preferences(user_id)
        }

Real-time Updates

WebSocket Implementation

class RealTimeUpdates:
    """Keeps one live websocket per user and pushes new-post notifications.

    ``handle_message`` and ``get_followers`` are expected on the instance;
    they are not defined in this class.
    """

    def __init__(self):
        # user_id -> the websocket most recently registered for that user
        self.websocket_connections = {}

    async def handle_connection(self, websocket, user_id):
        """Register ``websocket`` for ``user_id`` and process its messages.

        Runs until the websocket's message iterator ends or raises, then
        unregisters the connection.
        """
        self.websocket_connections[user_id] = websocket
        try:
            async for message in websocket:
                await self.handle_message(user_id, message)
        finally:
            # Remove the entry only if it is still this socket. The same user
            # may have reconnected in the meantime; an unconditional delete
            # would unregister the newer connection, and would raise KeyError
            # if the entry was already gone.
            if self.websocket_connections.get(user_id) is websocket:
                del self.websocket_connections[user_id]

    async def broadcast_new_post(self, post):
        """Send a ``new_post`` notification to every connected follower.

        Followers without a registered websocket are skipped.

        NOTE(review): sends are awaited one by one and an exception from one
        ``send`` propagates, so later followers are not notified. Decide
        whether failures should be isolated per connection.
        """
        followers = self.get_followers(post.user_id)

        # Serialize once; the payload is identical for every follower.
        payload = json.dumps({
            'type': 'new_post',
            'post_id': post.id,
            'author': post.author.username
        })

        for follower_id in followers:
            connection = self.websocket_connections.get(follower_id)
            if connection is not None:
                await connection.send(payload)

Caching Strategy

Multi-level Caching

class FeedCache:
    """Two-level feed cache: in-process dict (L1) in front of a shared cache (L2).

    ``generate_fresh_feed`` and ``cache_feed`` are expected on the instance;
    they are not defined in this class.

    NOTE(review): the L1 dict has no size bound or expiry in this class, so
    entries stay until ``invalidate_feed`` is called. Confirm that is intended.
    NOTE(review): L2 is read with ``get`` on ``feed:<user_id>``; if the same
    store also holds the sorted-set feeds written by ``PushBasedFeed`` under
    that key, the value types will not match. Verify the key space.
    """

    def __init__(self):
        self.l1_cache = {}  # Application memory
        self.l2_cache = Redis()  # Redis cache
        self.cdn = CDN()  # CDN for media

    def get_feed(self, user_id):
        """Return the feed for ``user_id``, trying L1, then L2, then regeneration.

        An L2 hit is copied into L1. On a miss in both levels the feed is
        produced by ``generate_fresh_feed`` and handed to ``cache_feed``.
        """
        # Check L1 cache
        if user_id in self.l1_cache:
            return self.l1_cache[user_id]

        # Check L2 cache. Compare against None rather than truthiness: an
        # empty cached feed is a valid hit and must not trigger a
        # regeneration on every request.
        cached_feed = self.l2_cache.get(f"feed:{user_id}")
        if cached_feed is not None:
            self.l1_cache[user_id] = cached_feed
            return cached_feed

        # Generate fresh feed
        fresh_feed = self.generate_fresh_feed(user_id)
        self.cache_feed(user_id, fresh_feed)
        return fresh_feed

    def invalidate_feed(self, user_id):
        """Drop ``user_id``'s feed from both cache levels; a no-op if absent in L1."""
        self.l1_cache.pop(user_id, None)
        self.l2_cache.delete(f"feed:{user_id}")

Scalability Solutions

Database Sharding

class FeedSharding:
    """Maps user and post ids onto shard numbers by modulo ``num_shards``."""

    def __init__(self, num_shards=1000):
        self.num_shards = num_shards

    def _bucket(self, key):
        # Shared modulo step behind both id-to-shard lookups.
        return key % self.num_shards

    def get_user_shard(self, user_id):
        """Return the shard number for ``user_id``."""
        return self._bucket(user_id)

    def get_post_shard(self, post_id):
        """Return the shard number for ``post_id``."""
        return self._bucket(post_id)

    def get_timeline_shard(self, user_id):
        """Return the timeline's shard, which is the user's own shard."""
        # Keep user's timeline in same shard as user data
        shard = self.get_user_shard(user_id)
        return shard

Content Delivery Network

class MediaCDN:
    """Uploads media to several regions and resolves a URL per user location.

    ``upload_to_region`` and ``select_cdn_by_location`` are expected on the
    instance; they are not defined in this class.
    """

    def upload_media(self, media_file, user_id):
        """Upload ``media_file`` to each region and return the resulting URLs.

        ``user_id`` is accepted but not used by this method.

        Returns:
            A dict with ``primary_url`` (the first region's URL) and
            ``regional_urls`` (all URLs, in region order).
        """
        regional = [
            self.upload_to_region(media_file, region)
            for region in ('us-east', 'eu-west', 'asia-pacific')
        ]
        return {'primary_url': regional[0], 'regional_urls': regional}

    def get_optimal_url(self, media_id, user_location):
        """Return the URL chosen by ``select_cdn_by_location`` for this media and location."""
        chosen = self.select_cdn_by_location(media_id, user_location)
        return chosen

Analytics và Monitoring

Feed Metrics

class FeedAnalytics:
    """Records feed interactions and derives a per-user engagement rate.

    ``analytics_queue``, ``get_session_id``, ``get_user_interactions`` and
    ``get_feed_views`` are expected on the instance; they are not defined in
    this class.
    """

    def track_feed_interaction(self, user_id, action, post_id):
        """Put one interaction event (e.g. view, like, comment, share) on the queue."""
        recorded_at = datetime.utcnow()
        session = self.get_session_id(user_id)

        # Send to analytics pipeline
        self.analytics_queue.put({
            'user_id': user_id,
            'action': action,
            'post_id': post_id,
            'timestamp': recorded_at,
            'session_id': session,
        })

    def calculate_engagement_rate(self, user_id, time_window):
        """Return interactions per feed view in ``time_window``, or 0 with no views."""
        interactions = self.get_user_interactions(user_id, time_window)
        feed_views = self.get_feed_views(user_id, time_window)

        return len(interactions) / feed_views if feed_views != 0 else 0

Security Considerations

Content Moderation

class ContentModerator:
    """Automated first-pass moderation of a post's text and media.

    ``contains_spam``, ``contains_inappropriate_content`` and ``scan_image``
    are expected on the instance; they are not defined in this class.
    """

    def moderate_post(self, post):
        """Return the moderation verdict for ``post``.

        Returns:
            ``'rejected'`` if the text is spam; ``'flagged_for_review'`` if
            the text is inappropriate or any media URL fails the image scan;
            otherwise ``'approved'``.
        """
        text = post.content
        if self.contains_spam(text):
            return 'rejected'

        # The text check runs first; media is scanned only if the text
        # passed, and scanning stops at the first flagged URL.
        needs_review = self.contains_inappropriate_content(text) or any(
            self.scan_image(media_url) for media_url in (post.media_urls or ())
        )
        return 'flagged_for_review' if needs_review else 'approved'

Next Steps

Nội dung này sẽ được mở rộng thêm với:

- Advanced ML ranking algorithms
- Real-time personalization
- A/B testing frameworks
- Global content distribution
- Privacy và data protection