Case Study: Social Media Feed (như Facebook News Feed)
Tổng Quan
Social Media Feed là hệ thống hiển thị nội dung cá nhân hóa cho users dựa trên hoạt động của friends và interests. Đây là một trong những system design phức tạp nhất.
Requirements
Functional Requirements
1. Users có thể đăng posts (text, images, videos)
2. Users có thể follow/unfollow other users
3. Generate news feed cho users
4. Support likes, comments, shares
5. Real-time notifications
Non-Functional Requirements
- Scale: 1B users, 100M daily active users
- Posts: 100M posts/day
- Feed generation: < 100ms
- Availability: 99.99%
- Consistency: Eventual consistency acceptable
Capacity Estimation
Storage Requirements
class SocialMediaCapacity:
    """Back-of-the-envelope estimate of storage consumed by one day of posts.

    Every constructor argument defaults to the figure used in this case
    study, so ``SocialMediaCapacity()`` reproduces the original numbers while
    other workloads can be modelled by overriding individual values.

    Args:
        daily_active_users: Number of users active per day.
        posts_per_user_per_day: Average posts created by each active user.
        avg_post_size: Average size of a text post, in bytes.
        media_posts_ratio: Fraction of posts that carry media (0.0 - 1.0).
        avg_media_size: Average size of a media post, in bytes.

    Raises:
        ValueError: If ``media_posts_ratio`` is outside ``[0, 1]``.
    """

    def __init__(
        self,
        daily_active_users=100_000_000,
        posts_per_user_per_day=1,
        avg_post_size=1024,  # bytes
        media_posts_ratio=0.3,
        avg_media_size=5 * 1024 * 1024,  # 5MB
    ):
        if not 0 <= media_posts_ratio <= 1:
            raise ValueError("media_posts_ratio must be between 0 and 1")
        self.daily_active_users = daily_active_users
        self.posts_per_user_per_day = posts_per_user_per_day
        self.avg_post_size = avg_post_size
        self.media_posts_ratio = media_posts_ratio
        self.avg_media_size = avg_media_size

    def daily_storage_requirement(self):
        """Return the estimated bytes of new post data written per day.

        Posts are split into text-only and media posts by
        ``media_posts_ratio``. A text post is charged ``avg_post_size`` and a
        media post is charged ``avg_media_size`` only (the media post's own
        text is not added on top).
        """
        text_posts = self.daily_active_users * self.posts_per_user_per_day * (1 - self.media_posts_ratio)
        media_posts = self.daily_active_users * self.posts_per_user_per_day * self.media_posts_ratio
        text_storage = text_posts * self.avg_post_size
        media_storage = media_posts * self.avg_media_size
        return text_storage + media_storage
Traffic Estimation
Feed Reads: 100M users * 20 feed refreshes/day = 2B reads/day
Feed Writes: 100M users * 1 post/day = 100M writes/day
Peak QPS (giả định peak traffic = 2x average, tương ứng với hệ số `* 2` trong các công thức bên dưới):
- Read QPS: 2B / (24 * 3600) * 2 = ~46K QPS
- Write QPS: 100M / (24 * 3600) * 2 = ~2.3K QPS
Database Design
Core Tables
-- Users table: one row per account, keyed by user_id.
CREATE TABLE users (
user_id BIGINT PRIMARY KEY,
-- Handle of at most 50 characters; UNIQUE rejects duplicates.
username VARCHAR(50) UNIQUE,
-- NOTE(review): email has no UNIQUE / NOT NULL constraint — confirm that
-- duplicate or missing emails are intended.
email VARCHAR(100),
profile_image_url TEXT,
created_at TIMESTAMP,
-- Counters stored on the row, starting at 0. Presumably denormalized from the
-- followers table so profile reads avoid a COUNT(*) — confirm how they are
-- kept in sync.
follower_count INT DEFAULT 0,
following_count INT DEFAULT 0
);
-- Posts table: one row per post, keyed by post_id and linked to its author.
-- NOTE(review): the inline ENUM(...) column type is MySQL syntax, but MySQL
-- parses and ignores inline REFERENCES clauses, so the foreign key below would
-- not be enforced there. PostgreSQL enforces it but needs CREATE TYPE for the
-- enum. Confirm the target database.
CREATE TABLE posts (
post_id BIGINT PRIMARY KEY,
user_id BIGINT REFERENCES users(user_id), -- author of the post
content TEXT,
-- JSON document holding the post's media URLs; presumably an array of URL
-- strings — confirm against the writer.
media_urls JSON,
post_type ENUM('text', 'image', 'video'),
created_at TIMESTAMP,
-- Engagement counters stored on the row, starting at 0.
like_count INT DEFAULT 0,
comment_count INT DEFAULT 0,
share_count INT DEFAULT 0
);
-- Followers relationship: one row per directed edge "follower_id follows
-- followee_id".
CREATE TABLE followers (
follower_id BIGINT REFERENCES users(user_id),
followee_id BIGINT REFERENCES users(user_id),
created_at TIMESTAMP,
-- The composite key allows at most one row per (follower, followee) pair.
-- Its leading column is follower_id, which suits "whom does X follow".
-- NOTE(review): "who follows X" filters on followee_id alone and would
-- likely need a secondary index — confirm.
PRIMARY KEY (follower_id, followee_id)
);
-- Likes table: one row per (post, user) like.
CREATE TABLE likes (
post_id BIGINT REFERENCES posts(post_id),
user_id BIGINT REFERENCES users(user_id),
created_at TIMESTAMP,
-- The composite key allows a given user to like a given post at most once.
PRIMARY KEY (post_id, user_id)
);
Feed Generation Strategies
1. Pull Model (Lazy Loading)
class PullBasedFeed:
    """Pull-model feed: posts are collected and ranked when the feed is requested.

    ``get_following_users``, ``get_recent_posts`` and ``time_decay_score`` are
    not defined in this class and are expected to be provided elsewhere.
    """

    def generate_feed(self, user_id, limit=20):
        """Return the ``limit`` highest-ranked recent posts for ``user_id``.

        Up to 10 recent posts are requested from each followed user, the
        combined candidates are ordered by ``ranking_score`` (highest first),
        and the list is cut to ``limit`` entries.
        """
        candidates = [
            post
            for author_id in self.get_following_users(user_id)
            for post in self.get_recent_posts(author_id, limit=10)
        ]
        candidates.sort(key=self.ranking_score, reverse=True)
        return candidates[:limit]

    def ranking_score(self, post):
        """Score a post as its time-decay score plus a simple engagement sum.

        Engagement counts each like once and each comment twice.
        """
        engagement = post.like_count + post.comment_count * 2
        return self.time_decay_score(post.created_at) + engagement
2. Push Model (Pre-computed)
class PushBasedFeed:
    """Push-model feed: each new post is written into every follower's feed.

    Feeds live in sorted sets under the key ``feed:<user_id>``. Members are
    ``post:<post id>`` strings scored by the post's creation timestamp.
    ``get_followers`` and ``get_posts_by_ids`` are not defined in this class
    and are expected to be provided elsewhere.

    Args:
        feed_cache: Client exposing ``zadd``, ``zremrangebyrank`` and
            ``zrevrange``. When omitted, ``Redis()`` is constructed as before.
        max_feed_size: Maximum number of entries kept per feed (default 1000).

    Raises:
        ValueError: If ``max_feed_size`` is smaller than 1.
    """

    def __init__(self, feed_cache=None, max_feed_size=1000):
        if max_feed_size < 1:
            raise ValueError("max_feed_size must be at least 1")
        self.feed_cache = Redis() if feed_cache is None else feed_cache
        self.max_feed_size = max_feed_size

    def on_new_post(self, post):
        """Add ``post`` to the feed of every follower of its author."""
        for follower_id in self.get_followers(post.user_id):
            self.add_to_feed(follower_id, post)

    def add_to_feed(self, user_id, post):
        """Insert ``post`` into ``user_id``'s feed and trim the feed to size."""
        feed_key = f"feed:{user_id}"
        self.feed_cache.zadd(feed_key, {
            f"post:{post.id}": post.created_at.timestamp()
        })
        # Rank 0 is the lowest score, i.e. the oldest post. Removing ranks
        # 0 .. -(max_feed_size + 1) leaves only the newest max_feed_size.
        self.feed_cache.zremrangebyrank(feed_key, 0, -(self.max_feed_size + 1))

    def get_feed(self, user_id, limit=20):
        """Return up to ``limit`` newest feed entries for ``user_id``.

        A non-positive ``limit`` returns an empty list. Without this guard,
        ``limit=0`` would turn into ``zrevrange(key, 0, -1)``, which selects
        the entire feed instead of nothing.
        """
        if limit <= 0:
            return []
        feed_key = f"feed:{user_id}"
        post_ids = self.feed_cache.zrevrange(feed_key, 0, limit - 1)
        # NOTE(review): members are stored with a "post:" prefix; presumably
        # get_posts_by_ids strips it — confirm.
        return self.get_posts_by_ids(post_ids)
3. Hybrid Model
class HybridFeed:
    """Hybrid feed: a pre-computed feed merged with celebrity posts at read time.

    ``get_precomputed_feed``, ``get_celebrity_posts`` and ``merge_and_rank``
    are not defined in this class and are expected to be provided elsewhere.
    """

    def __init__(self):
        # Follower count of 1M. It is stored here but not read by any method
        # in this class.
        self.celebrity_threshold = 1_000_000

    def generate_feed(self, user_id):
        """Return the merged, ranked feed for ``user_id``.

        The pre-computed feed is loaded first, then celebrity posts are
        fetched, and both are handed to ``merge_and_rank``.
        """
        return self.merge_and_rank(
            self.get_precomputed_feed(user_id),
            self.get_celebrity_posts(user_id),
        )
Ranking Algorithm
Content Scoring
class FeedRanking:
    """Hand-tuned weighted scoring of a post for a particular viewer.

    ``get_relationship_strength`` and ``get_content_preference`` are not
    defined in this class and are expected to be provided elsewhere.
    """

    def calculate_score(self, post, user_id):
        """Return the weighted ranking score of ``post`` for ``user_id``.

        The score is ``0.3 * recency + 0.3 * engagement + 0.2 * relationship
        + 0.2 * content preference``.

        Args:
            post: Object exposing ``created_at``, ``like_count``,
                ``comment_count``, ``share_count``, ``user_id`` and
                ``post_type``.
            user_id: The viewer the feed is being ranked for.
        """
        # timedelta has no ``.hours`` attribute, so derive hours from
        # total_seconds(). Clamp at zero so a timestamp slightly in the
        # future (clock skew) cannot push the recency score above 1.
        # NOTE(review): datetime.now() is naive local time — assumes
        # post.created_at is naive in the same timezone; confirm.
        age = datetime.now() - post.created_at
        hours_since_post = max(age.total_seconds() / 3600, 0.0)

        # Recency score (exponential decay): 1.0 for a brand-new post,
        # falling to 1/e after 24 hours.
        recency_score = math.exp(-hours_since_post / 24)

        # Engagement score: a share counts 5x and a comment 3x a like.
        # NOTE(review): this is a raw count while recency is at most 1, so
        # engagement can dominate the total — confirm whether normalization
        # is intended.
        engagement_score = (
            post.like_count * 1 +
            post.comment_count * 3 +
            post.share_count * 5
        )

        # User relationship score
        relationship_score = self.get_relationship_strength(user_id, post.user_id)

        # Content type preference
        content_preference = self.get_content_preference(user_id, post.post_type)

        return (
            recency_score * 0.3 +
            engagement_score * 0.3 +
            relationship_score * 0.2 +
            content_preference * 0.2
        )
Machine Learning Integration
class MLRanking:
    """Ranks posts using scores produced by a loaded model.

    ``load_ranking_model``, ``get_interaction_history`` and
    ``get_user_preferences`` are not defined in this class and are expected
    to be provided elsewhere.
    """

    def __init__(self):
        self.model = self.load_ranking_model()

    def rank_posts(self, user_id, posts):
        """Return ``posts`` ordered by model score, highest first.

        Args:
            user_id: The viewer the posts are ranked for.
            posts: Iterable of post objects accepted by ``extract_features``.

        Returns:
            A list of the same posts, sorted by descending score. Empty input
            returns ``[]`` without calling the model.
        """
        # Materialize once: posts is walked twice (features, then zip), which
        # would silently drop everything if the caller passed an iterator.
        posts = list(posts)
        if not posts:
            return []

        features = [self.extract_features(user_id, post) for post in posts]
        scores = self.model.predict(features)

        # Sort posts by predicted engagement probability
        ranked_posts = sorted(
            zip(posts, scores),
            key=lambda pair: pair[1],
            reverse=True
        )
        return [post for post, _ in ranked_posts]

    def extract_features(self, user_id, post):
        """Build the feature dict describing ``post`` for viewer ``user_id``."""
        # timedelta has no ``.hours`` attribute; derive fractional hours from
        # total_seconds() instead.
        age = datetime.now() - post.created_at
        return {
            'post_age_hours': age.total_seconds() / 3600,
            'author_follower_count': post.author.follower_count,
            'user_author_interaction_history': self.get_interaction_history(user_id, post.user_id),
            'post_engagement_rate': post.engagement_rate,
            'post_type': post.post_type,
            'user_content_preferences': self.get_user_preferences(user_id)
        }
Real-time Updates
WebSocket Implementation
class RealTimeUpdates:
    """Tracks one live WebSocket per user and pushes new-post notifications.

    ``handle_message`` and ``get_followers`` are not defined in this class
    and are expected to be provided elsewhere.
    """

    def __init__(self):
        # Maps user_id -> the most recently registered websocket for that user.
        self.websocket_connections = {}

    async def handle_connection(self, websocket, user_id):
        """Register ``websocket`` for ``user_id`` and process its messages.

        Each incoming message is passed to ``handle_message``. The
        registration is removed when the message loop ends or raises.
        """
        self.websocket_connections[user_id] = websocket
        try:
            async for message in websocket:
                await self.handle_message(user_id, message)
        finally:
            # Only unregister if this socket is still the registered one. If
            # the user reconnected meanwhile, the entry belongs to the newer
            # socket: an unconditional ``del`` would drop that live
            # connection, and the newer handler's own cleanup would then
            # raise KeyError.
            if self.websocket_connections.get(user_id) is websocket:
                del self.websocket_connections[user_id]

    async def broadcast_new_post(self, post):
        """Send a ``new_post`` notification to every connected follower."""
        # Notify followers about new post
        followers = self.get_followers(post.user_id)
        # The payload is the same for every follower, so serialize it once.
        payload = json.dumps({
            'type': 'new_post',
            'post_id': post.id,
            'author': post.author.username
        })
        for follower_id in followers:
            connection = self.websocket_connections.get(follower_id)
            if connection is not None:
                await connection.send(payload)
Caching Strategy
Multi-level Caching
class FeedCache:
    """Two-level feed cache: an in-process dict backed by a shared cache.

    ``generate_fresh_feed`` and ``cache_feed`` are not defined in this class
    and are expected to be provided elsewhere.
    """

    def __init__(self):
        self.l1_cache = dict()    # Application memory
        self.l2_cache = Redis()   # Redis cache
        self.cdn = CDN()          # CDN for media

    def get_feed(self, user_id):
        """Return the feed for ``user_id``, consulting L1, then L2, then regenerating.

        An L2 hit is copied into L1 before being returned. On a miss in both
        levels, a fresh feed is generated and handed to ``cache_feed``.
        """
        # L1: in-process lookup.
        try:
            return self.l1_cache[user_id]
        except KeyError:
            pass

        # L2: shared cache. A truthy value counts as a hit and is promoted to L1.
        from_l2 = self.l2_cache.get(f"feed:{user_id}")
        if from_l2:
            self.l1_cache[user_id] = from_l2
            return from_l2

        # Miss on both levels: rebuild and store.
        rebuilt = self.generate_fresh_feed(user_id)
        self.cache_feed(user_id, rebuilt)
        return rebuilt

    def invalidate_feed(self, user_id):
        """Drop ``user_id``'s feed from both cache levels."""
        self.l1_cache.pop(user_id, None)
        self.l2_cache.delete(f"feed:{user_id}")
Scalability Solutions
Database Sharding
class FeedSharding:
    """Maps user and post ids to shard numbers by modulo over the shard count."""

    def __init__(self, num_shards=1000):
        self.num_shards = num_shards

    def _bucket(self, key):
        """Return ``key`` modulo the configured number of shards."""
        return key % self.num_shards

    def get_user_shard(self, user_id):
        """Return the shard number holding ``user_id``'s data."""
        return self._bucket(user_id)

    def get_post_shard(self, post_id):
        """Return the shard number holding ``post_id``."""
        return self._bucket(post_id)

    def get_timeline_shard(self, user_id):
        """Return the shard for ``user_id``'s timeline.

        This is the same shard as the user's own data.
        """
        return self.get_user_shard(user_id)
Content Delivery Network
class MediaCDN:
    """Uploads media to several regions and picks a URL for a viewer.

    ``upload_to_region`` and ``select_cdn_by_location`` are not defined in
    this class and are expected to be provided elsewhere.
    """

    def upload_media(self, media_file, user_id):
        """Upload ``media_file`` to each region and return the resulting URLs.

        Returns:
            A dict with ``regional_urls`` (one URL per region, in upload
            order) and ``primary_url`` (the first of them). ``user_id`` is
            accepted but not used by this method.
        """
        regions = ('us-east', 'eu-west', 'asia-pacific')
        regional = [self.upload_to_region(media_file, name) for name in regions]
        return {
            'primary_url': regional[0],
            'regional_urls': regional
        }

    def get_optimal_url(self, media_id, user_location):
        """Return the URL chosen by ``select_cdn_by_location`` for this viewer."""
        return self.select_cdn_by_location(media_id, user_location)
Analytics và Monitoring
Feed Metrics
class FeedAnalytics:
    """Records feed interactions and derives a per-user engagement rate.

    ``analytics_queue``, ``get_session_id``, ``get_user_interactions`` and
    ``get_feed_views`` are not defined in this class and are expected to be
    provided elsewhere.
    """

    def track_feed_interaction(self, user_id, action, post_id):
        """Put one interaction event on the analytics queue.

        Args:
            user_id: User who performed the action.
            action: One of view, like, comment, share.
            post_id: Post the action was performed on.
        """
        recorded_at = datetime.utcnow()
        session = self.get_session_id(user_id)
        # Send to analytics pipeline
        self.analytics_queue.put({
            'user_id': user_id,
            'action': action,
            'post_id': post_id,
            'timestamp': recorded_at,
            'session_id': session
        })

    def calculate_engagement_rate(self, user_id, time_window):
        """Return interactions per feed view for ``user_id`` in ``time_window``.

        Returns 0 when there were no feed views in the window.
        """
        interactions = self.get_user_interactions(user_id, time_window)
        feed_views = self.get_feed_views(user_id, time_window)
        return len(interactions) / feed_views if feed_views != 0 else 0
Security Considerations
Content Moderation
class ContentModerator:
    """Automated first-pass moderation of a post's text and media.

    ``contains_spam``, ``contains_inappropriate_content`` and ``scan_image``
    are not defined in this class and are expected to be provided elsewhere.
    """

    def moderate_post(self, post):
        """Return the moderation verdict for ``post``.

        Returns:
            ``'rejected'`` if the text is spam; ``'flagged_for_review'`` if
            the text is inappropriate or any media URL is flagged by
            ``scan_image``; otherwise ``'approved'``.
        """
        text = post.content
        if self.contains_spam(text):
            return 'rejected'

        text_flagged = self.contains_inappropriate_content(text)
        # Media is only scanned when the text passed; scanning stops at the
        # first flagged URL.
        if text_flagged or (
            post.media_urls and any(self.scan_image(url) for url in post.media_urls)
        ):
            return 'flagged_for_review'

        return 'approved'
Next Steps
Nội dung này sẽ được mở rộng thêm với:
- Advanced ML ranking algorithms
- Real-time personalization
- A/B testing frameworks
- Global content distribution
- Privacy và data protection