diff --git a/utils/count_collab_stats.py b/utils/count_collab_stats.py index 015c9be25..91e9e6b15 100644 --- a/utils/count_collab_stats.py +++ b/utils/count_collab_stats.py @@ -1,7 +1,7 @@ import argparse import csv import math -from collections import defaultdict +from collections import Counter, defaultdict def calculate_statistics(ratings): @@ -24,6 +24,7 @@ def calculate_statistics(ratings): def process_file(path): user_feeds = defaultdict(set) # Stores feeds rated by each user feed_users = defaultdict(set) # Stores users who have rated each feed + feed_ratings = defaultdict(list) # Stores ratings for each feed ratings = [] with open(path, newline="") as csvfile: @@ -32,12 +33,27 @@ def process_file(path): user, feed, rating = row user_feeds[user].add(feed) feed_users[feed].add(user) + feed_ratings[feed].append(float(rating)) ratings.append(float(rating)) + # Calculating average ratings for each feed + avg_feed_rating = {feed: sum(rates) / len(rates) for feed, rates in feed_ratings.items()} + + # Finding feed with the highest average rating + highest_rated_feed = max(avg_feed_rating, key=avg_feed_rating.get) + # Overlap statistics avg_user_overlap = sum(len(feeds) for feeds in user_feeds.values()) / len(user_feeds) avg_feed_overlap = sum(len(users) for users in feed_users.values()) / len(feed_users) + # Finding feeds with greatest user overlap + overlap_count = Counter() + for feed in feed_users: + for user in feed_users[feed]: + overlap_count.update(user_feeds[user]) + + most_overlapped_feeds = overlap_count.most_common(5) + unique_users = len(user_feeds) unique_feeds = len(feed_users) min_rating, max_rating, avg_rating, std_dev, median = calculate_statistics(ratings) @@ -52,6 +68,9 @@ def process_file(path): avg_rating, std_dev, median, + highest_rated_feed, + avg_feed_rating[highest_rated_feed], + most_overlapped_feeds, ) @@ -72,6 +91,9 @@ def main(): avg_rating, std_dev, median, + highest_rated_feed, + highest_rating, + most_overlapped_feeds, ) = process_file(path) print(f"Unique Users: {unique_users}") print(f"Unique Feeds: {unique_feeds}") @@ -80,6 +102,8 @@ def main(): print( f"Rating Stats - Min: {min_rating}, Max: {max_rating}, Average: {avg_rating:.2f}, Std Dev: {std_dev:.2f}, Median: {median}" ) + print(f"Highest Rated Feed: {highest_rated_feed} with Average Rating: {highest_rating:.2f}") + print("Feeds with Greatest User Overlap: ", most_overlapped_feeds) if __name__ == "__main__":