# reddit_search/save.py
"""Scrape popular subreddits, their top posts, and each post's most upvoted
comments from Reddit's public JSON endpoints, saving each stage to CSV
(subreddits.csv, posts.csv, comments.csv)."""

import time
from datetime import datetime

import pandas as pd
import requests

# One browser-like User-Agent shared by every request: Reddit throttles or
# blocks clients that send the default python-requests User-Agent.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1'
}

def extract_comment_data(comment, post_info):
    """Flatten one comment node plus its parent post's metadata into a dict."""
    return {
        'subreddit': post_info['subreddit'],
        'post_title': post_info['title'],
        'post_score': post_info['score'],
        'post_created_utc': post_info['created_utc'],
        'comment_id': comment['data'].get('id'),
        'comment_author': comment['data'].get('author'),
        'comment_body': comment['data'].get('body'),
        'comment_score': comment['data'].get('score', 0),
        'comment_created_utc': datetime.fromtimestamp(comment['data'].get('created_utc', 0)),
        'post_url': post_info['url'],
        # post_info['permalink'] already carries the https://www.reddit.com
        # prefix (see fetch_top_posts), so append only the comment id here.
        'comment_url': f"{post_info['permalink']}{comment['data'].get('id')}",
    }
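
# For reference, each child of the comment listing (data[1] in a thread's
# JSON) is assumed to look roughly like this; the values are made-up
# placeholders, not real API output:
#
#   {
#       "kind": "t1",            # "t1" = comment, "t3" = post, "more" = collapsed stub
#       "data": {
#           "id": "abc123",
#           "author": "some_user",
#           "body": "comment text",
#           "score": 42,
#           "created_utc": 1700000000.0,
#       },
#   }
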
def fetch_top_comments(post_df, num_comments=2):
    """Fetch the `num_comments` highest-scored comments for each post in post_df."""
    all_comments = []
    total_posts = len(post_df)
    print(f"\nFetching top {num_comments} most upvoted comments for {total_posts} posts...")
    for idx, post in post_df.iterrows():
        print(f"\nProcessing post {idx + 1}/{total_posts}")
        print(f"Title: {post['title'][:100]}...")
        print(f"Post Score: {post['score']}, Number of Comments: {post['num_comments']}")
        try:
            # The stored permalink already includes the domain; appending .json
            # returns the thread as [post listing, comment listing].
            url = post['permalink'] + '.json'
            response = requests.get(url, headers=HEADERS)
            response.raise_for_status()
            data = response.json()
            if len(data) > 1:
                comments_data = data[1]['data']['children']
                # Keep only real comments ("t1" nodes with a score); the listing
                # may also contain "more" stubs for collapsed comment chains.
                valid_comments = [
                    comment for comment in comments_data
                    if comment['kind'] == 't1' and comment['data'].get('score') is not None
                ]
                # Sort comments by score (upvotes) in descending order
                sorted_comments = sorted(
                    valid_comments,
                    key=lambda x: x['data'].get('score', 0),
                    reverse=True
                )
                # Take only the top N comments
                top_comments = sorted_comments[:num_comments]
                # Print comment scores for verification
                print("\nTop comment scores for this post:")
                for i, comment in enumerate(top_comments, 1):
                    score = comment['data'].get('score', 0)
                    print(f"Comment {i}: {score} upvotes")
                # Add to main list
                for comment in top_comments:
                    all_comments.append(extract_comment_data(comment, post))
            # Generous pause between requests to stay under Reddit's rate limits.
            time.sleep(20)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching comments for post {idx + 1}: {e}")
            continue
    comments_df = pd.DataFrame(all_comments)
    if not comments_df.empty:
        # Verify sorting by showing the collected comment scores per post
        print("\nVerification of comment sorting:")
        for post_title in comments_df['post_title'].unique():
            post_comments = comments_df[comments_df['post_title'] == post_title]
            print(f"\nPost: {post_title[:100]}...")
            print("Comment scores:", post_comments['comment_score'].tolist())
    return comments_df
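
# Minimal usage sketch for fetch_top_comments; the row below uses made-up
# placeholder values in the same shape fetch_top_posts produces:
#
#   sample_posts = pd.DataFrame([{
#       'subreddit': 'python', 'title': 'Example post', 'score': 100,
#       'num_comments': 10, 'created_utc': '2024-01-01 00:00:00',
#       'url': 'https://example.com',
#       'permalink': 'https://www.reddit.com/r/python/comments/abc123/example_post/',
#   }])
#   top_comments_df = fetch_top_comments(sample_posts, num_comments=2)
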
def fetch_subreddits(limit=10, min_subscribers=1000):
    """Page through /subreddits/popular until `limit` qualifying subreddits are collected."""
    subreddits_data = []
    after = None
    while len(subreddits_data) < limit:
        try:
            url = 'https://www.reddit.com/subreddits/popular.json?limit=100'
            if after:
                url += f'&after={after}'
            print(f"Fetching subreddits... Current count: {len(subreddits_data)}")
            response = requests.get(url, headers=HEADERS)
            response.raise_for_status()
            data = response.json()
            for subreddit in data['data']['children']:
                subreddit_data = subreddit['data']
                if subreddit_data.get('subscribers', 0) >= min_subscribers:
                    sub_info = {
                        'display_name': subreddit_data.get('display_name'),
                        'display_name_prefixed': subreddit_data.get('display_name_prefixed'),
                        'title': subreddit_data.get('title'),
                        'subscribers': subreddit_data.get('subscribers', 0),
                        'active_users': subreddit_data.get('active_user_count', 0),
                        'created_utc': datetime.fromtimestamp(subreddit_data.get('created_utc', 0)),
                        'description': subreddit_data.get('description'),
                        'subreddit_type': subreddit_data.get('subreddit_type'),
                        'over18': subreddit_data.get('over18', False),
                        'url': f"https://www.reddit.com/r/{subreddit_data.get('display_name')}/"
                    }
                    subreddits_data.append(sub_info)
            # 'after' is Reddit's pagination cursor; None means the last page.
            after = data['data'].get('after')
            if not after:
                print("Reached end of listings")
                break
            time.sleep(2)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            break
    # A 100-item page can overshoot the requested limit, so trim before returning.
    return pd.DataFrame(subreddits_data[:limit])
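
# Usage sketch, mirroring the defaults main() uses below:
#
#   subs = fetch_subreddits(limit=10, min_subscribers=1000)
#   print(subs[['display_name', 'subscribers']].head())
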
def fetch_top_posts(subreddit, limit=5):
    """Fetch a subreddit's all-time top posts (Reddit caps listing pages at 100 items)."""
    posts_data = []
    url = f'https://www.reddit.com/r/{subreddit}/top.json?t=all&limit={limit}'
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        data = response.json()
        for post in data['data']['children']:
            post_data = post['data']
            posts_data.append({
                'subreddit': subreddit,
                'title': post_data.get('title'),
                'score': post_data.get('score'),
                'num_comments': post_data.get('num_comments'),
                'created_utc': datetime.fromtimestamp(post_data.get('created_utc', 0)),
                'url': post_data.get('url'),
                # Store the absolute permalink so later stages can append
                # '.json' or a comment id directly.
                'permalink': 'https://www.reddit.com' + post_data.get('permalink', '')
            })
        time.sleep(2)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching posts from r/{subreddit}: {e}")
    return pd.DataFrame(posts_data)
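
# Usage sketch (r/python here is just an illustrative subreddit name):
#
#   python_posts = fetch_top_posts('python', limit=5)
#   print(python_posts[['title', 'score']].head())
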
def main():
    # Step 1: Fetch subreddits
    print("Fetching subreddits...")
    subreddits_df = fetch_subreddits(limit=10, min_subscribers=1000)
    print(f"Fetched {len(subreddits_df)} subreddits.")
    subreddits_df.to_csv("subreddits.csv", index=False)
    # Step 2: Fetch top posts for each subreddit
    all_posts_data = []
    for subreddit in subreddits_df['display_name']:
        print(f"\nFetching top posts for subreddit: {subreddit}...")
        posts_df = fetch_top_posts(subreddit, limit=5)
        all_posts_data.append(posts_df)
    # Combine all posts into a single DataFrame
    posts_df = pd.concat(all_posts_data, ignore_index=True)
    print(f"Fetched {len(posts_df)} top posts.")
    posts_df.to_csv("posts.csv", index=False)
    # Reload from disk so this step can also run standalone against local data.
    posts_df = pd.read_csv("posts.csv")
    # Step 3: Fetch top comments for each post
    if not posts_df.empty:
        comments_df = fetch_top_comments(posts_df, num_comments=2)
        print(f"Fetched {len(comments_df)} top comments.")
        comments_df.to_csv("comments.csv", index=False)


if __name__ == "__main__":
    main()