Viral-808 / utils /preprocessing.py
Sam Fred
Commit
58e450d
import pandas as pd
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
import logging
def _count_hashtags(value):
    """Return the number of items in one ``hashtags`` cell (0 if it has no length)."""
    # Missing cells (None / NaN) have no len(); count them as zero hashtags
    # instead of letting len() raise TypeError.
    # NOTE(review): a plain string cell is still measured in characters, exactly
    # as before -- confirm whether hashtags can arrive as unparsed strings.
    return len(value) if hasattr(value, '__len__') else 0


def _caption_polarity(text):
    """Return TextBlob's sentiment polarity for one caption string."""
    return TextBlob(text).sentiment.polarity


def preprocess_data(df):
    """Preprocess the input DataFrame and return a new, feature-enriched frame.

    The caller's DataFrame is left untouched; all work happens on a copy.

    Steps:
      1. Fill in missing optional columns with defaults (``likes``,
         ``comments``, ``shares`` -> 0, ``caption`` -> '', ``hashtags`` -> []).
      2. Parse ``posting_time`` with the format ``%Y-%m-%d %H:%M:%S`` and drop
         rows whose value cannot be parsed.
      3. Add ``engagement_rate`` (the plain sum likes + comments + shares; it
         is not normalised), ``caption_length``, ``hashtag_count``,
         ``caption_sentiment`` and its alias ``sentiment``.
      4. If both ``content_type`` and ``media_type`` are present, add
         ``content_type_encoded`` and ``media_type_encoded`` label encodings.

    Args:
        df: Input DataFrame. ``posting_time`` must be present.

    Returns:
        A new DataFrame containing only the rows with a valid
        ``posting_time`` plus the derived columns listed above.

    Raises:
        KeyError: If the ``posting_time`` column is missing, since it has no
            meaningful default.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    df = df.copy()

    # Ensure required columns exist
    required_columns = ['likes', 'comments', 'shares', 'posting_time', 'caption', 'hashtags']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        logging.warning("Missing required columns: %s", missing_columns)
        if 'posting_time' in missing_columns:
            # Fail up front with a clear message rather than with an
            # incidental lookup error after other columns were filled in.
            raise KeyError(
                "Required column 'posting_time' is missing and has no default value"
            )
        for col in missing_columns:
            if col in ('likes', 'comments', 'shares'):
                df[col] = 0  # default: no interactions
            elif col == 'caption':
                df[col] = ''  # default: empty caption
            elif col == 'hashtags':
                # One independent empty list per row, aligned to the frame's
                # own index so non-default indexes are handled correctly.
                df[col] = pd.Series(
                    [[] for _ in range(len(df))], index=df.index, dtype=object
                )

    # Convert posting_time to datetime; unparseable values become NaT and the
    # corresponding rows are dropped.
    df['posting_time'] = pd.to_datetime(
        df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
    )
    # .copy() detaches the filtered frame so the column assignments below do
    # not act on a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['posting_time'].notna()].copy()

    # Total engagement (kept under the historical column name).
    df['engagement_rate'] = df['likes'] + df['comments'] + df['shares']

    # Treat missing captions as empty text so len() and TextBlob do not raise;
    # the stored 'caption' column itself is left as provided.
    captions = df['caption'].fillna('').astype(str)

    # Calculate caption length and hashtag count
    df['caption_length'] = captions.apply(len)
    df['hashtag_count'] = df['hashtags'].apply(_count_hashtags)

    # Calculate sentiment
    df['caption_sentiment'] = captions.apply(_caption_polarity)
    df['sentiment'] = df['caption_sentiment']

    # Encode categorical columns (only when both are available, as before).
    if 'content_type' in df.columns and 'media_type' in df.columns:
        # A fresh encoder per column keeps each mapping independent.
        df['content_type_encoded'] = LabelEncoder().fit_transform(df['content_type'])
        df['media_type_encoded'] = LabelEncoder().fit_transform(df['media_type'])

    return df