import logging

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob

# Columns the pipeline expects on the incoming frame.
_REQUIRED_COLUMNS = ('likes', 'comments', 'shares', 'posting_time', 'caption', 'hashtags')
# Numeric interaction counters that default to 0 when the column is absent.
_COUNT_COLUMNS = ('likes', 'comments', 'shares')
# Only timestamps in exactly this layout are accepted; anything else is dropped.
_POSTING_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def _caption_text(value):
    """Return *value* as text, mapping missing captions (None/NaN/NA) to ''."""
    if isinstance(value, str):
        return value
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ''
    return str(value)


def _safe_len(value):
    """Return ``len(value)``, or 0 for values that have no length (NaN, None)."""
    try:
        return len(value)
    except TypeError:
        return 0


def preprocess_data(df):
    """Preprocess the input DataFrame.

    The input frame is not modified; a new frame is returned.

    Steps:
      * Missing ``likes``/``comments``/``shares`` columns are filled with 0,
        a missing ``caption`` with ``''`` and missing ``hashtags`` with empty
        lists (a warning listing the missing columns is logged).
      * ``posting_time`` is parsed with the format ``%Y-%m-%d %H:%M:%S``;
        rows whose value cannot be parsed are dropped (and counted in a
        warning).
      * Derived columns are added: ``engagement_rate``, ``caption_length``,
        ``hashtag_count``, ``caption_sentiment`` and ``sentiment``.
      * If both ``content_type`` and ``media_type`` are present, label-encoded
        ``content_type_encoded`` / ``media_type_encoded`` columns are added.

    Args:
        df: DataFrame of posts. Must contain a ``posting_time`` column.

    Returns:
        A new DataFrame holding the rows with a valid ``posting_time`` plus
        the derived columns.

    Raises:
        KeyError: If ``posting_time`` is missing; there is no sensible
            default for it.
    """
    missing_columns = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        logging.warning("Missing required columns: %s", missing_columns)
    if 'posting_time' in missing_columns:
        # Fail before doing any work instead of crashing later on a lookup.
        raise KeyError("'posting_time' column is required and has no default")

    # Parse timestamps on the side so the caller's frame is never touched.
    posting_time = pd.to_datetime(
        df['posting_time'], format=_POSTING_TIME_FORMAT, errors='coerce'
    )
    valid = posting_time.notna()
    dropped = int((~valid).sum())
    if dropped:
        logging.warning(
            "Dropping %d row(s) with missing or unparseable posting_time", dropped
        )

    # Explicit copy: later column assignments must not write into a slice of
    # the caller's frame (SettingWithCopyWarning / accidental mutation).
    df = df.loc[valid].copy()
    df['posting_time'] = posting_time.loc[valid]

    # Fill defaults for absent columns (same order as the required list).
    for col in missing_columns:
        if col in _COUNT_COLUMNS:
            df[col] = 0
        elif col == 'caption':
            df[col] = ''
        elif col == 'hashtags':
            # One independent list per row; never share a single list object.
            df[col] = [[] for _ in range(len(df))]

    # NOTE: despite the column name this is the raw interaction total, not a
    # ratio. The name is kept because downstream code reads this column.
    df['engagement_rate'] = df['likes'] + df['comments'] + df['shares']

    # Missing captions (NaN/None) count as empty text so len() and TextBlob
    # do not raise; the original 'caption' column itself is left unchanged.
    captions = df['caption'].apply(_caption_text)
    df['caption_length'] = captions.apply(len)
    df['hashtag_count'] = df['hashtags'].apply(_safe_len)

    # Sentiment polarity of the caption; 'sentiment' is an alias column.
    df['caption_sentiment'] = captions.apply(
        lambda text: TextBlob(text).sentiment.polarity
    )
    df['sentiment'] = df['caption_sentiment']

    # Encode categorical columns (only when both are available).
    if 'content_type' in df.columns and 'media_type' in df.columns:
        for col in ('content_type', 'media_type'):
            # Fresh encoder per column so the fitted classes never mix.
            df[f'{col}_encoded'] = LabelEncoder().fit_transform(df[col])

    return df