|
import pandas as pd |
|
from textblob import TextBlob |
|
from sklearn.preprocessing import LabelEncoder |
|
import logging |
|
|
|
def preprocess_data(df): |
|
"""Preprocess the input DataFrame.""" |
|
|
|
required_columns = ['likes', 'comments', 'shares', 'posting_time', 'caption', 'hashtags'] |
|
missing_columns = [col for col in required_columns if col not in df.columns] |
|
|
|
if missing_columns: |
|
logging.warning(f"Missing required columns: {missing_columns}") |
|
for col in missing_columns: |
|
if col in ['likes', 'comments', 'shares']: |
|
df[col] = 0 |
|
elif col == 'caption': |
|
df[col] = '' |
|
elif col == 'hashtags': |
|
df[col] = [[] for _ in range(len(df))] |
|
|
|
|
|
df['posting_time'] = pd.to_datetime(df['posting_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce') |
|
df = df[df['posting_time'].notna()] |
|
|
|
|
|
df['engagement_rate'] = df['likes'] + df['comments'] + df['shares'] |
|
|
|
|
|
df['caption_length'] = df['caption'].apply(len) |
|
df['hashtag_count'] = df['hashtags'].apply(len) |
|
|
|
|
|
df['caption_sentiment'] = df['caption'].apply(lambda x: TextBlob(x).sentiment.polarity) |
|
df['sentiment'] = df['caption_sentiment'] |
|
|
|
|
|
if 'content_type' in df.columns and 'media_type' in df.columns: |
|
label_encoder = LabelEncoder() |
|
df['content_type_encoded'] = label_encoder.fit_transform(df['content_type']) |
|
df['media_type_encoded'] = label_encoder.fit_transform(df['media_type']) |
|
|
|
return df |