import streamlit as st import pandas as pd import os import base64 from pathlib import Path import matplotlib.pyplot as plt import seaborn as sns import numpy as np from datasets import load_dataset def load_css(): """Load custom CSS""" with open('styles/custom.css') as f: st.markdown(f'', unsafe_allow_html=True) def create_logo(): """Create and display the logo""" from PIL import Image import os # Path to the logo image logo_path = "assets/python_huggingface_logo.png" # Check if the logo exists if os.path.exists(logo_path): # Display the logo image image = Image.open(logo_path) st.image(image, width=200) else: # Fallback to text if image is not found st.markdown( """

Python & HuggingFace Explorer

""", unsafe_allow_html=True ) def get_dataset_info(dataset_name): """Get basic information about a HuggingFace dataset""" if not dataset_name or not isinstance(dataset_name, str): st.error("Invalid dataset name") return None, None try: # Attempt to load the dataset with default configuration st.info(f"Loading dataset: {dataset_name}...") try: # First try to load the dataset with streaming=False for better compatibility dataset = load_dataset(dataset_name, streaming=False) # Get the first split first_split = next(iter(dataset.keys())) data = dataset[first_split] except Exception as e: st.warning(f"Couldn't load dataset with default configuration: {str(e)}. Trying specific splits...") # If that fails, try loading with specific splits for split_name in ["train", "test", "validation"]: try: st.info(f"Trying to load '{split_name}' split...") data = load_dataset(dataset_name, split=split_name, streaming=False) break except Exception as split_error: if split_name == "validation": # Last attempt st.error(f"Failed to load dataset with any standard split: {str(split_error)}") return None, None continue # Get basic info info = { "Dataset": dataset_name, "Number of examples": len(data), "Features": list(data.features.keys()), "Sample": data[0] if len(data) > 0 else None } st.success(f"Successfully loaded dataset with {info['Number of examples']} examples") return info, data except Exception as e: st.error(f"Error loading dataset: {str(e)}") if "Connection error" in str(e) or "timeout" in str(e).lower(): st.warning("Network issue detected. Please check your internet connection and try again.") elif "not found" in str(e).lower(): st.warning(f"Dataset '{dataset_name}' not found. Please check the dataset name and try again.") return None, None def run_code(code): """Run Python code and capture output""" import io import sys import time from contextlib import redirect_stdout, redirect_stderr # Create StringIO objects to capture stdout and stderr stdout_capture = io.StringIO() stderr_capture = io.StringIO() # Dictionary for storing results results = { "output": "", "error": "", "figures": [] } # Safety check - limit code size if len(code) > 100000: results["error"] = "Code submission too large. Please reduce the size." return results # Basic security check - this is not comprehensive dangerous_imports = ['os.system', 'subprocess', 'eval(', 'shutil.rmtree', 'open(', 'with open'] for dangerous_import in dangerous_imports: if dangerous_import in code: results["error"] = f"Potential security risk: {dangerous_import} is not allowed." return results # Capture current figures to avoid including existing ones initial_figs = plt.get_fignums() # Set execution timeout MAX_EXECUTION_TIME = 30 # seconds start_time = time.time() try: # Create a restricted globals dictionary safe_globals = { 'plt': plt, 'pd': pd, 'np': np, 'sns': sns, 'print': print, '__builtins__': __builtins__, } # Add common data science libraries for module_name in ['datasets', 'transformers', 'sklearn', 'math']: try: module = __import__(module_name) safe_globals[module_name] = module except ImportError: pass # Module not available # Redirect stdout and stderr with redirect_stdout(stdout_capture), redirect_stderr(stderr_capture): # Execute the code with timeout check exec(code, safe_globals) if time.time() - start_time > MAX_EXECUTION_TIME: raise TimeoutError("Code execution exceeded maximum allowed time.") # Get the captured output results["output"] = stdout_capture.getvalue() # Also capture stderr stderr_output = stderr_capture.getvalue() if stderr_output: if results["output"]: results["output"] += "\n\n--- Warnings/Errors ---\n" + stderr_output else: results["output"] = "--- Warnings/Errors ---\n" + stderr_output # Capture any figures that were created final_figs = plt.get_fignums() new_figs = set(final_figs) - set(initial_figs) for fig_num in new_figs: fig = plt.figure(fig_num) results["figures"].append(fig) except Exception as e: # Capture the error results["error"] = f"{type(e).__name__}: {str(e)}" return results def get_dataset_preview(data, max_rows=10): """Convert a HuggingFace dataset to a pandas DataFrame for preview""" try: # Convert to pandas DataFrame df = pd.DataFrame(data[:max_rows]) return df except Exception as e: st.error(f"Error converting dataset to DataFrame: {str(e)}") return None def generate_basic_stats(data): """Generate basic statistics for a dataset""" try: # Convert to pandas DataFrame df = pd.DataFrame(data) # Get column types column_types = df.dtypes # Initialize stats dictionary stats = {} for col in df.columns: col_stats = {} # Check if column is numeric if pd.api.types.is_numeric_dtype(df[col]): col_stats["mean"] = df[col].mean() col_stats["median"] = df[col].median() col_stats["std"] = df[col].std() col_stats["min"] = df[col].min() col_stats["max"] = df[col].max() col_stats["missing"] = df[col].isna().sum() # Check if column is string/object elif pd.api.types.is_string_dtype(df[col]) or pd.api.types.is_object_dtype(df[col]): col_stats["unique_values"] = df[col].nunique() col_stats["most_common"] = df[col].value_counts().head(5).to_dict() if df[col].nunique() < 100 else "Too many unique values" col_stats["missing"] = df[col].isna().sum() stats[col] = col_stats return stats except Exception as e: st.error(f"Error generating statistics: {str(e)}") return None def create_visualization(data, viz_type, x_col=None, y_col=None, hue_col=None): """Create a visualization based on the selected type and columns""" try: df = pd.DataFrame(data) fig, ax = plt.subplots(figsize=(10, 6)) if viz_type == "Bar Chart": if x_col and y_col: sns.barplot(x=x_col, y=y_col, hue=hue_col, data=df, ax=ax) else: st.warning("Bar charts require both X and Y columns.") return None elif viz_type == "Line Chart": if x_col and y_col: sns.lineplot(x=x_col, y=y_col, hue=hue_col, data=df, ax=ax) else: st.warning("Line charts require both X and Y columns.") return None elif viz_type == "Scatter Plot": if x_col and y_col: sns.scatterplot(x=x_col, y=y_col, hue=hue_col, data=df, ax=ax) else: st.warning("Scatter plots require both X and Y columns.") return None elif viz_type == "Histogram": if x_col: sns.histplot(df[x_col], ax=ax) else: st.warning("Histograms require an X column.") return None elif viz_type == "Box Plot": if x_col and y_col: sns.boxplot(x=x_col, y=y_col, hue=hue_col, data=df, ax=ax) else: st.warning("Box plots require both X and Y columns.") return None elif viz_type == "Count Plot": if x_col: sns.countplot(x=x_col, hue=hue_col, data=df, ax=ax) else: st.warning("Count plots require an X column.") return None # Set title and labels plt.title(f"{viz_type} of {y_col if y_col else ''} vs {x_col if x_col else ''}") plt.xlabel(x_col if x_col else "") plt.ylabel(y_col if y_col else "") plt.tight_layout() return fig except Exception as e: st.error(f"Error creating visualization: {str(e)}") return None def get_popular_datasets(category=None, limit=10): """Get popular HuggingFace datasets, optionally filtered by category""" popular_datasets = { "Text": ["glue", "imdb", "squad", "wikitext", "ag_news"], "Image": ["cifar10", "cifar100", "mnist", "fashion_mnist", "coco"], "Audio": ["common_voice", "librispeech_asr", "voxpopuli", "voxceleb", "audiofolder"], "Multimodal": ["conceptual_captions", "flickr8k", "hateful_memes", "nlvr", "vqa"] } if category and category in popular_datasets: return popular_datasets[category][:limit] else: # Return all datasets flattened all_datasets = [] for cat_datasets in popular_datasets.values(): all_datasets.extend(cat_datasets) return all_datasets[:limit]