File size: 11,013 Bytes
2a64443
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
import streamlit as st
import pandas as pd
import os
import base64
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datasets import load_dataset

def load_css():
    """Inject the project's custom stylesheet into the Streamlit page.

    Reads ``styles/custom.css`` relative to the working directory and
    embeds it in a ``<style>`` tag via ``st.markdown``.

    Raises:
        FileNotFoundError: if ``styles/custom.css`` does not exist.
    """
    # Explicit encoding: the default is locale-dependent and can mangle
    # non-ASCII characters in the stylesheet on some platforms.
    with open('styles/custom.css', encoding='utf-8') as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)

def create_logo():
    """Show the app logo image, or a styled text header when the image is absent."""
    from PIL import Image
    import os

    logo_path = "assets/python_huggingface_logo.png"

    if not os.path.exists(logo_path):
        # No logo file on disk — render the text fallback instead.
        st.markdown(
            """
            <div style="display: flex; justify-content: center; margin-bottom: 20px;">
                <h2 style="color: #2196F3;">Python & HuggingFace Explorer</h2>
            </div>
            """,
            unsafe_allow_html=True
        )
        return

    # Logo found: load and render it at a fixed width.
    st.image(Image.open(logo_path), width=200)

def get_dataset_info(dataset_name):
    """Load a HuggingFace dataset and summarize it.

    Args:
        dataset_name: Hub identifier of the dataset (non-empty string).

    Returns:
        Tuple ``(info, data)`` where ``info`` is a dict holding the dataset
        name, example count, feature names and the first example, and
        ``data`` is the loaded split.  Returns ``(None, None)`` on any
        failure; progress and errors are reported through ``st.*`` widgets.
    """
    # Guard against None / non-string input before touching the network.
    if not dataset_name or not isinstance(dataset_name, str):
        st.error("Invalid dataset name")
        return None, None
        
    try:
        # Attempt to load the dataset with default configuration
        st.info(f"Loading dataset: {dataset_name}...")
        
        try:
            # First try to load the dataset with streaming=False for better compatibility
            dataset = load_dataset(dataset_name, streaming=False)
            # Get the first split (DatasetDict preserves insertion order,
            # so this is typically "train").
            first_split = next(iter(dataset.keys()))
            data = dataset[first_split]
        except Exception as e:
            st.warning(f"Couldn't load dataset with default configuration: {str(e)}. Trying specific splits...")
            # If that fails, try loading with specific splits
            for split_name in ["train", "test", "validation"]:
                try:
                    st.info(f"Trying to load '{split_name}' split...")
                    data = load_dataset(dataset_name, split=split_name, streaming=False)
                    break
                except Exception as split_error:
                    if split_name == "validation":  # Last attempt
                        st.error(f"Failed to load dataset with any standard split: {str(split_error)}")
                        return None, None
                    continue
        
        # Get basic info
        info = {
            "Dataset": dataset_name,
            "Number of examples": len(data),
            "Features": list(data.features.keys()),
            "Sample": data[0] if len(data) > 0 else None
        }
        
        st.success(f"Successfully loaded dataset with {info['Number of examples']} examples")
        return info, data
    except Exception as e:
        # Classify common failure modes by message text to give the user a hint.
        st.error(f"Error loading dataset: {str(e)}")
        if "Connection error" in str(e) or "timeout" in str(e).lower():
            st.warning("Network issue detected. Please check your internet connection and try again.")
        elif "not found" in str(e).lower():
            st.warning(f"Dataset '{dataset_name}' not found. Please check the dataset name and try again.")
        return None, None

def run_code(code):
    """Execute user-supplied Python *code* and capture its results.

    Returns a dict with keys:
        "output":  captured stdout (with a "--- Warnings/Errors ---" section
                   appended when anything was written to stderr),
        "error":   error description, "" on success,
        "figures": matplotlib figures newly created by the code.

    NOTE(review): the substring blocklist below is not a sandbox —
    ``exec`` of untrusted input remains fundamentally unsafe.
    """
    import io
    import sys
    import time
    from contextlib import redirect_stdout, redirect_stderr
    
    # Create StringIO objects to capture stdout and stderr
    stdout_capture = io.StringIO()
    stderr_capture = io.StringIO()
    
    # Dictionary for storing results
    results = {
        "output": "",
        "error": "",
        "figures": []
    }
    
    # Safety check - limit code size
    if len(code) > 100000:
        results["error"] = "Code submission too large. Please reduce the size."
        return results
        
    # Basic security check - this is not comprehensive
    # (simple substring match; easy to bypass, see NOTE in docstring)
    dangerous_imports = ['os.system', 'subprocess', 'eval(', 'shutil.rmtree', 'open(', 'with open']
    for dangerous_import in dangerous_imports:
        if dangerous_import in code:
            results["error"] = f"Potential security risk: {dangerous_import} is not allowed."
            return results
    
    # Capture current figures to avoid including existing ones
    initial_figs = plt.get_fignums()
    
    # Set execution timeout
    MAX_EXECUTION_TIME = 30  # seconds
    start_time = time.time()
    
    try:
        # Create a restricted globals dictionary
        # (full __builtins__ is still exposed, so this is convenience,
        # not isolation)
        safe_globals = {
            'plt': plt,
            'pd': pd,
            'np': np,
            'sns': sns,
            'print': print,
            '__builtins__': __builtins__,
        }
        
        # Add common data science libraries
        for module_name in ['datasets', 'transformers', 'sklearn', 'math']:
            try:
                module = __import__(module_name)
                safe_globals[module_name] = module
            except ImportError:
                pass  # Module not available
        
        # Redirect stdout and stderr
        with redirect_stdout(stdout_capture), redirect_stderr(stderr_capture):
            # Execute the code with timeout check
            exec(code, safe_globals)
            
            # NOTE(review): this check only runs after exec() has already
            # returned, so it cannot interrupt long-running code — it merely
            # reports overruns after the fact. Confirm whether a real
            # pre-emptive timeout was intended.
            if time.time() - start_time > MAX_EXECUTION_TIME:
                raise TimeoutError("Code execution exceeded maximum allowed time.")
        
        # Get the captured output
        results["output"] = stdout_capture.getvalue()
        
        # Also capture stderr
        stderr_output = stderr_capture.getvalue()
        if stderr_output:
            if results["output"]:
                results["output"] += "\n\n--- Warnings/Errors ---\n" + stderr_output
            else:
                results["output"] = "--- Warnings/Errors ---\n" + stderr_output
        
        # Capture any figures that were created
        # (diff of figure numbers before/after the exec)
        final_figs = plt.get_fignums()
        new_figs = set(final_figs) - set(initial_figs)
        
        for fig_num in new_figs:
            fig = plt.figure(fig_num)
            results["figures"].append(fig)
    
    except Exception as e:
        # Capture the error
        results["error"] = f"{type(e).__name__}: {str(e)}"
    
    return results

def get_dataset_preview(data, max_rows=10):
    """Return the first *max_rows* rows of *data* as a pandas DataFrame.

    On conversion failure an error is shown via ``st.error`` and ``None``
    is returned.
    """
    try:
        head = data[:max_rows]
        return pd.DataFrame(head)
    except Exception as e:
        st.error(f"Error converting dataset to DataFrame: {str(e)}")
        return None

def generate_basic_stats(data):
    """Generate basic per-column statistics for a dataset.

    Args:
        data: anything ``pd.DataFrame`` accepts (list of dicts,
              dict of columns, HuggingFace dataset slice, ...).

    Returns:
        Dict mapping column name -> stats dict.  Numeric columns get
        mean/median/std/min/max/missing; string/object columns get
        unique_values/most_common/missing; other dtypes get an empty
        dict.  Returns ``None`` (after ``st.error``) on failure.
    """
    try:
        df = pd.DataFrame(data)
        stats = {}

        for col in df.columns:
            col_stats = {}

            if pd.api.types.is_numeric_dtype(df[col]):
                col_stats["mean"] = df[col].mean()
                col_stats["median"] = df[col].median()
                col_stats["std"] = df[col].std()
                col_stats["min"] = df[col].min()
                col_stats["max"] = df[col].max()
                col_stats["missing"] = df[col].isna().sum()
            elif pd.api.types.is_string_dtype(df[col]) or pd.api.types.is_object_dtype(df[col]):
                col_stats["unique_values"] = df[col].nunique()
                # Cap the value_counts summary: with ~100+ distinct values a
                # top-5 listing is not meaningful.
                col_stats["most_common"] = df[col].value_counts().head(5).to_dict() if df[col].nunique() < 100 else "Too many unique values"
                col_stats["missing"] = df[col].isna().sum()

            stats[col] = col_stats

        return stats
    except Exception as e:
        st.error(f"Error generating statistics: {str(e)}")
        return None

def create_visualization(data, viz_type, x_col=None, y_col=None, hue_col=None):
    """Build a matplotlib figure of *viz_type* from *data*.

    Plots requiring missing columns trigger an ``st.warning`` and return
    ``None``; unknown *viz_type* values yield an empty titled figure.
    Any exception is reported via ``st.error`` and returns ``None``.
    """
    try:
        df = pd.DataFrame(data)
        fig, ax = plt.subplots(figsize=(10, 6))

        # The four x/y plots share one call signature — dispatch via table.
        xy_plots = {
            "Bar Chart": (sns.barplot, "Bar charts require both X and Y columns."),
            "Line Chart": (sns.lineplot, "Line charts require both X and Y columns."),
            "Scatter Plot": (sns.scatterplot, "Scatter plots require both X and Y columns."),
            "Box Plot": (sns.boxplot, "Box plots require both X and Y columns."),
        }

        if viz_type in xy_plots:
            plot_fn, warning = xy_plots[viz_type]
            if not (x_col and y_col):
                st.warning(warning)
                return None
            plot_fn(x=x_col, y=y_col, hue=hue_col, data=df, ax=ax)
        elif viz_type == "Histogram":
            if not x_col:
                st.warning("Histograms require an X column.")
                return None
            sns.histplot(df[x_col], ax=ax)
        elif viz_type == "Count Plot":
            if not x_col:
                st.warning("Count plots require an X column.")
                return None
            sns.countplot(x=x_col, hue=hue_col, data=df, ax=ax)

        # Title/labels fall back to empty strings for missing columns.
        plt.title(f"{viz_type} of {y_col if y_col else ''} vs {x_col if x_col else ''}")
        plt.xlabel(x_col if x_col else "")
        plt.ylabel(y_col if y_col else "")
        plt.tight_layout()

        return fig

    except Exception as e:
        st.error(f"Error creating visualization: {str(e)}")
        return None

def get_popular_datasets(category=None, limit=10):
    """Return up to *limit* well-known HuggingFace dataset names.

    With a recognized *category* ("Text", "Image", "Audio", "Multimodal")
    only that category's datasets are returned; otherwise the categories
    are flattened in order and truncated to *limit*.
    """
    popular_datasets = {
        "Text": ["glue", "imdb", "squad", "wikitext", "ag_news"],
        "Image": ["cifar10", "cifar100", "mnist", "fashion_mnist", "coco"],
        "Audio": ["common_voice", "librispeech_asr", "voxpopuli", "voxceleb", "audiofolder"],
        "Multimodal": ["conceptual_captions", "flickr8k", "hateful_memes", "nlvr", "vqa"]
    }

    names = popular_datasets.get(category) if category else None
    if names is not None:
        return names[:limit]

    # Unknown or absent category: flatten every category in order.
    flattened = [name for cat_list in popular_datasets.values() for name in cat_list]
    return flattened[:limit]