Terry Zhuo committed
Commit 020b18b · 1 Parent(s): 6da81f8
__pycache__/azure_count_ip_data.cpython-311.pyc ADDED
Binary file (13.5 kB).
 
__pycache__/log_reader.cpython-311.pyc ADDED
Binary file (7 kB).
 
app.py CHANGED
@@ -3,7 +3,8 @@ import gradio as gr
 import pandas as pd
 from datetime import datetime
 import time
-from count_ip_data import count_files_per_ip
+from azure_count_ip_data import count_files_per_ip
+from log_reader import RemoteLogReader
 import threading
 
 # Define the path for storing the data
@@ -32,23 +33,27 @@ def load_stats():
 
 def update_stats():
     """Get the latest battle statistics"""
-    smb_url = os.getenv("SMB_URL")
-    if not smb_url:
+    try:
+        # Initialize RemoteLogReader
+        reader = RemoteLogReader()
+
+        # Get IP counts using Azure storage
+        ip_counts = count_files_per_ip(reader)
+
+        # Convert to DataFrame for better display
+        df = pd.DataFrame(list(ip_counts.items()), columns=['IP Address', 'Battle Count'])
+        df = df.sort_values('Battle Count', ascending=False)
+
+        # Get current time
+        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        # Save the updated stats
+        save_stats(df, current_time)
+
+        return df, current_time
+    except Exception as e:
+        print(f"Error updating stats: {e}")
         return pd.DataFrame(columns=['IP Address', 'Battle Count']), ""
-
-    ip_counts = count_files_per_ip(smb_url)
-
-    # Convert to DataFrame for better display
-    df = pd.DataFrame(list(ip_counts.items()), columns=['IP Address', 'Battle Count'])
-    df = df.sort_values('Battle Count', ascending=False)
-
-    # Get current time
-    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    # Save the updated stats
-    save_stats(df, current_time)
-
-    return df, current_time
 
 def auto_update(state):
     """Background task to update stats every hour"""
azure_count_ip_data.py ADDED
@@ -0,0 +1,246 @@
+import os
+import logging
+from datetime import datetime, timedelta
+import json
+from collections import defaultdict
+import shutil
+import re
+import argparse
+from typing import Dict, Set, Tuple, Optional
+from log_reader import RemoteLogReader
+
+# List of IP addresses we care about
+WHITELIST_IPS = [
+    "199.111.212.5",
+    "175.159.122.63",
+    "109.245.193.97",
+    "158.195.18.232",
+    "2607:fea8:4f40:4b00:e5b9:9806:6b69:233b",
+    "66.254.231.49",
+    "129.74.154.194",
+    "175.196.44.217",
+    "2601:600:8d00:9510:1d77:b610:9358:f443",
+    "74.90.222.68",
+    "2a02:169:3e9:0:6ce8:e76f:faed:c830",
+    "70.50.179.57",
+    "2a02:842a:24:5a01:8cd6:5b22:1189:6035",
+    "2408:8418:6390:7603:40b:555f:774:a05d"
+]
+
+logging.basicConfig(level=logging.WARNING)
+log = logging.getLogger(__name__)
+
+def get_ip_from_jsonl(content: str) -> Optional[str]:
+    """Extract the IP from the first line of JSONL content"""
+    try:
+        first_line = content.split('\n')[0]
+        data = json.loads(first_line)
+        return data.get('ip')
+    except Exception as e:
+        log.error(f"Error extracting IP from content: {e}")
+        return None
+
+def get_chat_session_id(file_name: str, content: str = None) -> Optional[str]:
+    """Extract the chat_session_id based on the file location:
+    - For files under conv_logs: extract it from the filename
+    - For files under sandbox_logs: read it from the file content
+    """
+    try:
+        if 'conv_logs' in file_name:
+            # Extract from the filename for conv_logs
+            match = re.match(r'conv-log-([a-f0-9]+)\.json', file_name)
+            if match:
+                return match.group(1)
+        elif 'sandbox_logs' in file_name and content:
+            # Read from the file content for sandbox_logs
+            data = json.loads(content)
+            return data['sandbox_state'].get('chat_session_id')
+        return None
+    except Exception as e:
+        log.error(f"Error getting chat_session_id from {file_name}: {e}")
+        return None
+
+def get_sandbox_session_ids(reader: 'RemoteLogReader', date_str: str) -> Set[str]:
+    """Get all chat_session_ids from sandbox logs for a given date"""
+    session_ids = set()
+    try:
+        sandbox_logs = reader.get_sandbox_logs(date_str)
+        for sandbox_log in sandbox_logs:  # named to avoid shadowing the module-level logger
+            if isinstance(sandbox_log, dict):
+                session_id = sandbox_log.get('sandbox_state', {}).get('chat_session_id')
+                if session_id:
+                    session_ids.add(session_id)
+    except Exception as e:
+        log.error(f"Error getting sandbox session IDs for date {date_str}: {e}")
+
+    return session_ids
+
+def get_file_data(content: str) -> Tuple[Optional[str], bool]:
+    """Read file content and return the IP and whether the vote conditions are met"""
+    try:
+        lines = [line.strip() for line in content.split('\n') if line.strip()]
+        if not lines:
+            return None, False
+
+        # Get the IP from the first line
+        try:
+            first_line_data = json.loads(lines[0])
+            ip = first_line_data.get('ip')
+            # Early return if the IP is not in the whitelist
+            if ip not in WHITELIST_IPS:
+                return None, False
+        except json.JSONDecodeError:
+            ip = None
+
+        # Check the vote conditions on the last line
+        try:
+            last_line_data = json.loads(lines[-1])
+            feedback = last_line_data.get('feedback')
+            vote_conditions_met = (last_line_data.get('type') == 'vote' and
+                                   isinstance(feedback, dict) and
+                                   len(feedback) == 6)
+        except json.JSONDecodeError:
+            vote_conditions_met = False
+
+        return ip, vote_conditions_met
+    except Exception as e:
+        log.error(f"Error processing file content: {e}")
+        return None, False
+
+def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
+    """Count completed battles per IP address from the given start date"""
+    # Convert the start date string to a datetime
+    start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
+    ip_counts = defaultdict(int)
+
+    try:
+        # Iterate day by day up to today
+        current_date = start_date
+        today = datetime.now()
+
+        while current_date <= today:
+            date_str = current_date.strftime("%Y_%m_%d")
+
+            try:
+                # Get conversation logs for battle_anony mode
+                conv_logs = reader.get_conv_logs(date_str)
+                battle_anony_logs = conv_logs.get('battle_anony', {})
+
+                # Process each conversation
+                for conv_id, messages in battle_anony_logs.items():
+                    if messages:
+                        # Convert the messages back into JSONL file content
+                        content = '\n'.join(json.dumps(msg) for msg in messages)
+                        ip, vote_conditions_met = get_file_data(content)
+                        if vote_conditions_met and ip:
+                            ip_counts[ip] += 1
+
+            except Exception as e:
+                log.error(f"Error processing logs for date {date_str}: {e}")
+
+            # Move to the next day
+            current_date += timedelta(days=1)
+
+    except Exception as e:
+        log.error(f"Error accessing logs: {e}")
+
+    return dict(ip_counts)
+
+def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18", check_sandbox: bool = True) -> None:
+    """Download files and organize them by IP address
+
+    Args:
+        reader: RemoteLogReader instance
+        start_date_str: The start date in YYYY_MM_DD format
+        check_sandbox: Whether to check for matching sandbox logs
+    """
+    # Create the base data directory
+    data_dir = os.path.join(os.getcwd(), "data")
+    os.makedirs(data_dir, exist_ok=True)
+
+    # Convert the start date string to a datetime
+    start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
+
+    try:
+        # Iterate day by day up to today
+        current_date = start_date
+        today = datetime.now()
+
+        while current_date <= today:
+            date_str = current_date.strftime("%Y_%m_%d")
+
+            # Get all sandbox session IDs for this date
+            sandbox_session_ids = get_sandbox_session_ids(reader, date_str) if check_sandbox else set()
+
+            try:
+                # Get conversation logs for battle_anony mode
+                conv_logs = reader.get_conv_logs(date_str)
+                battle_anony_logs = conv_logs.get('battle_anony', {})
+
+                # Process each conversation
+                for conv_id, messages in battle_anony_logs.items():
+                    if not messages:
+                        continue
+
+                    # Convert the messages back into JSONL file content
+                    content = '\n'.join(json.dumps(msg) for msg in messages)
+                    ip = get_ip_from_jsonl(content)
+
+                    if ip:
+                        # Create the directory structure for this IP
+                        ip_dir = os.path.join(data_dir, ip)
+                        valid_dir = os.path.join(ip_dir, "valid")
+                        invalid_dir = os.path.join(ip_dir, "invalid")
+                        os.makedirs(valid_dir, exist_ok=True)
+                        os.makedirs(invalid_dir, exist_ok=True)
+
+                        # Check whether the chat_session_id exists in the sandbox logs
+                        if check_sandbox:
+                            has_sandbox = conv_id in sandbox_session_ids
+                            target_dir = valid_dir if has_sandbox else invalid_dir
+                        else:
+                            # When sandbox checking is disabled, put everything in valid
+                            target_dir = valid_dir
+
+                        # Save the file
+                        file_name = f"conv-log-{conv_id}.json"
+                        local_file_path = os.path.join(target_dir, file_name)
+                        try:
+                            with open(local_file_path, 'w') as f:
+                                f.write(content)
+                            log.info(f"Saved {file_name} to {target_dir}")
+                        except Exception as e:
+                            log.error(f"Error saving file {file_name}: {e}")
+
+            except Exception as e:
+                log.error(f"Error processing logs for date {date_str}: {e}")
+
+            # Move to the next day
+            current_date += timedelta(days=1)
+
+    except Exception as e:
+        log.error(f"Error accessing logs: {e}")
+
+def main():
+    # Initialize RemoteLogReader
+    reader = RemoteLogReader()
+
+    # Parse optional command-line flags
+    parser = argparse.ArgumentParser(description='Download and organize conversation files by IP')
+    parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
+    parser.add_argument('--download', action='store_true', help='Enable file download')
+    args = parser.parse_args()
+
+    # Download files if enabled
+    if args.download:
+        print("\nDownloading files and organizing by IP address...")
+        download_files_by_ip(reader, check_sandbox=args.sandbox_check)
+
+    # Count and display statistics
+    ip_counts = count_files_per_ip(reader)
+    print("\nFile counts per IP address:")
+    for ip, count in sorted(ip_counts.items(), key=lambda x: x[1], reverse=True):
+        print(f"IP: {ip:<15} Count: {count}")
+
+if __name__ == "__main__":
+    main()
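
A note on the vote check above: count_files_per_ip only counts a conversation when its reconstructed JSONL content starts with a record carrying a whitelisted ip and ends with a vote record whose feedback dict has exactly six entries. Below is a minimal sketch of content that passes both checks in get_file_data; all field values are made up for illustration.

import json

# Hypothetical records; only 'ip', 'type', and 'feedback' matter to get_file_data.
first_line = json.dumps({"type": "chat", "ip": "109.245.193.97"})
last_line = json.dumps({
    "type": "vote",
    # vote_conditions_met requires a dict with exactly six feedback entries
    "feedback": {"q1": 5, "q2": 4, "q3": 5, "q4": 3, "q5": 4, "q6": 5},
})

content = "\n".join([first_line, last_line])
# get_file_data(content) -> ("109.245.193.97", True), since that IP is whitelisted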
data/battle_stats.csv ADDED
@@ -0,0 +1,12 @@
+IP Address,Battle Count
+109.245.193.97,38
+70.50.179.57,13
+175.196.44.217,9
+66.254.231.49,3
+158.195.18.232,2
+2a02:169:3e9:0:6ce8:e76f:faed:c830,1
+2408:8418:6390:7603:40b:555f:774:a05d,1
+2607:fea8:4f40:4b00:e5b9:9806:6b69:233b,1
+175.159.122.63,1
+2a02:842a:24:5a01:8cd6:5b22:1189:6035,1
+2601:600:8d00:9510:1d77:b610:9358:f443,1
data/last_update.txt ADDED
@@ -0,0 +1 @@
+2025-02-21 00:07:06
log_reader.py ADDED
@@ -0,0 +1,118 @@
+'''
+Facade for reading logs on remote storage.
+'''
+
+from collections import defaultdict
+import json
+import os
+from typing import Any
+from azure.storage.fileshare import ShareServiceClient
+
+
+class RemoteLogReader:
+    '''
+    Remote log reader backed by an Azure file share.
+    '''
+
+    LOG_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING") or ""
+    LOG_SHARE_NAME = "swearenalogsfileshare"
+
+    IMAGE_DIR_NAME = "serve_images"
+    '''
+    Directory for storing user-uploaded images.
+    '''
+    CONV_LOG_DIR_NAME = "conv_logs"
+    '''
+    Directory for conversation logs.
+    '''
+    SANDBOX_LOG_DIR_NAME = "sandbox_logs"
+    '''
+    Directory for sandbox logs.
+    '''
+
+    CHAT_MODES = ["battle_anony", "battle_named", "direct"]
+
+    def __init__(
+        self,
+        connection_string: str = LOG_CONNECTION_STRING,
+        share_name: str = LOG_SHARE_NAME,
+    ):
+        if not connection_string:
+            raise ValueError("Connection string is required.")
+        if not share_name:
+            raise ValueError("Share name is required.")
+
+        self.share_service = ShareServiceClient.from_connection_string(
+            conn_str=connection_string)
+        self.share_client = self.share_service.get_share_client(share=share_name)
+
+    def is_conv_log(self, file_name: str) -> bool:
+        return file_name.startswith("conv-log") and file_name.endswith(".json")
+
+    def get_conv_id_from_name(self, file_name: str) -> str:
+        return file_name.split("-")[2].removesuffix(".json")  # not strip('.json'), which removes characters, not a suffix
+
+    def is_sandbox_log(self, file_name: str) -> bool:
+        return file_name.startswith("sandbox-log") and file_name.endswith(".json")
+
+    def get_file_content(self, file_path: str) -> bytes:
+        file_client = self.share_client.get_file_client(file_path)
+        file_content = file_client.download_file().readall()
+        return file_content
+
+    def get_conv_logs(self, date: str) -> dict[str, defaultdict[str, list[Any]]]:
+        '''
+        Return conversation logs for the given date as a dict:
+        mode -> conv_id -> list of log records.
+        '''
+        conv_logs = {
+            mode: defaultdict(list) for mode in self.CHAT_MODES
+        }
+        for mode in self.CHAT_MODES:
+            conv_log_dir = f"{date}/{self.CONV_LOG_DIR_NAME}/{mode}/"
+            # Check if the directory exists
+            if not self.share_client.get_directory_client(conv_log_dir).exists():
+                continue
+            for file in self.share_client.list_directories_and_files(conv_log_dir):
+                if not self.is_conv_log(file.name):
+                    continue
+                conv_id = self.get_conv_id_from_name(file.name)
+                file_content = self.get_file_content(
+                    conv_log_dir + file.name).decode("utf-8").strip(' \n')
+                for line in file_content.split('\n'):
+                    if line:
+                        conv_logs[mode][conv_id].append(json.loads(line))
+        return conv_logs
+
+    def get_sandbox_logs(self, date: str) -> list[Any]:
+        '''
+        Return parsed sandbox logs for the given date.
+        '''
+        sandbox_logs = []
+        sandbox_log_dir = f"{date}/{self.SANDBOX_LOG_DIR_NAME}/"
+        for file in self.share_client.list_directories_and_files(sandbox_log_dir):
+            if self.is_sandbox_log(file.name):
+                file_content = self.get_file_content(
+                    sandbox_log_dir + file.name).decode("utf-8").strip(' \n')
+                sandbox_logs.append(json.loads(file_content))
+        return sandbox_logs
+
+    def get_image(self, image_id: str) -> bytes:
+        '''
+        Return image data based on the image id.
+        '''
+        image_path = f"{self.IMAGE_DIR_NAME}/{image_id}.png"
+        return self.get_file_content(image_path)
+
+
+if __name__ == "__main__":
+    # Example usage
+    log_reader = RemoteLogReader()
+    date = "2025_02_20"
+    conv_logs = log_reader.get_conv_logs(date)
+    sandbox_logs = log_reader.get_sandbox_logs(date)
+    image_data = log_reader.get_image("051fdac24285ff6e219a9ba06d1ac843")
+    print(conv_logs)
+    print(sandbox_logs)
+    print(image_data)
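
For reference, the nested mapping returned by get_conv_logs is what count_files_per_ip in azure_count_ip_data.py iterates over: mode -> conv_id -> list of parsed JSONL records. An illustrative value is sketched below; the conversation id and record fields are invented, not real log data.

from collections import defaultdict

# Roughly the shape of get_conv_logs("2025_02_20"); ids and fields are made up.
conv_logs = {
    "battle_anony": defaultdict(list, {
        "1a2b3c4d": [
            {"type": "chat", "ip": "109.245.193.97"},
            {"type": "vote", "feedback": {"q1": 5}},
        ],
    }),
    "battle_named": defaultdict(list),
    "direct": defaultdict(list),
}

# Consumers index by mode, then conversation id:
for conv_id, records in conv_logs["battle_anony"].items():
    print(conv_id, len(records))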
requirements.txt CHANGED
@@ -1,3 +1,3 @@
 gradio>=4.0.0
 pandas>=2.0.0
-smbprotocol
+azure-storage-file-share