Terry Zhuo committed
Commit 020b18b · 1 Parent(s): 6da81f8
__pycache__/azure_count_ip_data.cpython-311.pyc ADDED
Binary file (13.5 kB).
 
__pycache__/log_reader.cpython-311.pyc ADDED
Binary file (7 kB).
 
app.py CHANGED
@@ -3,7 +3,8 @@ import gradio as gr
 import pandas as pd
 from datetime import datetime
 import time
-from count_ip_data import count_files_per_ip
+from azure_count_ip_data import count_files_per_ip
+from log_reader import RemoteLogReader
 import threading
 
 # Define the path for storing the data
@@ -32,23 +33,27 @@ def load_stats():
 
 def update_stats():
     """Get the latest battle statistics"""
-    smb_url = os.getenv("SMB_URL")
-    if not smb_url:
+    try:
+        # Initialize RemoteLogReader
+        reader = RemoteLogReader()
+
+        # Get IP counts using Azure storage
+        ip_counts = count_files_per_ip(reader)
+
+        # Convert to DataFrame for better display
+        df = pd.DataFrame(list(ip_counts.items()), columns=['IP Address', 'Battle Count'])
+        df = df.sort_values('Battle Count', ascending=False)
+
+        # Get current time
+        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        # Save the updated stats
+        save_stats(df, current_time)
+
+        return df, current_time
+    except Exception as e:
+        print(f"Error updating stats: {e}")
         return pd.DataFrame(columns=['IP Address', 'Battle Count']), ""
-
-    ip_counts = count_files_per_ip(smb_url)
-
-    # Convert to DataFrame for better display
-    df = pd.DataFrame(list(ip_counts.items()), columns=['IP Address', 'Battle Count'])
-    df = df.sort_values('Battle Count', ascending=False)
-
-    # Get current time
-    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    # Save the updated stats
-    save_stats(df, current_time)
-
-    return df, current_time
 
 def auto_update(state):
     """Background task to update stats every hour"""
azure_count_ip_data.py ADDED
@@ -0,0 +1,246 @@
+import os
+import logging
+from datetime import datetime, timedelta
+import json
+from collections import defaultdict
+import shutil
+import re
+import argparse
+from typing import Dict, Set, Tuple, Optional
+from log_reader import RemoteLogReader
+
+# List of IP addresses we care about
+WHITELIST_IPS = [
+    "199.111.212.5",
+    "175.159.122.63",
+    "109.245.193.97",
+    "158.195.18.232",
+    "2607:fea8:4f40:4b00:e5b9:9806:6b69:233b",
+    "66.254.231.49",
+    "129.74.154.194",
+    "175.196.44.217",
+    "2601:600:8d00:9510:1d77:b610:9358:f443",
+    "74.90.222.68",
+    "2a02:169:3e9:0:6ce8:e76f:faed:c830",
+    "70.50.179.57",
+    "2a02:842a:24:5a01:8cd6:5b22:1189:6035",
+    "2408:8418:6390:7603:40b:555f:774:a05d"
+]
+
+logging.basicConfig(level=logging.WARNING)
+log = logging.getLogger(__name__)
+
+def get_ip_from_jsonl(content: str) -> Optional[str]:
+    """Extract the IP from the first line of JSONL content"""
+    try:
+        first_line = content.split('\n')[0]
+        data = json.loads(first_line)
+        return data.get('ip')
+    except Exception as e:
+        log.error(f"Error extracting IP from content: {e}")
+        return None
+
+def get_chat_session_id(file_name: str, content: str = None) -> Optional[str]:
+    """Extract the chat_session_id based on the file location:
+    - For files under conv_logs: extract it from the filename
+    - For files under sandbox_logs: read it from the file content
+    """
+    try:
+        if 'conv_logs' in file_name:
+            # Extract from the filename for conv_logs
+            match = re.match(r'conv-log-([a-f0-9]+)\.json', file_name)
+            if match:
+                return match.group(1)
+        elif 'sandbox_logs' in file_name and content:
+            # Read from the file content for sandbox_logs
+            data = json.loads(content)
+            return data['sandbox_state'].get('chat_session_id')
+        return None
+    except Exception as e:
+        log.error(f"Error getting chat_session_id from {file_name}: {e}")
+        return None
+
+def get_sandbox_session_ids(reader: 'RemoteLogReader', date_str: str) -> Set[str]:
+    """Get all chat_session_ids from sandbox logs for a given date"""
+    session_ids = set()
+    try:
+        sandbox_logs = reader.get_sandbox_logs(date_str)
+        for sandbox_log in sandbox_logs:  # named to avoid shadowing the module-level logger
+            if isinstance(sandbox_log, dict):
+                session_id = sandbox_log.get('sandbox_state', {}).get('chat_session_id')
+                if session_id:
+                    session_ids.add(session_id)
+    except Exception as e:
+        log.error(f"Error getting sandbox session IDs for date {date_str}: {e}")
+
+    return session_ids
+
+def get_file_data(content: str) -> Tuple[Optional[str], bool]:
+    """Read file content and return the IP and whether the vote conditions are met"""
+    try:
+        lines = [line.strip() for line in content.split('\n') if line.strip()]
+        if not lines:
+            return None, False
+
+        # Get the IP from the first line
+        try:
+            first_line_data = json.loads(lines[0])
+            ip = first_line_data.get('ip')
+            # Early return if the IP is not in the whitelist
+            if ip not in WHITELIST_IPS:
+                return None, False
+        except json.JSONDecodeError:
+            ip = None
+
+        # Check the vote conditions on the last line
+        try:
+            last_line_data = json.loads(lines[-1])
+            feedback = last_line_data.get('feedback')
+            vote_conditions_met = (last_line_data.get('type') == 'vote' and
+                                   isinstance(feedback, dict) and
+                                   len(feedback) == 6)
+        except json.JSONDecodeError:
+            vote_conditions_met = False
+
+        return ip, vote_conditions_met
+    except Exception as e:
+        log.error(f"Error processing file content: {e}")
+        return None, False
+
+def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
+    """Count completed battles per IP address from the given start date"""
+    # Convert the start date string to a datetime
+    start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
+    ip_counts = defaultdict(int)
+
+    try:
+        # Iterate day by day up to today
+        current_date = start_date
+        today = datetime.now()
+
+        while current_date <= today:
+            date_str = current_date.strftime("%Y_%m_%d")
+
+            try:
+                # Get conversation logs for battle_anony mode
+                conv_logs = reader.get_conv_logs(date_str)
+                battle_anony_logs = conv_logs.get('battle_anony', {})
+
+                # Process each conversation
+                for conv_id, messages in battle_anony_logs.items():
+                    if messages:
+                        # Convert the messages back into JSONL file content
+                        content = '\n'.join(json.dumps(msg) for msg in messages)
+                        ip, vote_conditions_met = get_file_data(content)
+                        if vote_conditions_met and ip:
+                            ip_counts[ip] += 1
+
+            except Exception as e:
+                log.error(f"Error processing logs for date {date_str}: {e}")
+
+            # Move to the next day
+            current_date += timedelta(days=1)
+
+    except Exception as e:
+        log.error(f"Error accessing logs: {e}")
+
+    return dict(ip_counts)
+
+def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18", check_sandbox: bool = True) -> None:
+    """Download files and organize them by IP address
+
+    Args:
+        reader: RemoteLogReader instance
+        start_date_str: The start date in YYYY_MM_DD format
+        check_sandbox: Whether to check for matching sandbox logs
+    """
+    # Create the base data directory
+    data_dir = os.path.join(os.getcwd(), "data")
+    os.makedirs(data_dir, exist_ok=True)
+
+    # Convert the start date string to a datetime
+    start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
+
+    try:
+        # Iterate day by day up to today
+        current_date = start_date
+        today = datetime.now()
+
+        while current_date <= today:
+            date_str = current_date.strftime("%Y_%m_%d")
+
+            # Get all sandbox session IDs for this date
+            sandbox_session_ids = get_sandbox_session_ids(reader, date_str) if check_sandbox else set()
+
+            try:
+                # Get conversation logs for battle_anony mode
+                conv_logs = reader.get_conv_logs(date_str)
+                battle_anony_logs = conv_logs.get('battle_anony', {})
+
+                # Process each conversation
+                for conv_id, messages in battle_anony_logs.items():
+                    if not messages:
+                        continue
+
+                    # Convert the messages back into JSONL file content
+                    content = '\n'.join(json.dumps(msg) for msg in messages)
+                    ip = get_ip_from_jsonl(content)
+
+                    if ip:
+                        # Create the directory structure for this IP
+                        ip_dir = os.path.join(data_dir, ip)
+                        valid_dir = os.path.join(ip_dir, "valid")
+                        invalid_dir = os.path.join(ip_dir, "invalid")
+                        os.makedirs(valid_dir, exist_ok=True)
+                        os.makedirs(invalid_dir, exist_ok=True)
+
+                        # Check whether the chat_session_id exists in the sandbox logs
+                        if check_sandbox:
+                            has_sandbox = conv_id in sandbox_session_ids
+                            target_dir = valid_dir if has_sandbox else invalid_dir
+                        else:
+                            # When sandbox checking is disabled, put everything in valid
+                            target_dir = valid_dir
+
+                        # Save the file
+                        file_name = f"conv-log-{conv_id}.json"
+                        local_file_path = os.path.join(target_dir, file_name)
+                        try:
+                            with open(local_file_path, 'w') as f:
+                                f.write(content)
+                            log.info(f"Saved {file_name} to {target_dir}")
+                        except Exception as e:
+                            log.error(f"Error saving file {file_name}: {e}")
+
+            except Exception as e:
+                log.error(f"Error processing logs for date {date_str}: {e}")
+
+            # Move to the next day
+            current_date += timedelta(days=1)
+
+    except Exception as e:
+        log.error(f"Error accessing logs: {e}")
+
+def main():
+    # Initialize RemoteLogReader
+    reader = RemoteLogReader()
+
+    # Parse optional command-line flags
+    parser = argparse.ArgumentParser(description='Download and organize conversation files by IP')
+    parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
+    parser.add_argument('--download', action='store_true', help='Enable file download')
+    args = parser.parse_args()
+
+    # Download files if enabled
+    if args.download:
+        print("\nDownloading files and organizing by IP address...")
+        download_files_by_ip(reader, check_sandbox=args.sandbox_check)
+
+    # Count and display statistics
+    ip_counts = count_files_per_ip(reader)
+    print("\nFile counts per IP address:")
+    for ip, count in sorted(ip_counts.items(), key=lambda x: x[1], reverse=True):
+        print(f"IP: {ip:<15} Count: {count}")
+
+if __name__ == "__main__":
+    main()
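
A note on the vote check above: count_files_per_ip only counts a conversation when its reconstructed JSONL content starts with a record carrying a whitelisted ip and ends with a vote record whose feedback dict has exactly six entries. Below is a minimal sketch of content that passes both checks in get_file_data; all field values are made up for illustration.

import json

# Hypothetical records; only 'ip', 'type', and 'feedback' matter to get_file_data.
first_line = json.dumps({"type": "chat", "ip": "109.245.193.97"})
last_line = json.dumps({
    "type": "vote",
    # vote_conditions_met requires a dict with exactly six feedback entries
    "feedback": {"q1": 5, "q2": 4, "q3": 5, "q4": 3, "q5": 4, "q6": 5},
})

content = "\n".join([first_line, last_line])
# get_file_data(content) -> ("109.245.193.97", True), since that IP is whitelisted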
data/battle_stats.csv ADDED
@@ -0,0 +1,12 @@
+IP Address,Battle Count
+109.245.193.97,38
+70.50.179.57,13
+175.196.44.217,9
+66.254.231.49,3
+158.195.18.232,2
+2a02:169:3e9:0:6ce8:e76f:faed:c830,1
+2408:8418:6390:7603:40b:555f:774:a05d,1
+2607:fea8:4f40:4b00:e5b9:9806:6b69:233b,1
+175.159.122.63,1
+2a02:842a:24:5a01:8cd6:5b22:1189:6035,1
+2601:600:8d00:9510:1d77:b610:9358:f443,1
data/last_update.txt ADDED
@@ -0,0 +1 @@
+2025-02-21 00:07:06
log_reader.py ADDED
@@ -0,0 +1,118 @@
+'''
+Facade for reading logs on remote storage.
+'''
+
+from collections import defaultdict
+import json
+import os
+from typing import Any
+from azure.storage.fileshare import ShareServiceClient
+
+
+class RemoteLogReader:
+    '''
+    Remote log reader backed by an Azure file share.
+    '''
+
+    LOG_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING") or ""
+    LOG_SHARE_NAME = "swearenalogsfileshare"
+
+    IMAGE_DIR_NAME = "serve_images"
+    '''
+    Directory for storing user-uploaded images.
+    '''
+    CONV_LOG_DIR_NAME = "conv_logs"
+    '''
+    Directory for conversation logs.
+    '''
+    SANDBOX_LOG_DIR_NAME = "sandbox_logs"
+    '''
+    Directory for sandbox logs.
+    '''
+
+    CHAT_MODES = ["battle_anony", "battle_named", "direct"]
+
+    def __init__(
+        self,
+        connection_string: str = LOG_CONNECTION_STRING,
+        share_name: str = LOG_SHARE_NAME,
+    ):
+        if not connection_string:
+            raise ValueError("Connection string is required.")
+        if not share_name:
+            raise ValueError("Share name is required.")
+
+        self.share_service = ShareServiceClient.from_connection_string(
+            conn_str=connection_string)
+        self.share_client = self.share_service.get_share_client(share=share_name)
+
+    def is_conv_log(self, file_name: str) -> bool:
+        return file_name.startswith("conv-log") and file_name.endswith(".json")
+
+    def get_conv_id_from_name(self, file_name: str) -> str:
+        return file_name.split("-")[2].removesuffix(".json")  # not strip('.json'), which removes characters, not a suffix
+
+    def is_sandbox_log(self, file_name: str) -> bool:
+        return file_name.startswith("sandbox-log") and file_name.endswith(".json")
+
+    def get_file_content(self, file_path: str) -> bytes:
+        file_client = self.share_client.get_file_client(file_path)
+        file_content = file_client.download_file().readall()
+        return file_content
+
+    def get_conv_logs(self, date: str) -> dict[str, defaultdict[str, list[Any]]]:
+        '''
+        Return conversation logs for the given date as a dict:
+        mode -> conv_id -> list of log records.
+        '''
+        conv_logs = {
+            mode: defaultdict(list) for mode in self.CHAT_MODES
+        }
+        for mode in self.CHAT_MODES:
+            conv_log_dir = f"{date}/{self.CONV_LOG_DIR_NAME}/{mode}/"
+            # Check if the directory exists
+            if not self.share_client.get_directory_client(conv_log_dir).exists():
+                continue
+            for file in self.share_client.list_directories_and_files(conv_log_dir):
+                if not self.is_conv_log(file.name):
+                    continue
+                conv_id = self.get_conv_id_from_name(file.name)
+                file_content = self.get_file_content(
+                    conv_log_dir + file.name).decode("utf-8").strip(' \n')
+                for line in file_content.split('\n'):
+                    if line:
+                        conv_logs[mode][conv_id].append(json.loads(line))
+        return conv_logs
+
+    def get_sandbox_logs(self, date: str) -> list[Any]:
+        '''
+        Return parsed sandbox logs for the given date.
+        '''
+        sandbox_logs = []
+        sandbox_log_dir = f"{date}/{self.SANDBOX_LOG_DIR_NAME}/"
+        for file in self.share_client.list_directories_and_files(sandbox_log_dir):
+            if self.is_sandbox_log(file.name):
+                file_content = self.get_file_content(
+                    sandbox_log_dir + file.name).decode("utf-8").strip(' \n')
+                sandbox_logs.append(json.loads(file_content))
+        return sandbox_logs
+
+    def get_image(self, image_id: str) -> bytes:
+        '''
+        Return image data based on the image id.
+        '''
+        image_path = f"{self.IMAGE_DIR_NAME}/{image_id}.png"
+        return self.get_file_content(image_path)
+
+
+if __name__ == "__main__":
+    # Example usage
+    log_reader = RemoteLogReader()
+    date = "2025_02_20"
+    conv_logs = log_reader.get_conv_logs(date)
+    sandbox_logs = log_reader.get_sandbox_logs(date)
+    image_data = log_reader.get_image("051fdac24285ff6e219a9ba06d1ac843")
+    print(conv_logs)
+    print(sandbox_logs)
+    print(image_data)
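
For reference, the nested mapping returned by get_conv_logs is what count_files_per_ip in azure_count_ip_data.py iterates over: mode -> conv_id -> list of parsed JSONL records. An illustrative value is sketched below; the conversation id and record fields are invented, not real log data.

from collections import defaultdict

# Roughly the shape of get_conv_logs("2025_02_20"); ids and fields are made up.
conv_logs = {
    "battle_anony": defaultdict(list, {
        "1a2b3c4d": [
            {"type": "chat", "ip": "109.245.193.97"},
            {"type": "vote", "feedback": {"q1": 5}},
        ],
    }),
    "battle_named": defaultdict(list),
    "direct": defaultdict(list),
}

# Consumers index by mode, then conversation id:
for conv_id, records in conv_logs["battle_anony"].items():
    print(conv_id, len(records))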
requirements.txt CHANGED
@@ -1,3 +1,3 @@
 gradio>=4.0.0
 pandas>=2.0.0
-smbprotocol
+azure-storage-file-share