Terry Zhuo committed
Commit 46e626f · 1 Parent(s): ed574fd
app.py CHANGED
@@ -3,7 +3,7 @@ import gradio as gr
 import pandas as pd
 from datetime import datetime
 import time
-from azure_count_ip_data import count_files_per_ip
+from azure_count_ip_data import count_files_per_annotator
 from log_reader import RemoteLogReader
 import threading
 
@@ -36,12 +36,10 @@ def update_stats():
     try:
         # Initialize RemoteLogReader
        reader = RemoteLogReader()
-
-        # Get IP counts using Azure storage
-        ip_counts = count_files_per_ip(reader)
-
+        # Get annotator counts using Azure storage
+        annotator_counts = count_files_per_annotator(reader)
         # Convert to DataFrame for better display
-        df = pd.DataFrame(list(ip_counts.items()), columns=['Annotator', 'Battle Count'])
+        df = pd.DataFrame(list(annotator_counts.items()), columns=['Annotator', 'Battle Count'])
         df = df.sort_values('Battle Count', ascending=False)
 
         # Get current time
@@ -88,7 +86,7 @@ def create_ui():
 
     with gr.Blocks(title="Battle Count Statistics") as app:
         gr.Markdown("# Battle Count Statistics")
-        gr.Markdown("Displays the count of valid battles per IP address. Updates automatically every hour.")
+        gr.Markdown("Displays the count of valid battles per annotator. Updates automatically every hour.")
 
         with gr.Row():
            last_update = gr.Textbox(
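
The app-side change is a rename plus re-keying: the stats table is now keyed by annotator name rather than raw IP address. A minimal sketch of the new call path, assuming RemoteLogReader() constructs without arguments exactly as in update_stats() above:

    import pandas as pd

    from azure_count_ip_data import count_files_per_annotator
    from log_reader import RemoteLogReader

    reader = RemoteLogReader()
    # Returns a {annotator_name: valid_battle_count} dict, e.g. {"Max Tian": 12, ...}
    annotator_counts = count_files_per_annotator(reader)
    df = pd.DataFrame(list(annotator_counts.items()), columns=["Annotator", "Battle Count"])
    df = df.sort_values("Battle Count", ascending=False)
    print(df.to_string(index=False))
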
azure_count_ip_data.py CHANGED
@@ -6,7 +6,7 @@ from collections import defaultdict
 import shutil
 import re
 import argparse
-from typing import Dict, Set, Tuple, Optional
+from typing import Dict, Set, Tuple, Optional, List, Union
 from log_reader import RemoteLogReader
 
 # List of IP addresses we care about
@@ -16,24 +16,48 @@ WHITELIST_IPS_DICT = {
     "Kenneth Hamilton": ["109.245.193.97"],
     "Marek Suppa": ["158.195.18.232"],
     "Max Tian": ["2607:fea8:4f40:4b00:e5b9:9806:6b69:233b", "2607:fea8:4f40:4b00:bcef:571:6124:f01", "2607:fea8:7c9d:3800:d9c0:7295:3e2e:6287", "2607:fea8:7c9d:3800:fd51:2c91:c9e2:3c8"],
-    "Mengzhao Jia": ["66.254.231.49", "160.32.74.89"],
-    "Noah Ziems": ["2601:245:c500:92c0:633c:c0d2:dcc1:1f48", "2601:245:c500:92c0:961e:9ac7:e02:c266", "2601:245:c500:92c0:ae74:d1d5:ca3b:da90"],
+    "Mengzhao Jia": ["66.254.231.49"],
+    "Noah Ziems": ["2601:245:c500:92c0:633c:c0d2:dcc1:1f48", "2601:245:c500:92c0:961e:9ac7:e02:c266"],
     "Sabina A": ["175.196.44.217", "58.235.174.122", "14.52.175.55"],
-    "Wenhao Yu": ["2601:600:8d00:9510:1d77:b610:9358:f443", "2601:600:8d00:9510:513f:6c4e:5960:fdc7", "174.164.6.99"],
+    "Wenhao Yu": ["2601:600:8d00:9510:1d77:b610:9358:f443"],
     "Vaisakhi Mishra": ["74.90.222.68"],
     "Kumar Shridhar": ["129.132.145.250"],
     "Viktor Gal": ["2a02:169:3e9:0:6ce8:e76f:faed:c830"],
-    "Guangyu Song": ["70.50.179.57", "209.226.139.83"],
-    "Bhupesh Bishnoi": ["37.65.177.22", "195.220.58.237", "194.57.114.147", "195.220.58.234"],
+    "Guangyu Song": ["70.50.179.57"],
+    "Bhupesh Bishnoi": ["2a02:842a:24:5a01:8cd6:5b22:1189:6035", "192.168.1.8"],
     "Zheng Liu": ["128.143.71.67"],
-    "Ming Xu": ["2601:600:8d00:9510:185b:955d:275b:7685", "2601:600:8d00:9510:5150:468c:ab7d:518d"],
-    "Ayush Sunil Munot": ["10.145.76.56"],
-    "Saiteja Utpala": ["192.168.31.185"]
+    "Ming Xu": ["10.0.0.243"],
+    "Ayush Sunil Munot": ["10.145.76.56"]
+}
+
+# Username whitelist for each annotator
+WHITELIST_USERNAMES_DICT = {
+    "Chen Gong": [],
+    "Juyong Jiang": [],
+    "Kenneth Hamilton": [],
+    "Marek Suppa": [],
+    "Max Tian": [],
+    "Mengzhao Jia": [],
+    "Noah Ziems": [],
+    "Sabina A": [],
+    "Wenhao Yu": [],
+    "Vaisakhi Mishra": [],
+    "Kumar Shridhar": [],
+    "Viktor Gal": [],
+    "Guangyu Song": [],
+    "Bhupesh Bishnoi": [],
+    "Zheng Liu": [],
+    "Ming Xu": [],
+    "Ayush Sunil Munot": [],
+    "Terry Yue Zhuo": ["test"]
 }
 
 # Flatten IP list for backward compatibility
 WHITELIST_IPS = [ip for ips in WHITELIST_IPS_DICT.values() for ip in ips]
 
+# Flatten username list for backward compatibility
+WHITELIST_USERNAMES = [username for usernames in WHITELIST_USERNAMES_DICT.values() for username in usernames]
+
 logging.basicConfig(level=logging.WARNING)
 log = logging.getLogger(__name__)
 
@@ -47,6 +71,21 @@ def get_ip_from_jsonl(content: str) -> Optional[str]:
         log.error(f"Error extracting IP from content: {e}")
         return None
 
+def get_username_from_jsonl(content: str) -> Optional[str]:
+    """Extract username from the last line of a JSONL content if it's a vote"""
+    try:
+        lines = [line.strip() for line in content.split('\n') if line.strip()]
+        if not lines:
+            return None
+
+        last_line = json.loads(lines[-1])
+        if last_line.get('type') == 'vote':
+            return last_line.get('username')
+        return None
+    except Exception as e:
+        log.error(f"Error extracting username from content: {e}")
+        return None
+
 def get_chat_session_id(file_name: str, content: str = None) -> Optional[str]:
     """Extract chat_session_id based on the file location:
     - For files under conv_logs: extract from filename
@@ -82,52 +121,86 @@ def get_sandbox_session_ids(reader: 'RemoteLogReader', date_str: str) -> Set[str
 
     return session_ids
 
-def get_file_data(content: str) -> Tuple[Optional[str], bool]:
-    """Read file content and return IP and vote condition status"""
+def get_file_data(content: str) -> Tuple[Optional[str], Optional[str], bool]:
+    """Read file content and return IP, username, and vote condition status"""
     try:
         lines = [line.strip() for line in content.split('\n') if line.strip()]
         if not lines:
-            return None, False
+            return None, None, False
 
         # Get IP from first line
         try:
             first_line_data = json.loads(lines[0])
             ip = first_line_data.get('ip')
-            # Early return if IP is not in whitelist
-            if ip not in WHITELIST_IPS:
-                return None, False
         except json.JSONDecodeError:
             ip = None
 
-        # Check vote conditions from last line
+        # Early check if IP is in whitelist
+        ip_in_whitelist = ip in WHITELIST_IPS
+
+        # Check vote conditions from last line and get username if available
         try:
             last_line_data = json.loads(lines[-1])
-            feedback = last_line_data.get('feedback')
-            vote_conditions_met = (last_line_data.get('type') == 'vote' and
-                                 isinstance(feedback, dict) and
-                                 len(feedback) == 6)
-        except json.JSONDecodeError:
+            username = None
+
+            if last_line_data.get('type') == 'vote':
+                # Only try to get username if the key exists
+                if 'username' in last_line_data:
+                    username = last_line_data.get('username')
+
+                feedback = last_line_data.get('feedback')
+
+                # Check vote conditions: type is vote, feedback has 6 items, and at least 4 lines (2 rounds of chat)
+                vote_conditions_met = (
+                    isinstance(feedback, dict) and
+                    len(feedback) == 6 and
+                    len(lines) >= 4
+                )
+            else:
+                vote_conditions_met = False
+
+        except (json.JSONDecodeError, TypeError):
+            username = None
             vote_conditions_met = False
 
-        return ip, vote_conditions_met
+        # Check if username is in whitelist (if username exists)
+        username_in_whitelist = username in WHITELIST_USERNAMES if username else False
+
+        # Early return if neither IP nor username is in whitelist
+        if not (ip_in_whitelist or username_in_whitelist):
+            return ip, username, False
+
+        return ip, username, vote_conditions_met
     except Exception as e:
         log.error(f"Error processing file content: {e}")
-        return None, False
+        return None, None, False
+
+def get_annotator_name(ip: Optional[str], username: Optional[str]) -> Optional[str]:
+    """Get annotator name from IP or username"""
+    # Check IP first
+    if ip:
+        for name, ips in WHITELIST_IPS_DICT.items():
+            if ip in ips:
+                return name
+
+    # Check username if IP didn't match
+    if username:
+        for name, usernames in WHITELIST_USERNAMES_DICT.items():
+            if username in usernames:
+                return name
+
+    return None
 
-def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
-    """Count files per name from the given start date"""
+def count_files_per_annotator(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
+    """Count files per annotator name from the given start date, considering both IP and username"""
     # Convert start date string to datetime
     start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
     name_counts = defaultdict(int)
-
     try:
         # Get current date for iteration
         current_date = start_date
         today = datetime.now()
 
-        # Create reverse mapping of IP to name
-        ip_to_name = {ip: name for name, ips in WHITELIST_IPS_DICT.items() for ip in ips}
-
         while current_date <= today:
             date_str = current_date.strftime("%Y_%m_%d")
 
@@ -135,16 +208,18 @@ def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02
                 # Get conversation logs for battle_anony mode
                 conv_logs = reader.get_conv_logs(date_str)
                 battle_anony_logs = conv_logs.get('battle_anony', {})
-
                 # Process each conversation
                 for conv_id, messages in battle_anony_logs.items():
                     if messages:
                         # Convert messages to file content format
                         content = '\n'.join(json.dumps(msg) for msg in messages)
-                        ip, vote_conditions_met = get_file_data(content)
-                        if vote_conditions_met and ip and ip in ip_to_name:
-                            name = ip_to_name[ip]
-                            name_counts[name] += 1
+                        ip, username, vote_conditions_met = get_file_data(content)
+
+                        if vote_conditions_met:
+                            # Get annotator name from either IP or username
+                            annotator_name = get_annotator_name(ip, username)
+                            if annotator_name:
+                                name_counts[annotator_name] += 1
 
         except Exception as e:
             log.error(f"Error processing logs for date {date_str}: {e}")
@@ -169,9 +244,6 @@ def download_files_by_name(reader: 'RemoteLogReader', start_date_str: str = "202
     data_dir = os.path.join(os.getcwd(), "data")
     os.makedirs(data_dir, exist_ok=True)
 
-    # Create reverse mapping of IP to name
-    ip_to_name = {ip: name for name, ips in WHITELIST_IPS_DICT.items() for ip in ips}
-
     # Convert start date string to datetime
     start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
 
@@ -199,11 +271,14 @@ def download_files_by_name(reader: 'RemoteLogReader', start_date_str: str = "202
                         # Convert messages to file content
                         content = '\n'.join(json.dumps(msg) for msg in messages)
                         ip = get_ip_from_jsonl(content)
+                        username = get_username_from_jsonl(content)
+
+                        # Get annotator name from either IP or username
+                        annotator_name = get_annotator_name(ip, username)
 
-                        if ip and ip in ip_to_name:
-                            name = ip_to_name[ip]
+                        if annotator_name:
                             # Create directory structure for this name
-                            name_dir = os.path.join(data_dir, name)
+                            name_dir = os.path.join(data_dir, annotator_name)
                             valid_dir = os.path.join(name_dir, "valid")
                             invalid_dir = os.path.join(name_dir, "invalid")
                             os.makedirs(valid_dir, exist_ok=True)
@@ -252,7 +327,7 @@ def main():
         download_files_by_name(reader, check_sandbox=args.sandbox_check)
 
     # Count and display statistics
-    name_counts = count_files_per_ip(reader)
+    name_counts = count_files_per_annotator(reader)
     print("\nFile counts per annotator:")
     for name, count in sorted(name_counts.items(), key=lambda x: x[1], reverse=True):
         print(f"Name: {name:<20} Count: {count}")
count_ip_data.py DELETED
@@ -1,288 +0,0 @@
1
- import os
2
- import logging
3
- from datetime import datetime, timedelta
4
- from urllib.parse import unquote
5
- import json
6
- from collections import defaultdict
7
- import smbclient
8
- import shutil
9
- import re
10
- import argparse
11
-
12
- # List of IP addresses we care about
13
- WHITELIST_IPS = [
14
- "199.111.212.5",
15
- "175.159.122.63",
16
- "109.245.193.97",
17
- "158.195.18.232",
18
- "2607:fea8:4f40:4b00:e5b9:9806:6b69:233b",
19
- "66.254.231.49",
20
- "129.74.154.194",
21
- "175.196.44.217",
22
- "2601:600:8d00:9510:1d77:b610:9358:f443",
23
- "74.90.222.68",
24
- "2a02:169:3e9:0:6ce8:e76f:faed:c830",
25
- "70.50.179.57",
26
- "2a02:842a:24:5a01:8cd6:5b22:1189:6035",
27
- "2408:8418:6390:7603:40b:555f:774:a05d"
28
- ]
29
-
30
- logging.basicConfig(level=logging.WARNING)
31
- log = logging.getLogger(__name__)
32
-
33
- def get_ip_from_jsonl(file_path):
34
- """Extract IP from the first line of a JSONL file"""
35
- try:
36
- with smbclient.open_file(file_path, mode='r') as f:
37
- first_line = f.readline()
38
- data = json.loads(first_line)
39
- return data.get('ip')
40
- except Exception as e:
41
- log.error(f"Error reading file {file_path}: {e}")
42
- return None
43
-
44
- def get_chat_session_id(file_path):
45
- """Extract chat_session_id based on the file location:
46
- - For files under conv_logs: extract from filename
47
- - For files under sandbox_logs: read from file content
48
- """
49
- try:
50
- if 'conv_logs' in file_path:
51
- # Extract from filename for conv_logs
52
- # Handle Windows UNC path format
53
- filename = file_path.split('\\')[-1] # Get the last component of the path
54
- match = re.match(r'conv-log-([a-f0-9]+)\.json', filename)
55
- if match:
56
- return match.group(1)
57
- elif 'sandbox_logs' in file_path:
58
- # Read from file content for sandbox_logs
59
- with smbclient.open_file(file_path, mode='r') as f:
60
- data = json.loads(f.read())
61
- return data['sandbox_state'].get('chat_session_id')
62
- return None
63
- except Exception as e:
64
- log.error(f"Error getting chat_session_id from {file_path}: {e}")
65
- return None
66
-
67
- def get_sandbox_session_ids(server, share, date_str):
68
- """Get all chat_session_ids from sandbox logs for a given date"""
69
- sandbox_folder = f"\\\\{server}\\{share}\\{date_str}\\sandbox_logs"
70
- session_ids = set()
71
-
72
- if not smbclient.path.exists(sandbox_folder):
73
- return session_ids
74
-
75
- try:
76
- for file_info in smbclient.scandir(sandbox_folder):
77
- if file_info.name.endswith('.json'):
78
- file_path = f"{sandbox_folder}\\{file_info.name}"
79
- session_id = get_chat_session_id(file_path)
80
- if session_id:
81
- session_ids.add(session_id)
82
- except Exception as e:
83
- log.error(f"Error scanning sandbox folder {sandbox_folder}: {e}")
84
-
85
- return session_ids
86
-
87
- def check_vote_conditions(file_path):
88
- """Check if the last line of the file has type:vote and feedback dict with 6 keys"""
89
- try:
90
- with smbclient.open_file(file_path, mode='r') as f:
91
- # Read all lines and get the last non-empty line
92
- lines = [line.strip() for line in f if line.strip()]
93
- if not lines:
94
- return False
95
- last_line = lines[-1]
96
- try:
97
- data = json.loads(last_line)
98
- feedback = data.get('feedback')
99
- return (data.get('type') == 'vote' and
100
- isinstance(feedback, dict) and
101
- len(feedback) == 6)
102
- except json.JSONDecodeError:
103
- return False
104
- except Exception as e:
105
- log.error(f"Error checking vote conditions in file {file_path}: {e}")
106
- return False
107
-
108
- def get_file_data(file_path):
109
- """Read file and return IP and vote condition status"""
110
- try:
111
- with smbclient.open_file(file_path, mode='r') as f:
112
- lines = [line.strip() for line in f if line.strip()]
113
- if not lines:
114
- return None, False
115
-
116
- # Get IP from first line
117
- try:
118
- first_line_data = json.loads(lines[0])
119
- ip = first_line_data.get('ip')
120
- # Early return if IP is not in whitelist
121
- if ip not in WHITELIST_IPS:
122
- return None, False
123
- except json.JSONDecodeError:
124
- ip = None
125
-
126
- # Check vote conditions from last line
127
- try:
128
- last_line_data = json.loads(lines[-1])
129
- feedback = last_line_data.get('feedback')
130
- vote_conditions_met = (last_line_data.get('type') == 'vote' and
131
- isinstance(feedback, dict) and
132
- len(feedback) == 6)
133
- except json.JSONDecodeError:
134
- vote_conditions_met = False
135
-
136
- return ip, vote_conditions_met
137
- except Exception as e:
138
- log.error(f"Error reading file {file_path}: {e}")
139
- return None, False
140
-
141
- def count_files_per_ip(smb_url, start_date_str="2025_02_18"):
142
- """Count files per IP address from the given start date"""
143
- # Remove 'smb://' prefix and parse URL components
144
- url = smb_url[6:]
145
- creds_server, share = url.split('/', 1)
146
- creds, server = creds_server.rsplit('@', 1)
147
- username, password = creds.split(':', 1)
148
- password = unquote(password)
149
-
150
- # Register the SMB session
151
- smbclient.register_session(server, username=username, password=password, port=8080)
152
-
153
- # Convert start date string to datetime
154
- start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
155
- ip_counts = defaultdict(int)
156
-
157
- try:
158
- # Get current date for iteration
159
- current_date = start_date
160
- today = datetime.now()
161
-
162
- while current_date <= today:
163
- date_str = current_date.strftime("%Y_%m_%d")
164
- folder_path = f"\\\\{server}\\{share}\\{date_str}\\conv_logs\\battle_anony"
165
-
166
- try:
167
- # List all JSON files in the battle_anony folder
168
- if smbclient.path.exists(folder_path):
169
- for file_info in smbclient.scandir(folder_path, search_pattern="conv-log-*.json"):
170
- file_path = f"{folder_path}\\{file_info.name}"
171
- ip, vote_conditions_met = get_file_data(file_path)
172
- if vote_conditions_met and ip:
173
- ip_counts[ip] += 1
174
- except Exception as e:
175
- log.error(f"Error processing folder {date_str}: {e}")
176
-
177
- # Move to next day
178
- current_date += timedelta(days=1)
179
-
180
- except Exception as e:
181
- log.error(f"Error accessing SMB share: {e}")
182
-
183
- return dict(ip_counts)
184
-
185
- def download_files_by_ip(smb_url, start_date_str="2025_02_18", check_sandbox=True):
186
- """Download files and organize them by IP address
187
-
188
- Args:
189
- smb_url (str): The SMB URL to connect to
190
- start_date_str (str): The start date in YYYY_MM_DD format
191
- check_sandbox (bool): Whether to check for matching sandbox logs
192
- """
193
- # Remove 'smb://' prefix and parse URL components
194
- url = smb_url[6:]
195
- creds_server, share = url.split('/', 1)
196
- creds, server = creds_server.rsplit('@', 1)
197
- username, password = creds.split(':', 1)
198
- password = unquote(password)
199
-
200
- # Register the SMB session
201
- smbclient.register_session(server, username=username, password=password)
202
-
203
- # Create base data directory
204
- data_dir = os.path.join(os.getcwd(), "data")
205
- os.makedirs(data_dir, exist_ok=True)
206
-
207
- # Convert start date string to datetime
208
- start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
209
-
210
- try:
211
- # Get current date for iteration
212
- current_date = start_date
213
- today = datetime.now()
214
-
215
- while current_date <= today:
216
- date_str = current_date.strftime("%Y_%m_%d")
217
- folder_path = f"\\\\{server}\\{share}\\{date_str}\\conv_logs\\battle_anony"
218
-
219
- # Get all sandbox session IDs for this date
220
- sandbox_session_ids = get_sandbox_session_ids(server, share, date_str) if check_sandbox else set()
221
- try:
222
- # List all JSON files in the battle_anony folder
223
- if smbclient.path.exists(folder_path):
224
- for file_info in smbclient.scandir(folder_path):
225
- # Skip macOS metadata files
226
- if file_info.name.startswith('._'):
227
- continue
228
- if file_info.name.endswith('.json'):
229
- file_path = f"{folder_path}\\{file_info.name}"
230
- ip = get_ip_from_jsonl(file_path)
231
- if ip:
232
- # Create directory structure for this IP
233
- ip_dir = os.path.join(data_dir, ip)
234
- valid_dir = os.path.join(ip_dir, "valid")
235
- invalid_dir = os.path.join(ip_dir, "invalid")
236
- os.makedirs(valid_dir, exist_ok=True)
237
- os.makedirs(invalid_dir, exist_ok=True)
238
-
239
- # Check if chat_session_id exists in sandbox logs
240
- if check_sandbox:
241
- chat_session_id = get_chat_session_id(file_path)
242
- has_sandbox = chat_session_id in sandbox_session_ids if chat_session_id else False
243
- target_dir = valid_dir if has_sandbox else invalid_dir
244
- else:
245
- # When sandbox checking is disabled, put everything in valid
246
- target_dir = valid_dir
247
-
248
- # Download the file
249
- local_file_path = os.path.join(target_dir, file_info.name)
250
- try:
251
- with smbclient.open_file(file_path, mode='rb') as remote_file:
252
- with open(local_file_path, 'wb') as local_file:
253
- shutil.copyfileobj(remote_file, local_file)
254
- log.info(f"Downloaded {file_info.name} to {target_dir}")
255
- except Exception as e:
256
- log.error(f"Error downloading file {file_info.name}: {e}")
257
-
258
- except Exception as e:
259
- log.error(f"Error processing folder {date_str}: {e}")
260
-
261
- # Move to next day
262
- current_date += timedelta(days=1)
263
-
264
- except Exception as e:
265
- log.error(f"Error accessing SMB share: {e}")
266
-
267
- def main():
268
- smb_url = os.getenv("SMB_URL")
269
-
270
- # Add argument parser for optional parameters
271
- parser = argparse.ArgumentParser(description='Download and organize conversation files by IP')
272
- parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
273
- parser.add_argument('--download', action='store_true', help='Enable file download')
274
- args = parser.parse_args()
275
-
276
- # Download files if enabled
277
- if args.download:
278
- print("\nDownloading files and organizing by IP address...")
279
- download_files_by_ip(smb_url, check_sandbox=args.sandbox_check)
280
-
281
- # Count and display statistics
282
- ip_counts = count_files_per_ip(smb_url)
283
- print("\nFile counts per IP address:")
284
- for ip, count in sorted(ip_counts.items(), key=lambda x: x[1], reverse=True):
285
- print(f"IP: {ip:<15} Count: {count}")
286
-
287
- if __name__ == "__main__":
288
- main()
data/battle_stats.csv ADDED
@@ -0,0 +1 @@
+Annotator,Battle Count
data/last_update.txt ADDED
@@ -0,0 +1 @@
+2025-03-12 17:35:20