dejanseo committed (verified)
Commit f196edb · Parent: fb39a51

Upload 7 files

training/train_1.py ADDED
@@ -0,0 +1,192 @@
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
+ from sklearn.model_selection import train_test_split
+ import numpy as np
+ import os
+ from tqdm.auto import tqdm
+ import streamlit as st
+ import matplotlib.pyplot as plt
+
+ # Constants
+ EPOCHS = 10
+ VAL_SPLIT = 0.1
+ VAL_EVERY_STEPS = 1000
+ BATCH_SIZE = 38
+ LEARNING_RATE = 5e-5
+ LOG_EVERY_STEP = True
+ SAVE_CHECKPOINTS = True
+ MAX_SEQ_LENGTH = 512
+ EARLY_STOPPING_PATIENCE = 3
+ MODEL_NAME = 'albert/albert-base-v2'
+ LEVEL = 1
+ OUTPUT_DIR = f'level{LEVEL}'
+
+ # Ensure output directory exists
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # Load data
+ df = pd.read_csv(f'level_{LEVEL}.csv')
+ df.rename(columns={'response': 'text'}, inplace=True)
+
+ # Get unique labels and create mapping
+ labels = sorted(df[str(LEVEL)].unique())
+ label_to_index = {label: i for i, label in enumerate(labels)}
+ index_to_label = {i: label for label, i in label_to_index.items()}
+ num_labels = len(labels)
+
+ # Save label mapping
+ np.save(os.path.join(OUTPUT_DIR, 'label_map.npy'), label_to_index)
+
+ # Prepare data for training
+ df['label'] = df[str(LEVEL)].map(label_to_index)
+ train_df, val_df = train_test_split(df, test_size=VAL_SPLIT, random_state=42)
+
+ # Tokenizer
+ tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
+
+ class TaxonomyDataset(Dataset):
+     def __init__(self, dataframe, tokenizer, max_len):
+         self.data = dataframe
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         text = str(self.data.iloc[index].text)
+         label = int(self.data.iloc[index].label)
+         encoding = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt'
+         )
+         return {
+             'input_ids': encoding['input_ids'].flatten(),
+             'attention_mask': encoding['attention_mask'].flatten(),
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
+
+ # Create datasets and dataloaders
+ train_dataset = TaxonomyDataset(train_df, tokenizer, MAX_SEQ_LENGTH)
+ val_dataset = TaxonomyDataset(val_df, tokenizer, MAX_SEQ_LENGTH)
+
+ train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+ val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+
+ # Model
+ model = AlbertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Optimizer and scheduler
+ optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
+ total_steps = len(train_dataloader) * EPOCHS
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
+
+ # Loss tracking
+ train_losses = []
+ val_losses = []
+ val_steps = []
+ best_val_loss = float('inf')
+ early_stopping_counter = 0
+ global_step = 0
+
+ # Streamlit setup
+ st.title(f'Level {LEVEL} Model Training')
+ progress_bar = st.progress(0)
+ status_text = st.empty()
+ train_loss_fig, train_loss_ax = plt.subplots()
+ val_loss_fig, val_loss_ax = plt.subplots()
+ train_loss_chart = st.pyplot(train_loss_fig)
+ val_loss_chart = st.pyplot(val_loss_fig)
+
+ def update_loss_charts():
+     train_loss_ax.clear()
+     train_loss_ax.plot(range(len(train_losses)), train_losses)
+     train_loss_ax.set_xlabel("Steps")
+     train_loss_ax.set_ylabel("Loss")
+     train_loss_ax.set_title("Training Loss")
+     train_loss_chart.pyplot(train_loss_fig)
+
+     val_loss_ax.clear()
+     val_loss_ax.plot(val_steps, val_losses)
+     val_loss_ax.set_xlabel("Steps")
+     val_loss_ax.set_ylabel("Loss")
+     val_loss_ax.set_title("Validation Loss")
+     val_loss_chart.pyplot(val_loss_fig)
+
+ # Training loop
+ for epoch in range(EPOCHS):
+     model.train()
+     total_train_loss = 0
+     for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False):
+         optimizer.zero_grad()
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         labels = batch['labels'].to(device)
+         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+         loss = outputs.loss
+         total_train_loss += loss.item()
+         loss.backward()
+         optimizer.step()
+         scheduler.step()
+         global_step += 1
+
+         train_losses.append(loss.item())
+
+         if LOG_EVERY_STEP:
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}")
+             update_loss_charts()
+
+         if global_step % VAL_EVERY_STEPS == 0:
+             model.eval()
+             total_val_loss = 0
+             with torch.no_grad():
+                 for val_batch in val_dataloader:
+                     input_ids = val_batch['input_ids'].to(device)
+                     attention_mask = val_batch['attention_mask'].to(device)
+                     labels = val_batch['labels'].to(device)
+                     outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+                     total_val_loss += outputs.loss.item()
+
+             avg_val_loss = total_val_loss / len(val_dataloader)
+             val_losses.append(avg_val_loss)
+             val_steps.append(global_step)
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}")
+             update_loss_charts()
+
+             if SAVE_CHECKPOINTS:
+                 checkpoint_dir = os.path.join(OUTPUT_DIR, f'level{LEVEL}_step{global_step}')
+                 os.makedirs(checkpoint_dir, exist_ok=True)
+                 model.save_pretrained(checkpoint_dir)
+                 tokenizer.save_pretrained(checkpoint_dir)
+                 status_text.text(f"Checkpoint saved at step {global_step}")
+
+             if avg_val_loss < best_val_loss:
+                 best_val_loss = avg_val_loss
+                 early_stopping_counter = 0
+             else:
+                 early_stopping_counter += 1
+                 if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
+                     status_text.text(f"Early stopping triggered at step {global_step}")
+                     progress_bar.progress(100)
+                     # Save final model before stopping
+                     model.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+                     tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+                     exit()  # Stop training
+         progress_bar.progress(int((global_step / total_steps) * 100))
+
+     avg_train_loss = total_train_loss / len(train_dataloader)
+     print(f'Epoch {epoch+1}/{EPOCHS} Average Training Loss: {avg_train_loss:.4f}')
+
+ # Save final model
+ model.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+ tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+ status_text.success("Training complete!")
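
For reference, train_1.py writes its final weights and tokenizer with save_pretrained() to level1/model and the label mapping to level1/label_map.npy. A minimal inference sketch against those artifacts follows; the paths match the script's defaults, but the example text is an assumption, not part of the upload.

    import numpy as np
    import torch
    from transformers import AlbertTokenizer, AlbertForSequenceClassification

    # Rebuild the index -> label mapping saved by the training script.
    label_to_index = np.load('level1/label_map.npy', allow_pickle=True).item()
    index_to_label = {i: label for label, i in label_to_index.items()}

    tokenizer = AlbertTokenizer.from_pretrained('level1/model')
    model = AlbertForSequenceClassification.from_pretrained('level1/model')
    model.eval()

    encoding = tokenizer('example page text', truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        logits = model(**encoding).logits
    print(index_to_label[int(logits.argmax(dim=-1))])
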
training/train_2.py ADDED
@@ -0,0 +1,232 @@
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import AlbertTokenizer, AlbertModel, AdamW, get_linear_schedule_with_warmup
+ from sklearn.model_selection import train_test_split
+ import numpy as np
+ import os
+ from tqdm.auto import tqdm
+ import streamlit as st
+ import matplotlib.pyplot as plt
+ import torch.nn as nn
+
+ # Constants
+ EPOCHS = 10
+ VAL_SPLIT = 0.1
+ VAL_EVERY_STEPS = 1000
+ BATCH_SIZE = 38
+ LEARNING_RATE = 1e-5
+ LOG_EVERY_STEP = True
+ SAVE_CHECKPOINTS = True
+ MAX_SEQ_LENGTH = 512
+ EARLY_STOPPING_PATIENCE = 3
+ MODEL_NAME = 'albert/albert-base-v2'
+ LEVEL = 2
+ OUTPUT_DIR = f'level{LEVEL}'
+
+ # Ensure output directory exists
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # Load data
+ df = pd.read_csv(f'level_{LEVEL}.csv')
+ df.rename(columns={'response': 'text'}, inplace=True)
+
+ # Get unique labels for current level and create mapping
+ labels = sorted(df[str(LEVEL)].unique())
+ label_to_index = {label: i for i, label in enumerate(labels)}
+ index_to_label = {i: label for label, i in label_to_index.items()}
+ num_labels = len(labels)
+
+ # Save label mapping for current level
+ np.save(os.path.join(OUTPUT_DIR, 'label_map.npy'), label_to_index)
+
+ # Load parent level ID mapping
+ parent_level = LEVEL - 1
+ parent_label_to_index = np.load(f'level{parent_level}/label_map.npy', allow_pickle=True).item()
+ num_parent_labels = len(parent_label_to_index)
+
+ # Prepare data for training
+ df['label'] = df[str(LEVEL)].map(label_to_index)
+ train_df, val_df = train_test_split(df, test_size=VAL_SPLIT, random_state=42)
+
+ # Tokenizer
+ tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
+
+ class TaxonomyDataset(Dataset):
+     def __init__(self, dataframe, tokenizer, max_len, parent_label_to_index):
+         self.data = dataframe
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+         self.parent_label_to_index = parent_label_to_index
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         text = str(self.data.iloc[index].text)
+         label = int(self.data.iloc[index].label)
+         parent_id = int(self.data.iloc[index][str(LEVEL - 1)])
+
+         encoding = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt'
+         )
+
+         # One-hot encode parent ID
+         parent_one_hot = torch.zeros(len(self.parent_label_to_index))
+         if parent_id != 0:
+             parent_index = self.parent_label_to_index.get(parent_id)
+             if parent_index is not None:
+                 parent_one_hot[parent_index] = 1
+
+         return {
+             'input_ids': encoding['input_ids'].flatten(),
+             'attention_mask': encoding['attention_mask'].flatten(),
+             'parent_ids': parent_one_hot,
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
+
+ # Create datasets and dataloaders
+ train_dataset = TaxonomyDataset(train_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+ val_dataset = TaxonomyDataset(val_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+
+ train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+ val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+
+ # Model Definition
+ class TaxonomyClassifier(nn.Module):
+     def __init__(self, base_model_name, num_parent_labels, num_labels):
+         super().__init__()
+         self.albert = AlbertModel.from_pretrained(base_model_name)
+         self.dropout = nn.Dropout(0.1)
+         self.classifier = nn.Linear(self.albert.config.hidden_size + num_parent_labels, num_labels)
+
+     def forward(self, input_ids, attention_mask, parent_ids):
+         outputs = self.albert(input_ids, attention_mask=attention_mask)
+         pooled_output = outputs.pooler_output
+         pooled_output = self.dropout(pooled_output)
+         combined_features = torch.cat((pooled_output, parent_ids), dim=1)
+         logits = self.classifier(combined_features)
+         return logits
+
+ # Model Initialization
+ model = TaxonomyClassifier(MODEL_NAME, num_parent_labels, num_labels)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Optimizer and scheduler
+ optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
+ total_steps = len(train_dataloader) * EPOCHS
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
+
+ # Loss Function
+ loss_fn = nn.CrossEntropyLoss()
+
+ # Loss tracking
+ train_losses = []
+ val_losses = []
+ val_steps = []
+ best_val_loss = float('inf')
+ early_stopping_counter = 0
+ global_step = 0
+
+ # Streamlit setup
+ st.title(f'Level {LEVEL} Model Training')
+ progress_bar = st.progress(0)
+ status_text = st.empty()
+ train_loss_fig, train_loss_ax = plt.subplots()
+ val_loss_fig, val_loss_ax = plt.subplots()
+ train_loss_chart = st.pyplot(train_loss_fig)
+ val_loss_chart = st.pyplot(val_loss_fig)
+
+ def update_loss_charts():
+     train_loss_ax.clear()
+     train_loss_ax.plot(range(len(train_losses)), train_losses)
+     train_loss_ax.set_xlabel("Steps")
+     train_loss_ax.set_ylabel("Loss")
+     train_loss_ax.set_title("Training Loss")
+     train_loss_chart.pyplot(train_loss_fig)
+
+     val_loss_ax.clear()
+     val_loss_ax.plot(val_steps, val_losses)
+     val_loss_ax.set_xlabel("Steps")
+     val_loss_ax.set_ylabel("Loss")
+     val_loss_ax.set_title("Validation Loss")
+     val_loss_chart.pyplot(val_loss_fig)
+
+ # Training loop
+ for epoch in range(EPOCHS):
+     model.train()
+     total_train_loss = 0
+     for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False):
+         optimizer.zero_grad()
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         parent_ids = batch['parent_ids'].to(device)
+         labels = batch['labels'].to(device)
+         outputs = model(input_ids, attention_mask, parent_ids)
+         loss = loss_fn(outputs, labels)
+         total_train_loss += loss.item()
+         loss.backward()
+         optimizer.step()
+         scheduler.step()
+         global_step += 1
+
+         train_losses.append(loss.item())
+
+         if LOG_EVERY_STEP:
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}")
+             update_loss_charts()
+
+         if global_step % VAL_EVERY_STEPS == 0:
+             model.eval()
+             total_val_loss = 0
+             with torch.no_grad():
+                 for val_batch in val_dataloader:
+                     input_ids = val_batch['input_ids'].to(device)
+                     attention_mask = val_batch['attention_mask'].to(device)
+                     parent_ids = val_batch['parent_ids'].to(device)
+                     labels = val_batch['labels'].to(device)
+                     outputs = model(input_ids, attention_mask, parent_ids)
+                     loss = loss_fn(outputs, labels)
+                     total_val_loss += loss.item()
+
+             avg_val_loss = total_val_loss / len(val_dataloader)
+             val_losses.append(avg_val_loss)
+             val_steps.append(global_step)
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}")
+             update_loss_charts()
+
+             if SAVE_CHECKPOINTS:
+                 checkpoint_dir = os.path.join(OUTPUT_DIR, f'level{LEVEL}_step{global_step}')
+                 os.makedirs(checkpoint_dir, exist_ok=True)
+                 torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'model.safetensors'))
+                 tokenizer.save_pretrained(checkpoint_dir)
+                 status_text.text(f"Checkpoint saved at step {global_step}")
+
+             if avg_val_loss < best_val_loss:
+                 best_val_loss = avg_val_loss
+                 early_stopping_counter = 0
+             else:
+                 early_stopping_counter += 1
+                 if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
+                     status_text.text(f"Early stopping triggered at step {global_step}")
+                     progress_bar.progress(100)
+                     # Save final model before stopping
+                     torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+                     tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+                     exit()  # Stop training
+         progress_bar.progress(int((global_step / total_steps) * 100))
+
+     avg_train_loss = total_train_loss / len(train_dataloader)
+     print(f'Epoch {epoch+1}/{EPOCHS} Average Training Loss: {avg_train_loss:.4f}')
+
+ # Save final model
+ torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+ tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+ status_text.success("Training complete!")
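
train_2.py (and the deeper levels) save the final weights with torch.save() to level{N}/model.safetensors, which is a regular pickled state dict despite the extension, while the tokenizer goes to level{N}/model. A minimal restore-and-predict sketch for Level 2 follows, assuming the TaxonomyClassifier definition above is importable; predicted_parent_id is a placeholder for the Level 1 prediction, not a value from the scripts.

    import numpy as np
    import torch
    from transformers import AlbertTokenizer

    parent_label_to_index = np.load('level1/label_map.npy', allow_pickle=True).item()
    label_to_index = np.load('level2/label_map.npy', allow_pickle=True).item()
    index_to_label = {i: label for label, i in label_to_index.items()}

    model = TaxonomyClassifier('albert/albert-base-v2', len(parent_label_to_index), len(label_to_index))
    # torch.save() wrote a pickled state dict, so torch.load() (not the safetensors loader) reads it back.
    model.load_state_dict(torch.load('level2/model.safetensors', map_location='cpu'))
    model.eval()

    # Parent conditioning mirrors TaxonomyDataset: a one-hot vector over the Level 1 label space.
    predicted_parent_id = 1  # placeholder: the Level 1 category ID predicted for this text
    parent_one_hot = torch.zeros(1, len(parent_label_to_index))
    parent_index = parent_label_to_index.get(predicted_parent_id)
    if parent_index is not None:
        parent_one_hot[0, parent_index] = 1

    tokenizer = AlbertTokenizer.from_pretrained('level2/model')
    encoding = tokenizer('example page text', truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        logits = model(encoding['input_ids'], encoding['attention_mask'], parent_one_hot)
    print(index_to_label[int(logits.argmax(dim=-1))])
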
training/train_3.py ADDED
@@ -0,0 +1,232 @@
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import AlbertTokenizer, AlbertModel, AdamW, get_linear_schedule_with_warmup
+ from sklearn.model_selection import train_test_split
+ import numpy as np
+ import os
+ from tqdm.auto import tqdm
+ import streamlit as st
+ import matplotlib.pyplot as plt
+ import torch.nn as nn
+
+ # Constants
+ EPOCHS = 10
+ VAL_SPLIT = 0.1
+ VAL_EVERY_STEPS = 1000
+ BATCH_SIZE = 38
+ LEARNING_RATE = 5e-5
+ LOG_EVERY_STEP = True
+ SAVE_CHECKPOINTS = True
+ MAX_SEQ_LENGTH = 512
+ EARLY_STOPPING_PATIENCE = 3
+ MODEL_NAME = 'albert/albert-base-v2'
+ LEVEL = 3
+ OUTPUT_DIR = f'level{LEVEL}'
+
+ # Ensure output directory exists
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # Load data
+ df = pd.read_csv(f'level_{LEVEL}.csv')
+ df.rename(columns={'response': 'text'}, inplace=True)
+
+ # Get unique labels for current level and create mapping
+ labels = sorted(df[str(LEVEL)].unique())
+ label_to_index = {label: i for i, label in enumerate(labels)}
+ index_to_label = {i: label for label, i in label_to_index.items()}
+ num_labels = len(labels)
+
+ # Save label mapping for current level
+ np.save(os.path.join(OUTPUT_DIR, 'label_map.npy'), label_to_index)
+
+ # Load parent level ID mapping
+ parent_level = LEVEL - 1
+ parent_label_to_index = np.load(f'level{parent_level}/label_map.npy', allow_pickle=True).item()
+ num_parent_labels = len(parent_label_to_index)
+
+ # Prepare data for training
+ df['label'] = df[str(LEVEL)].map(label_to_index)
+ train_df, val_df = train_test_split(df, test_size=VAL_SPLIT, random_state=42)
+
+ # Tokenizer
+ tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
+
+ class TaxonomyDataset(Dataset):
+     def __init__(self, dataframe, tokenizer, max_len, parent_label_to_index):
+         self.data = dataframe
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+         self.parent_label_to_index = parent_label_to_index
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         text = str(self.data.iloc[index].text)
+         label = int(self.data.iloc[index].label)
+         parent_id = int(self.data.iloc[index][str(LEVEL - 1)])
+
+         encoding = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt'
+         )
+
+         # One-hot encode parent ID
+         parent_one_hot = torch.zeros(len(self.parent_label_to_index))
+         if parent_id != 0:
+             parent_index = self.parent_label_to_index.get(parent_id)
+             if parent_index is not None:
+                 parent_one_hot[parent_index] = 1
+
+         return {
+             'input_ids': encoding['input_ids'].flatten(),
+             'attention_mask': encoding['attention_mask'].flatten(),
+             'parent_ids': parent_one_hot,
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
+
+ # Create datasets and dataloaders
+ train_dataset = TaxonomyDataset(train_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+ val_dataset = TaxonomyDataset(val_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+
+ train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+ val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+
+ # Model Definition
+ class TaxonomyClassifier(nn.Module):
+     def __init__(self, base_model_name, num_parent_labels, num_labels):
+         super().__init__()
+         self.albert = AlbertModel.from_pretrained(base_model_name)
+         self.dropout = nn.Dropout(0.1)
+         self.classifier = nn.Linear(self.albert.config.hidden_size + num_parent_labels, num_labels)
+
+     def forward(self, input_ids, attention_mask, parent_ids):
+         outputs = self.albert(input_ids, attention_mask=attention_mask)
+         pooled_output = outputs.pooler_output
+         pooled_output = self.dropout(pooled_output)
+         combined_features = torch.cat((pooled_output, parent_ids), dim=1)
+         logits = self.classifier(combined_features)
+         return logits
+
+ # Model Initialization
+ model = TaxonomyClassifier(MODEL_NAME, num_parent_labels, num_labels)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Optimizer and scheduler
+ optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
+ total_steps = len(train_dataloader) * EPOCHS
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
+
+ # Loss Function
+ loss_fn = nn.CrossEntropyLoss()
+
+ # Loss tracking
+ train_losses = []
+ val_losses = []
+ val_steps = []
+ best_val_loss = float('inf')
+ early_stopping_counter = 0
+ global_step = 0
+
+ # Streamlit setup
+ st.title(f'Level {LEVEL} Model Training')
+ progress_bar = st.progress(0)
+ status_text = st.empty()
+ train_loss_fig, train_loss_ax = plt.subplots()
+ val_loss_fig, val_loss_ax = plt.subplots()
+ train_loss_chart = st.pyplot(train_loss_fig)
+ val_loss_chart = st.pyplot(val_loss_fig)
+
+ def update_loss_charts():
+     train_loss_ax.clear()
+     train_loss_ax.plot(range(len(train_losses)), train_losses)
+     train_loss_ax.set_xlabel("Steps")
+     train_loss_ax.set_ylabel("Loss")
+     train_loss_ax.set_title("Training Loss")
+     train_loss_chart.pyplot(train_loss_fig)
+
+     val_loss_ax.clear()
+     val_loss_ax.plot(val_steps, val_losses)
+     val_loss_ax.set_xlabel("Steps")
+     val_loss_ax.set_ylabel("Loss")
+     val_loss_ax.set_title("Validation Loss")
+     val_loss_chart.pyplot(val_loss_fig)
+
+ # Training loop
+ for epoch in range(EPOCHS):
+     model.train()
+     total_train_loss = 0
+     for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False):
+         optimizer.zero_grad()
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         parent_ids = batch['parent_ids'].to(device)
+         labels = batch['labels'].to(device)
+         outputs = model(input_ids, attention_mask, parent_ids)
+         loss = loss_fn(outputs, labels)
+         total_train_loss += loss.item()
+         loss.backward()
+         optimizer.step()
+         scheduler.step()
+         global_step += 1
+
+         train_losses.append(loss.item())
+
+         if LOG_EVERY_STEP:
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}")
+             update_loss_charts()
+
+         if global_step % VAL_EVERY_STEPS == 0:
+             model.eval()
+             total_val_loss = 0
+             with torch.no_grad():
+                 for val_batch in val_dataloader:
+                     input_ids = val_batch['input_ids'].to(device)
+                     attention_mask = val_batch['attention_mask'].to(device)
+                     parent_ids = val_batch['parent_ids'].to(device)
+                     labels = val_batch['labels'].to(device)
+                     outputs = model(input_ids, attention_mask, parent_ids)
+                     loss = loss_fn(outputs, labels)
+                     total_val_loss += loss.item()
+
+             avg_val_loss = total_val_loss / len(val_dataloader)
+             val_losses.append(avg_val_loss)
+             val_steps.append(global_step)
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}")
+             update_loss_charts()
+
+             if SAVE_CHECKPOINTS:
+                 checkpoint_dir = os.path.join(OUTPUT_DIR, f'level{LEVEL}_step{global_step}')
+                 os.makedirs(checkpoint_dir, exist_ok=True)
+                 torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'model.safetensors'))
+                 tokenizer.save_pretrained(checkpoint_dir)
+                 status_text.text(f"Checkpoint saved at step {global_step}")
+
+             if avg_val_loss < best_val_loss:
+                 best_val_loss = avg_val_loss
+                 early_stopping_counter = 0
+             else:
+                 early_stopping_counter += 1
+                 if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
+                     status_text.text(f"Early stopping triggered at step {global_step}")
+                     progress_bar.progress(100)
+                     # Save final model before stopping
+                     torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+                     tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+                     exit()  # Stop training
+         progress_bar.progress(int((global_step / total_steps) * 100))
+
+     avg_train_loss = total_train_loss / len(train_dataloader)
+     print(f'Epoch {epoch+1}/{EPOCHS} Average Training Loss: {avg_train_loss:.4f}')
+
+ # Save final model
+ torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+ tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+ status_text.success("Training complete!")
training/train_4.py ADDED
@@ -0,0 +1,232 @@
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import AlbertTokenizer, AlbertModel, AdamW, get_linear_schedule_with_warmup
+ from sklearn.model_selection import train_test_split
+ import numpy as np
+ import os
+ from tqdm.auto import tqdm
+ import streamlit as st
+ import matplotlib.pyplot as plt
+ import torch.nn as nn
+
+ # Constants
+ EPOCHS = 10
+ VAL_SPLIT = 0.1
+ VAL_EVERY_STEPS = 1000
+ BATCH_SIZE = 38
+ LEARNING_RATE = 5e-5
+ LOG_EVERY_STEP = True
+ SAVE_CHECKPOINTS = True
+ MAX_SEQ_LENGTH = 512
+ EARLY_STOPPING_PATIENCE = 3
+ MODEL_NAME = 'albert/albert-base-v2'
+ LEVEL = 4
+ OUTPUT_DIR = f'level{LEVEL}'
+
+ # Ensure output directory exists
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # Load data
+ df = pd.read_csv(f'level_{LEVEL}.csv')
+ df.rename(columns={'response': 'text'}, inplace=True)
+
+ # Get unique labels for current level and create mapping
+ labels = sorted(df[str(LEVEL)].unique())
+ label_to_index = {label: i for i, label in enumerate(labels)}
+ index_to_label = {i: label for label, i in label_to_index.items()}
+ num_labels = len(labels)
+
+ # Save label mapping for current level
+ np.save(os.path.join(OUTPUT_DIR, 'label_map.npy'), label_to_index)
+
+ # Load parent level ID mapping
+ parent_level = LEVEL - 1
+ parent_label_to_index = np.load(f'level{parent_level}/label_map.npy', allow_pickle=True).item()
+ num_parent_labels = len(parent_label_to_index)
+
+ # Prepare data for training
+ df['label'] = df[str(LEVEL)].map(label_to_index)
+ train_df, val_df = train_test_split(df, test_size=VAL_SPLIT, random_state=42)
+
+ # Tokenizer
+ tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
+
+ class TaxonomyDataset(Dataset):
+     def __init__(self, dataframe, tokenizer, max_len, parent_label_to_index):
+         self.data = dataframe
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+         self.parent_label_to_index = parent_label_to_index
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         text = str(self.data.iloc[index].text)
+         label = int(self.data.iloc[index].label)
+         parent_id = int(self.data.iloc[index][str(LEVEL - 1)])
+
+         encoding = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt'
+         )
+
+         # One-hot encode parent ID
+         parent_one_hot = torch.zeros(len(self.parent_label_to_index))
+         if parent_id != 0:
+             parent_index = self.parent_label_to_index.get(parent_id)
+             if parent_index is not None:
+                 parent_one_hot[parent_index] = 1
+
+         return {
+             'input_ids': encoding['input_ids'].flatten(),
+             'attention_mask': encoding['attention_mask'].flatten(),
+             'parent_ids': parent_one_hot,
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
+
+ # Create datasets and dataloaders
+ train_dataset = TaxonomyDataset(train_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+ val_dataset = TaxonomyDataset(val_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+
+ train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+ val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+
+ # Model Definition
+ class TaxonomyClassifier(nn.Module):
+     def __init__(self, base_model_name, num_parent_labels, num_labels):
+         super().__init__()
+         self.albert = AlbertModel.from_pretrained(base_model_name)
+         self.dropout = nn.Dropout(0.1)
+         self.classifier = nn.Linear(self.albert.config.hidden_size + num_parent_labels, num_labels)
+
+     def forward(self, input_ids, attention_mask, parent_ids):
+         outputs = self.albert(input_ids, attention_mask=attention_mask)
+         pooled_output = outputs.pooler_output
+         pooled_output = self.dropout(pooled_output)
+         combined_features = torch.cat((pooled_output, parent_ids), dim=1)
+         logits = self.classifier(combined_features)
+         return logits
+
+ # Model Initialization
+ model = TaxonomyClassifier(MODEL_NAME, num_parent_labels, num_labels)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Optimizer and scheduler
+ optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
+ total_steps = len(train_dataloader) * EPOCHS
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
+
+ # Loss Function
+ loss_fn = nn.CrossEntropyLoss()
+
+ # Loss tracking
+ train_losses = []
+ val_losses = []
+ val_steps = []
+ best_val_loss = float('inf')
+ early_stopping_counter = 0
+ global_step = 0
+
+ # Streamlit setup
+ st.title(f'Level {LEVEL} Model Training')
+ progress_bar = st.progress(0)
+ status_text = st.empty()
+ train_loss_fig, train_loss_ax = plt.subplots()
+ val_loss_fig, val_loss_ax = plt.subplots()
+ train_loss_chart = st.pyplot(train_loss_fig)
+ val_loss_chart = st.pyplot(val_loss_fig)
+
+ def update_loss_charts():
+     train_loss_ax.clear()
+     train_loss_ax.plot(range(len(train_losses)), train_losses)
+     train_loss_ax.set_xlabel("Steps")
+     train_loss_ax.set_ylabel("Loss")
+     train_loss_ax.set_title("Training Loss")
+     train_loss_chart.pyplot(train_loss_fig)
+
+     val_loss_ax.clear()
+     val_loss_ax.plot(val_steps, val_losses)
+     val_loss_ax.set_xlabel("Steps")
+     val_loss_ax.set_ylabel("Loss")
+     val_loss_ax.set_title("Validation Loss")
+     val_loss_chart.pyplot(val_loss_fig)
+
+ # Training loop
+ for epoch in range(EPOCHS):
+     model.train()
+     total_train_loss = 0
+     for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False):
+         optimizer.zero_grad()
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         parent_ids = batch['parent_ids'].to(device)
+         labels = batch['labels'].to(device)
+         outputs = model(input_ids, attention_mask, parent_ids)
+         loss = loss_fn(outputs, labels)
+         total_train_loss += loss.item()
+         loss.backward()
+         optimizer.step()
+         scheduler.step()
+         global_step += 1
+
+         train_losses.append(loss.item())
+
+         if LOG_EVERY_STEP:
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}")
+             update_loss_charts()
+
+         if global_step % VAL_EVERY_STEPS == 0:
+             model.eval()
+             total_val_loss = 0
+             with torch.no_grad():
+                 for val_batch in val_dataloader:
+                     input_ids = val_batch['input_ids'].to(device)
+                     attention_mask = val_batch['attention_mask'].to(device)
+                     parent_ids = val_batch['parent_ids'].to(device)
+                     labels = val_batch['labels'].to(device)
+                     outputs = model(input_ids, attention_mask, parent_ids)
+                     loss = loss_fn(outputs, labels)
+                     total_val_loss += loss.item()
+
+             avg_val_loss = total_val_loss / len(val_dataloader)
+             val_losses.append(avg_val_loss)
+             val_steps.append(global_step)
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}")
+             update_loss_charts()
+
+             if SAVE_CHECKPOINTS:
+                 checkpoint_dir = os.path.join(OUTPUT_DIR, f'level{LEVEL}_step{global_step}')
+                 os.makedirs(checkpoint_dir, exist_ok=True)
+                 torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'model.safetensors'))
+                 tokenizer.save_pretrained(checkpoint_dir)
+                 status_text.text(f"Checkpoint saved at step {global_step}")
+
+             if avg_val_loss < best_val_loss:
+                 best_val_loss = avg_val_loss
+                 early_stopping_counter = 0
+             else:
+                 early_stopping_counter += 1
+                 if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
+                     status_text.text(f"Early stopping triggered at step {global_step}")
+                     progress_bar.progress(100)
+                     # Save final model before stopping
+                     torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+                     tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+                     exit()  # Stop training
+         progress_bar.progress(int((global_step / total_steps) * 100))
+
+     avg_train_loss = total_train_loss / len(train_dataloader)
+     print(f'Epoch {epoch+1}/{EPOCHS} Average Training Loss: {avg_train_loss:.4f}')
+
+ # Save final model
+ torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+ tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+ status_text.success("Training complete!")
training/train_5.py ADDED
@@ -0,0 +1,232 @@
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import AlbertTokenizer, AlbertModel, AdamW, get_linear_schedule_with_warmup
+ from sklearn.model_selection import train_test_split
+ import numpy as np
+ import os
+ from tqdm.auto import tqdm
+ import streamlit as st
+ import matplotlib.pyplot as plt
+ import torch.nn as nn
+
+ # Constants
+ EPOCHS = 10
+ VAL_SPLIT = 0.1
+ VAL_EVERY_STEPS = 1000
+ BATCH_SIZE = 38
+ LEARNING_RATE = 5e-5
+ LOG_EVERY_STEP = True
+ SAVE_CHECKPOINTS = True
+ MAX_SEQ_LENGTH = 512
+ EARLY_STOPPING_PATIENCE = 3
+ MODEL_NAME = 'albert/albert-base-v2'
+ LEVEL = 5
+ OUTPUT_DIR = f'level{LEVEL}'
+
+ # Ensure output directory exists
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # Load data
+ df = pd.read_csv(f'level_{LEVEL}.csv')
+ df.rename(columns={'response': 'text'}, inplace=True)
+
+ # Get unique labels for current level and create mapping
+ labels = sorted(df[str(LEVEL)].unique())
+ label_to_index = {label: i for i, label in enumerate(labels)}
+ index_to_label = {i: label for label, i in label_to_index.items()}
+ num_labels = len(labels)
+
+ # Save label mapping for current level
+ np.save(os.path.join(OUTPUT_DIR, 'label_map.npy'), label_to_index)
+
+ # Load parent level ID mapping
+ parent_level = LEVEL - 1
+ parent_label_to_index = np.load(f'level{parent_level}/label_map.npy', allow_pickle=True).item()
+ num_parent_labels = len(parent_label_to_index)
+
+ # Prepare data for training
+ df['label'] = df[str(LEVEL)].map(label_to_index)
+ train_df, val_df = train_test_split(df, test_size=VAL_SPLIT, random_state=42)
+
+ # Tokenizer
+ tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
+
+ class TaxonomyDataset(Dataset):
+     def __init__(self, dataframe, tokenizer, max_len, parent_label_to_index):
+         self.data = dataframe
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+         self.parent_label_to_index = parent_label_to_index
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         text = str(self.data.iloc[index].text)
+         label = int(self.data.iloc[index].label)
+         parent_id = int(self.data.iloc[index][str(LEVEL - 1)])
+
+         encoding = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt'
+         )
+
+         # One-hot encode parent ID
+         parent_one_hot = torch.zeros(len(self.parent_label_to_index))
+         if parent_id != 0:
+             parent_index = self.parent_label_to_index.get(parent_id)
+             if parent_index is not None:
+                 parent_one_hot[parent_index] = 1
+
+         return {
+             'input_ids': encoding['input_ids'].flatten(),
+             'attention_mask': encoding['attention_mask'].flatten(),
+             'parent_ids': parent_one_hot,
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
+
+ # Create datasets and dataloaders
+ train_dataset = TaxonomyDataset(train_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+ val_dataset = TaxonomyDataset(val_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+
+ train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+ val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+
+ # Model Definition
+ class TaxonomyClassifier(nn.Module):
+     def __init__(self, base_model_name, num_parent_labels, num_labels):
+         super().__init__()
+         self.albert = AlbertModel.from_pretrained(base_model_name)
+         self.dropout = nn.Dropout(0.1)
+         self.classifier = nn.Linear(self.albert.config.hidden_size + num_parent_labels, num_labels)
+
+     def forward(self, input_ids, attention_mask, parent_ids):
+         outputs = self.albert(input_ids, attention_mask=attention_mask)
+         pooled_output = outputs.pooler_output
+         pooled_output = self.dropout(pooled_output)
+         combined_features = torch.cat((pooled_output, parent_ids), dim=1)
+         logits = self.classifier(combined_features)
+         return logits
+
+ # Model Initialization
+ model = TaxonomyClassifier(MODEL_NAME, num_parent_labels, num_labels)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Optimizer and scheduler
+ optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
+ total_steps = len(train_dataloader) * EPOCHS
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
+
+ # Loss Function
+ loss_fn = nn.CrossEntropyLoss()
+
+ # Loss tracking
+ train_losses = []
+ val_losses = []
+ val_steps = []
+ best_val_loss = float('inf')
+ early_stopping_counter = 0
+ global_step = 0
+
+ # Streamlit setup
+ st.title(f'Level {LEVEL} Model Training')
+ progress_bar = st.progress(0)
+ status_text = st.empty()
+ train_loss_fig, train_loss_ax = plt.subplots()
+ val_loss_fig, val_loss_ax = plt.subplots()
+ train_loss_chart = st.pyplot(train_loss_fig)
+ val_loss_chart = st.pyplot(val_loss_fig)
+
+ def update_loss_charts():
+     train_loss_ax.clear()
+     train_loss_ax.plot(range(len(train_losses)), train_losses)
+     train_loss_ax.set_xlabel("Steps")
+     train_loss_ax.set_ylabel("Loss")
+     train_loss_ax.set_title("Training Loss")
+     train_loss_chart.pyplot(train_loss_fig)
+
+     val_loss_ax.clear()
+     val_loss_ax.plot(val_steps, val_losses)
+     val_loss_ax.set_xlabel("Steps")
+     val_loss_ax.set_ylabel("Loss")
+     val_loss_ax.set_title("Validation Loss")
+     val_loss_chart.pyplot(val_loss_fig)
+
+ # Training loop
+ for epoch in range(EPOCHS):
+     model.train()
+     total_train_loss = 0
+     for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False):
+         optimizer.zero_grad()
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         parent_ids = batch['parent_ids'].to(device)
+         labels = batch['labels'].to(device)
+         outputs = model(input_ids, attention_mask, parent_ids)
+         loss = loss_fn(outputs, labels)
+         total_train_loss += loss.item()
+         loss.backward()
+         optimizer.step()
+         scheduler.step()
+         global_step += 1
+
+         train_losses.append(loss.item())
+
+         if LOG_EVERY_STEP:
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}")
+             update_loss_charts()
+
+         if global_step % VAL_EVERY_STEPS == 0:
+             model.eval()
+             total_val_loss = 0
+             with torch.no_grad():
+                 for val_batch in val_dataloader:
+                     input_ids = val_batch['input_ids'].to(device)
+                     attention_mask = val_batch['attention_mask'].to(device)
+                     parent_ids = val_batch['parent_ids'].to(device)
+                     labels = val_batch['labels'].to(device)
+                     outputs = model(input_ids, attention_mask, parent_ids)
+                     loss = loss_fn(outputs, labels)
+                     total_val_loss += loss.item()
+
+             avg_val_loss = total_val_loss / len(val_dataloader)
+             val_losses.append(avg_val_loss)
+             val_steps.append(global_step)
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}")
+             update_loss_charts()
+
+             if SAVE_CHECKPOINTS:
+                 checkpoint_dir = os.path.join(OUTPUT_DIR, f'level{LEVEL}_step{global_step}')
+                 os.makedirs(checkpoint_dir, exist_ok=True)
+                 torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'model.safetensors'))
+                 tokenizer.save_pretrained(checkpoint_dir)
+                 status_text.text(f"Checkpoint saved at step {global_step}")
+
+             if avg_val_loss < best_val_loss:
+                 best_val_loss = avg_val_loss
+                 early_stopping_counter = 0
+             else:
+                 early_stopping_counter += 1
+                 if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
+                     status_text.text(f"Early stopping triggered at step {global_step}")
+                     progress_bar.progress(100)
+                     # Save final model before stopping
+                     torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+                     tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+                     exit()  # Stop training
+         progress_bar.progress(int((global_step / total_steps) * 100))
+
+     avg_train_loss = total_train_loss / len(train_dataloader)
+     print(f'Epoch {epoch+1}/{EPOCHS} Average Training Loss: {avg_train_loss:.4f}')
+
+ # Save final model
+ torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+ tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+ status_text.success("Training complete!")
training/train_6.py ADDED
@@ -0,0 +1,232 @@
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import AlbertTokenizer, AlbertModel, AdamW, get_linear_schedule_with_warmup
+ from sklearn.model_selection import train_test_split
+ import numpy as np
+ import os
+ from tqdm.auto import tqdm
+ import streamlit as st
+ import matplotlib.pyplot as plt
+ import torch.nn as nn
+
+ # Constants
+ EPOCHS = 10
+ VAL_SPLIT = 0.1
+ VAL_EVERY_STEPS = 1000
+ BATCH_SIZE = 38
+ LEARNING_RATE = 5e-5
+ LOG_EVERY_STEP = True
+ SAVE_CHECKPOINTS = True
+ MAX_SEQ_LENGTH = 512
+ EARLY_STOPPING_PATIENCE = 3
+ MODEL_NAME = 'albert/albert-base-v2'
+ LEVEL = 6
+ OUTPUT_DIR = f'level{LEVEL}'
+
+ # Ensure output directory exists
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # Load data
+ df = pd.read_csv(f'level_{LEVEL}.csv')
+ df.rename(columns={'response': 'text'}, inplace=True)
+
+ # Get unique labels for current level and create mapping
+ labels = sorted(df[str(LEVEL)].unique())
+ label_to_index = {label: i for i, label in enumerate(labels)}
+ index_to_label = {i: label for label, i in label_to_index.items()}
+ num_labels = len(labels)
+
+ # Save label mapping for current level
+ np.save(os.path.join(OUTPUT_DIR, 'label_map.npy'), label_to_index)
+
+ # Load parent level ID mapping
+ parent_level = LEVEL - 1
+ parent_label_to_index = np.load(f'level{parent_level}/label_map.npy', allow_pickle=True).item()
+ num_parent_labels = len(parent_label_to_index)
+
+ # Prepare data for training
+ df['label'] = df[str(LEVEL)].map(label_to_index)
+ train_df, val_df = train_test_split(df, test_size=VAL_SPLIT, random_state=42)
+
+ # Tokenizer
+ tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
+
+ class TaxonomyDataset(Dataset):
+     def __init__(self, dataframe, tokenizer, max_len, parent_label_to_index):
+         self.data = dataframe
+         self.tokenizer = tokenizer
+         self.max_len = max_len
+         self.parent_label_to_index = parent_label_to_index
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, index):
+         text = str(self.data.iloc[index].text)
+         label = int(self.data.iloc[index].label)
+         parent_id = int(self.data.iloc[index][str(LEVEL - 1)])
+
+         encoding = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             padding='max_length',
+             truncation=True,
+             return_attention_mask=True,
+             return_tensors='pt'
+         )
+
+         # One-hot encode parent ID
+         parent_one_hot = torch.zeros(len(self.parent_label_to_index))
+         if parent_id != 0:
+             parent_index = self.parent_label_to_index.get(parent_id)
+             if parent_index is not None:
+                 parent_one_hot[parent_index] = 1
+
+         return {
+             'input_ids': encoding['input_ids'].flatten(),
+             'attention_mask': encoding['attention_mask'].flatten(),
+             'parent_ids': parent_one_hot,
+             'labels': torch.tensor(label, dtype=torch.long)
+         }
+
+ # Create datasets and dataloaders
+ train_dataset = TaxonomyDataset(train_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+ val_dataset = TaxonomyDataset(val_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
+
+ train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+ val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
+
+ # Model Definition
+ class TaxonomyClassifier(nn.Module):
+     def __init__(self, base_model_name, num_parent_labels, num_labels):
+         super().__init__()
+         self.albert = AlbertModel.from_pretrained(base_model_name)
+         self.dropout = nn.Dropout(0.1)
+         self.classifier = nn.Linear(self.albert.config.hidden_size + num_parent_labels, num_labels)
+
+     def forward(self, input_ids, attention_mask, parent_ids):
+         outputs = self.albert(input_ids, attention_mask=attention_mask)
+         pooled_output = outputs.pooler_output
+         pooled_output = self.dropout(pooled_output)
+         combined_features = torch.cat((pooled_output, parent_ids), dim=1)
+         logits = self.classifier(combined_features)
+         return logits
+
+ # Model Initialization
+ model = TaxonomyClassifier(MODEL_NAME, num_parent_labels, num_labels)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Optimizer and scheduler
+ optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
+ total_steps = len(train_dataloader) * EPOCHS
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
+
+ # Loss Function
+ loss_fn = nn.CrossEntropyLoss()
+
+ # Loss tracking
+ train_losses = []
+ val_losses = []
+ val_steps = []
+ best_val_loss = float('inf')
+ early_stopping_counter = 0
+ global_step = 0
+
+ # Streamlit setup
+ st.title(f'Level {LEVEL} Model Training')
+ progress_bar = st.progress(0)
+ status_text = st.empty()
+ train_loss_fig, train_loss_ax = plt.subplots()
+ val_loss_fig, val_loss_ax = plt.subplots()
+ train_loss_chart = st.pyplot(train_loss_fig)
+ val_loss_chart = st.pyplot(val_loss_fig)
+
+ def update_loss_charts():
+     train_loss_ax.clear()
+     train_loss_ax.plot(range(len(train_losses)), train_losses)
+     train_loss_ax.set_xlabel("Steps")
+     train_loss_ax.set_ylabel("Loss")
+     train_loss_ax.set_title("Training Loss")
+     train_loss_chart.pyplot(train_loss_fig)
+
+     val_loss_ax.clear()
+     val_loss_ax.plot(val_steps, val_losses)
+     val_loss_ax.set_xlabel("Steps")
+     val_loss_ax.set_ylabel("Loss")
+     val_loss_ax.set_title("Validation Loss")
+     val_loss_chart.pyplot(val_loss_fig)
+
+ # Training loop
+ for epoch in range(EPOCHS):
+     model.train()
+     total_train_loss = 0
+     for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False):
+         optimizer.zero_grad()
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         parent_ids = batch['parent_ids'].to(device)
+         labels = batch['labels'].to(device)
+         outputs = model(input_ids, attention_mask, parent_ids)
+         loss = loss_fn(outputs, labels)
+         total_train_loss += loss.item()
+         loss.backward()
+         optimizer.step()
+         scheduler.step()
+         global_step += 1
+
+         train_losses.append(loss.item())
+
+         if LOG_EVERY_STEP:
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}")
+             update_loss_charts()
+
+         if global_step % VAL_EVERY_STEPS == 0:
+             model.eval()
+             total_val_loss = 0
+             with torch.no_grad():
+                 for val_batch in val_dataloader:
+                     input_ids = val_batch['input_ids'].to(device)
+                     attention_mask = val_batch['attention_mask'].to(device)
+                     parent_ids = val_batch['parent_ids'].to(device)
+                     labels = val_batch['labels'].to(device)
+                     outputs = model(input_ids, attention_mask, parent_ids)
+                     loss = loss_fn(outputs, labels)
+                     total_val_loss += loss.item()
+
+             avg_val_loss = total_val_loss / len(val_dataloader)
+             val_losses.append(avg_val_loss)
+             val_steps.append(global_step)
+             status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}")
+             update_loss_charts()
+
+             if SAVE_CHECKPOINTS:
+                 checkpoint_dir = os.path.join(OUTPUT_DIR, f'level{LEVEL}_step{global_step}')
+                 os.makedirs(checkpoint_dir, exist_ok=True)
+                 torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'model.safetensors'))
+                 tokenizer.save_pretrained(checkpoint_dir)
+                 status_text.text(f"Checkpoint saved at step {global_step}")
+
+             if avg_val_loss < best_val_loss:
+                 best_val_loss = avg_val_loss
+                 early_stopping_counter = 0
+             else:
+                 early_stopping_counter += 1
+                 if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
+                     status_text.text(f"Early stopping triggered at step {global_step}")
+                     progress_bar.progress(100)
+                     # Save final model before stopping
+                     torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+                     tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+                     exit()  # Stop training
+         progress_bar.progress(int((global_step / total_steps) * 100))
+
+     avg_train_loss = total_train_loss / len(train_dataloader)
+     print(f'Epoch {epoch+1}/{EPOCHS} Average Training Loss: {avg_train_loss:.4f}')
+
+ # Save final model
+ torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
+ tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
+ status_text.success("Training complete!")
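
Taken together, the scripts imply a cascade at inference time: the Level 1 model classifies the raw text, and each deeper level consumes the previous level's prediction as its parent one-hot. A sketch of that flow, where predict_level_1 and predict_child are hypothetical wrappers around the per-level loading code sketched earlier (they are not defined in this upload):

    def classify_taxonomy(text, max_level=7):
        # Level 1: plain sequence-classification head (train_1.py model).
        path = [predict_level_1(text)]
        # Levels 2..max_level: parent-conditioned TaxonomyClassifier (train_2.py onwards),
        # each fed the category ID predicted at the level above.
        for level in range(2, max_level + 1):
            path.append(predict_child(level, text, parent_id=path[-1]))
        return path
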
training/train_7.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ from torch.utils.data import Dataset, DataLoader
4
+ from transformers import AlbertTokenizer, AlbertModel, AdamW, get_linear_schedule_with_warmup
5
+ from sklearn.model_selection import train_test_split
6
+ import numpy as np
7
+ import os
8
+ from tqdm.auto import tqdm
9
+ import streamlit as st
10
+ import matplotlib.pyplot as plt
11
+ import torch.nn as nn
12
+
13
+ # Constants
14
+ EPOCHS = 10
15
+ VAL_SPLIT = 0.1
16
+ VAL_EVERY_STEPS = 1000
17
+ BATCH_SIZE = 38
18
+ LEARNING_RATE = 5e-5
19
+ LOG_EVERY_STEP = True
20
+ SAVE_CHECKPOINTS = True
21
+ MAX_SEQ_LENGTH = 512
22
+ EARLY_STOPPING_PATIENCE = 3
23
+ MODEL_NAME = 'albert/albert-base-v2'
24
+ LEVEL = 7
25
+ OUTPUT_DIR = f'level{LEVEL}'
26
+
27
+ # Ensure output directory exists
28
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
29
+
30
+ # Load data
31
+ df = pd.read_csv(f'level_{LEVEL}.csv')
32
+ df.rename(columns={'response': 'text'}, inplace=True)
33
+
34
+ # Get unique labels for current level and create mapping
35
+ labels = sorted(df[str(LEVEL)].unique())
36
+ label_to_index = {label: i for i, label in enumerate(labels)}
37
+ index_to_label = {i: label for label, i in label_to_index.items()}
38
+ num_labels = len(labels)
39
+
40
+ # Save label mapping for current level
41
+ np.save(os.path.join(OUTPUT_DIR, 'label_map.npy'), label_to_index)
42
+
43
+ # Load parent level ID mapping
44
+ parent_level = LEVEL - 1
45
+ parent_label_to_index = np.load(f'level{parent_level}/label_map.npy', allow_pickle=True).item()
46
+ num_parent_labels = len(parent_label_to_index)
47
+
48
+ # Prepare data for training
49
+ df['label'] = df[str(LEVEL)].map(label_to_index)
50
+ train_df, val_df = train_test_split(df, test_size=VAL_SPLIT, random_state=42)
51
+
52
+ # Tokenizer
53
+ tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)
54
+
55
+ class TaxonomyDataset(Dataset):
56
+ def __init__(self, dataframe, tokenizer, max_len, parent_label_to_index):
57
+ self.data = dataframe
58
+ self.tokenizer = tokenizer
59
+ self.max_len = max_len
60
+ self.parent_label_to_index = parent_label_to_index
61
+
62
+ def __len__(self):
63
+ return len(self.data)
64
+
65
+ def __getitem__(self, index):
66
+ text = str(self.data.iloc[index].text)
67
+ label = int(self.data.iloc[index].label)
68
+ parent_id = int(self.data.iloc[index][str(LEVEL - 1)])
69
+
70
+ encoding = self.tokenizer.encode_plus(
71
+ text,
72
+ add_special_tokens=True,
73
+ max_length=self.max_len,
74
+ padding='max_length',
75
+ truncation=True,
76
+ return_attention_mask=True,
77
+ return_tensors='pt'
78
+ )
79
+
80
+ # One-hot encode parent ID
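+ # (a parent id of 0 is treated as "no parent"; ids missing from the parent label map fall back to an all-zeros vector)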
81
+ parent_one_hot = torch.zeros(len(self.parent_label_to_index))
82
+ if parent_id != 0:
83
+ parent_index = self.parent_label_to_index.get(parent_id)
84
+ if parent_index is not None:
85
+ parent_one_hot[parent_index] = 1
86
+
87
+ return {
88
+ 'input_ids': encoding['input_ids'].flatten(),
89
+ 'attention_mask': encoding['attention_mask'].flatten(),
90
+ 'parent_ids': parent_one_hot,
91
+ 'labels': torch.tensor(label, dtype=torch.long)
92
+ }
93
+
94
+ # Create datasets and dataloaders
95
+ train_dataset = TaxonomyDataset(train_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
96
+ val_dataset = TaxonomyDataset(val_df, tokenizer, MAX_SEQ_LENGTH, parent_label_to_index)
97
+
98
+ train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
99
+ val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
100
+
101
+ # Model Definition
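+ # ALBERT encoder whose pooled output is concatenated with the parent-level one-hot vector
+ # before the final linear layer, so the level-7 head can condition on the level-6 category.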
102
+ class TaxonomyClassifier(nn.Module):
103
+ def __init__(self, base_model_name, num_parent_labels, num_labels):
104
+ super().__init__()
105
+ self.albert = AlbertModel.from_pretrained(base_model_name)
106
+ self.dropout = nn.Dropout(0.1)
107
+ self.classifier = nn.Linear(self.albert.config.hidden_size + num_parent_labels, num_labels)
108
+
109
+ def forward(self, input_ids, attention_mask, parent_ids):
110
+ outputs = self.albert(input_ids, attention_mask=attention_mask)
111
+ pooled_output = outputs.pooler_output
112
+ pooled_output = self.dropout(pooled_output)
113
+ combined_features = torch.cat((pooled_output, parent_ids), dim=1)
114
+ logits = self.classifier(combined_features)
115
+ return logits
116
+
117
+ # Model Initialization
118
+ model = TaxonomyClassifier(MODEL_NAME, num_parent_labels, num_labels)
119
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
120
+ model.to(device)
121
+
122
+ # Optimizer and scheduler
123
+ optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
124
+ total_steps = len(train_dataloader) * EPOCHS
125
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
126
+
127
+ # Loss Function
128
+ loss_fn = nn.CrossEntropyLoss()
129
+
130
+ # Loss tracking
131
+ train_losses = []
132
+ val_losses = []
133
+ val_steps = []
134
+ best_val_loss = float('inf')
135
+ early_stopping_counter = 0
136
+ global_step = 0
137
+
138
+ # Streamlit setup
139
+ st.title(f'Level {LEVEL} Model Training')
140
+ progress_bar = st.progress(0)
141
+ status_text = st.empty()
142
+ train_loss_fig, train_loss_ax = plt.subplots()
143
+ val_loss_fig, val_loss_ax = plt.subplots()
144
+ train_loss_chart = st.pyplot(train_loss_fig)
145
+ val_loss_chart = st.pyplot(val_loss_fig)
146
+
147
+ def update_loss_charts():
148
+ train_loss_ax.clear()
149
+ train_loss_ax.plot(range(len(train_losses)), train_losses)
150
+ train_loss_ax.set_xlabel("Steps")
151
+ train_loss_ax.set_ylabel("Loss")
152
+ train_loss_ax.set_title("Training Loss")
153
+ train_loss_chart.pyplot(train_loss_fig)
154
+
155
+ val_loss_ax.clear()
156
+ val_loss_ax.plot(val_steps, val_losses)
157
+ val_loss_ax.set_xlabel("Steps")
158
+ val_loss_ax.set_ylabel("Loss")
159
+ val_loss_ax.set_title("Validation Loss")
160
+ val_loss_chart.pyplot(val_loss_fig)
161
+
162
+ # Training loop
163
+ for epoch in range(EPOCHS):
164
+ model.train()
165
+ total_train_loss = 0
166
+ for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False):
167
+ optimizer.zero_grad()
168
+ input_ids = batch['input_ids'].to(device)
169
+ attention_mask = batch['attention_mask'].to(device)
170
+ parent_ids = batch['parent_ids'].to(device)
171
+ labels = batch['labels'].to(device)
172
+ outputs = model(input_ids, attention_mask, parent_ids)
173
+ loss = loss_fn(outputs, labels)
174
+ total_train_loss += loss.item()
175
+ loss.backward()
176
+ optimizer.step()
177
+ scheduler.step()
178
+ global_step += 1
179
+
180
+ train_losses.append(loss.item())
181
+
182
+ if LOG_EVERY_STEP:
183
+ status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}")
184
+ update_loss_charts()
185
+
186
+ if global_step % VAL_EVERY_STEPS == 0:
187
+ model.eval()
188
+ total_val_loss = 0
189
+ with torch.no_grad():
190
+ for val_batch in val_dataloader:
191
+ input_ids = val_batch['input_ids'].to(device)
192
+ attention_mask = val_batch['attention_mask'].to(device)
193
+ parent_ids = val_batch['parent_ids'].to(device)
194
+ labels = val_batch['labels'].to(device)
195
+ outputs = model(input_ids, attention_mask, parent_ids)
196
+ loss = loss_fn(outputs, labels)
197
+ total_val_loss += loss.item()
198
+
199
+ avg_val_loss = total_val_loss / len(val_dataloader)
200
+ val_losses.append(avg_val_loss)
201
+ val_steps.append(global_step)
202
+ status_text.text(f"Epoch {epoch+1}/{EPOCHS}, Step {global_step}, Training Loss: {loss.item():.4f}, Validation Loss: {avg_val_loss:.4f}")
203
+ update_loss_charts()
204
+
205
+ if SAVE_CHECKPOINTS:
206
+ checkpoint_dir = os.path.join(OUTPUT_DIR, f'level{LEVEL}_step{global_step}')
207
+ os.makedirs(checkpoint_dir, exist_ok=True)
208
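+ # NOTE: as in the other level scripts, torch.save stores a PyTorch pickle; the .safetensors name does not make it safetensors-formatted.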
+ torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'model.safetensors'))
209
+ tokenizer.save_pretrained(checkpoint_dir)
210
+ status_text.text(f"Checkpoint saved at step {global_step}")
211
+
212
+ if avg_val_loss < best_val_loss:
213
+ best_val_loss = avg_val_loss
214
+ early_stopping_counter = 0
215
+ else:
216
+ early_stopping_counter += 1
217
+ if early_stopping_counter >= EARLY_STOPPING_PATIENCE:
218
+ status_text.text(f"Early stopping triggered at step {global_step}")
219
+ progress_bar.progress(100)
220
+ # Save final model before stopping
221
+ torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
222
+ tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
223
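+ # NOTE: st.stop() is the idiomatic way to end a Streamlit script run; exit() raises SystemExit.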
+ exit() # Stop training
224
+ model.train()  # return to training mode after the validation pass so dropout is re-enabled for subsequent steps
+ progress_bar.progress(int((global_step / total_steps) * 100))
225
+
226
+ avg_train_loss = total_train_loss / len(train_dataloader)
227
+ print(f'Epoch {epoch+1}/{EPOCHS} Average Training Loss: {avg_train_loss:.4f}')
228
+
229
+ # Save final model
230
+ torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, 'model.safetensors'))
231
+ tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'model'))
232
+ status_text.success("Training complete!")
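+ # Sketch (not part of the training run): reloading this model later requires re-creating the
+ # architecture before loading the weights, e.g.:
+ #   model = TaxonomyClassifier(MODEL_NAME, num_parent_labels, num_labels)
+ #   model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, 'model.safetensors'), map_location='cpu'))
+ #   model.eval()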