yxyyeah committed
Commit edb2493 · verified · 1 Parent(s): 3478137

Upload 4 files

main.ipynb ADDED
@@ -0,0 +1,1453 @@
Cell 1 (code, execution_count 1):

from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
import os.path
import os
import pandas as pd
from PIL import Image
import torch
from transformers import ViTModel, ViTFeatureExtractor
import pickle

class MultilabelPredictor:
    """ Tabular Predictor for predicting multiple columns in a table.
    Creates multiple TabularPredictor objects which you can also use individually.
    You can access the TabularPredictor for a particular label via: `multilabel_predictor.get_predictor(label_i)`

    Parameters
    ----------
    labels : List[str]
        The ith element of this list is the column (i.e. `label`) predicted by the ith TabularPredictor stored in this object.
    path : str, default = None
        Path to directory where models and intermediate outputs should be saved.
        If unspecified, a time-stamped folder called "AutogluonModels/ag-[TIMESTAMP]" will be created in the working directory to store all models.
        Note: To call `fit()` twice and save all results of each fit, you must specify different `path` locations or don't specify `path` at all.
        Otherwise files from the first `fit()` will be overwritten by the second `fit()`.
        Caution: when predicting many labels, this directory may grow large as it needs to store many TabularPredictors.
    problem_types : List[str], default = None
        The ith element is the `problem_type` for the ith TabularPredictor stored in this object.
    eval_metrics : List[str], default = None
        The ith element is the `eval_metric` for the ith TabularPredictor stored in this object.
    consider_labels_correlation : bool, default = True
        Whether the predictions of multiple labels should account for label correlations or predict each label independently of the others.
        If True, the ordering of `labels` may affect resulting accuracy as each label is predicted conditional on the previous labels appearing earlier in this list (i.e. in an auto-regressive fashion).
        Set to False if during inference you may want to individually use just the ith TabularPredictor without predicting all the other labels.
    kwargs :
        Arguments passed into the initialization of each TabularPredictor.
    """

    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=True, **kwargs):
        if len(labels) < 2:
            raise ValueError("MultilabelPredictor is only intended for predicting MULTIPLE labels (columns); use TabularPredictor for predicting one label (column).")
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        if eval_metrics is None:
            self.eval_metrics = {}
        else:
            self.eval_metrics = {labels[i]: eval_metrics[i] for i in range(len(labels))}
        problem_type = None
        eval_metric = None
        for i in range(len(labels)):
            label = labels[i]
            path_i = os.path.join(self.path, "Predictor_" + str(label))
            if problem_types is not None:
                problem_type = problem_types[i]
            if eval_metrics is not None:
                eval_metric = eval_metrics[i]
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, **kwargs):
        """ Fits a separate TabularPredictor to predict each of the labels.

        Parameters
        ----------
        train_data, tuning_data : str or autogluon.tabular.TabularDataset or pd.DataFrame
            See documentation for `TabularPredictor.fit()`.
        kwargs :
            Arguments passed into the `fit()` call for each TabularPredictor.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if tuning_data is not None and isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        train_data_og = train_data.copy()
        if tuning_data is not None:
            tuning_data_og = tuning_data.copy()
        else:
            tuning_data_og = None
        save_metrics = len(self.eval_metrics) == 0
        for i in range(len(self.labels)):
            label = self.labels[i]
            predictor = self.get_predictor(label)
            if not self.consider_labels_correlation:
                labels_to_drop = [l for l in self.labels if l != label]
            else:
                labels_to_drop = [self.labels[j] for j in range(i + 1, len(self.labels))]
            train_data = train_data_og.drop(labels_to_drop, axis=1)
            if tuning_data is not None:
                tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)
            print(f"Fitting TabularPredictor for label: {label} ...")
            predictor.fit(train_data=train_data, tuning_data=tuning_data, **kwargs)
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """ Returns DataFrame with label columns containing predictions for each label.

        Parameters
        ----------
        data : str or autogluon.tabular.TabularDataset or pd.DataFrame
            Data to make predictions for. If label columns are present in this data, they will be ignored. See documentation for `TabularPredictor.predict()`.
        kwargs :
            Arguments passed into the `predict()` call for each TabularPredictor.
        """
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `predict_proba()` output for just that label.

        Parameters
        ----------
        data : str or autogluon.tabular.TabularDataset or pd.DataFrame
            Data to make predictions for. See documentation for `TabularPredictor.predict()` and `TabularPredictor.predict_proba()`.
        kwargs :
            Arguments passed into the `predict_proba()` call for each TabularPredictor (also passed into a `predict()` call).
        """
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `evaluate()` output for just that label.

        Parameters
        ----------
        data : str or autogluon.tabular.TabularDataset or pd.DataFrame
            Data to evaluate predictions of all labels for; must contain all labels as columns. See documentation for `TabularPredictor.evaluate()`.
        kwargs :
            Arguments passed into the `evaluate()` call for each TabularPredictor (also passed into the `predict()` call).
        """
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            eval_dict[label] = predictor.evaluate(data, **kwargs)
            if self.consider_labels_correlation:
                data[label] = predictor.predict(data, **kwargs)
        return eval_dict

    def save(self):
        """ Save MultilabelPredictor to disk. """
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=os.path.join(self.path, self.multi_predictor_file), object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """ Load MultilabelPredictor from the disk `path` previously specified when creating this MultilabelPredictor. """
        path = os.path.expanduser(path)
        return load_pkl.load(path=os.path.join(path, cls.multi_predictor_file))

    def get_predictor(self, label):
        """ Returns the TabularPredictor which is used to predict this label. """
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        if isinstance(data, str):
            return TabularDataset(data)
        return data.copy()

    def _predict(self, data, as_proba=False, **kwargs):
        data = self._get_data(data)
        if as_proba:
            predproba_dict = {}
        for label in self.labels:
            print(f"Predicting with TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            if as_proba:
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
            data[label] = predictor.predict(data, **kwargs)
        if not as_proba:
            return data[self.labels]
        else:
            return predproba_dict

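Before the image-embedding helpers that complete this cell, a minimal usage sketch of the class may help. None of this sketch is in the committed notebook: the CSV names and `time_limit` are illustrative assumptions, while the label names and metric match the training log further down.

# Hypothetical usage sketch -- 'toy_train.csv' / 'toy_test.csv' and time_limit
# are assumptions, not files from this repository.
labels = ['X4_mean', 'X11_mean']                  # at least 2 labels required
multi = MultilabelPredictor(
    labels=labels,
    problem_types=['regression', 'regression'],
    eval_metrics=['root_mean_squared_error', 'root_mean_squared_error'],
)
multi.fit(TabularDataset('toy_train.csv'), time_limit=60)    # one TabularPredictor per label
print(multi.evaluate(TabularDataset('toy_test.csv')))        # dict: label -> metric dict
print(multi.predict(TabularDataset('toy_test.csv')).head())  # DataFrame, one column per label

With the default consider_labels_correlation=True, fit() keeps earlier labels in the feature set when fitting later ones, so predictions are produced auto-regressively in list order.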
def extract_image_embeddings_batch(image_paths):
    """Extract embeddings for a batch of images using a Vision Transformer."""
    images = []

    # Load and preprocess all images in the batch
    for image_path in image_paths:
        image = Image.open(image_path).convert("RGB")
        images.append(image)

    # Prepare inputs as a batch
    inputs = feature_extractor(images=images, return_tensors="pt", padding=True).to(device)

    # Get embeddings in a single forward pass
    with torch.no_grad():
        outputs = vit_model(**inputs)

    # Compute mean embeddings for each image in the batch
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

def preprocess_images(df, image_dir, image_column='id', batch_size=512):
    """Generate image embeddings for all rows in a DataFrame in batches."""
    embeddings = []
    n = len(df)

    for i in range(0, n, batch_size):
        # Get the current batch of image paths
        batch = df.iloc[i:i + batch_size]
        image_paths = [os.path.join(image_dir, f"{int(row[image_column])}.jpeg") for _, row in batch.iterrows()]
        # Extract embeddings for the batch
        batch_embeddings = extract_image_embeddings_batch(image_paths)
        embeddings.extend(batch_embeddings)

        print(f"Processed batch {i//batch_size + 1}/{(n + batch_size - 1)//batch_size}")
    # Convert to DataFrame
    return pd.DataFrame(embeddings, index=df.index)

Cell 1 stderr:

e:\plant\venv\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
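Note that both helpers reference three globals — `device`, `feature_extractor`, and `vit_model` — that this cell never defines, so the cell is not runnable on its own. A plausible setup is sketched below; the checkpoint name is a guess (the notebook never shows it), and any checkpoint compatible with ViTModel would do. For a ViT-Base checkpoint, the mean-pooled embedding is 768-dimensional, so preprocess_images would return a 768-column DataFrame.

# Assumed definitions for the globals used by extract_image_embeddings_batch;
# the checkpoint name is a guess -- the committed notebook never shows this setup.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k").to(device)
vit_model.eval()  # inference only; the helper already wraps the forward pass in torch.no_grad()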
Cell 2 (code; outputs below — stdout and the AutoGluon stderr log interleaved in emission order):

Extracting image embeddings for training data...
Combining ancillary data and image embeddings...

Verbosity: 2 (Standard Logging)
=================== System Info ===================
AutoGluon Version:  1.1.1
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          12
Memory Avail:       5.11 GB / 15.79 GB (32.4%)
Disk Space Avail:   79.69 GB / 150.79 GB (52.8%)
===================================================
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.

Training MultilabelPredictor...
Fitting TabularPredictor for label: X4_mean ...

Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 190.45 MB).
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "multilabel_predictor_source\Predictor_X4_mean"
Train Data Rows:    43363
Train Data Columns: 932
Label Column:       X4_mean
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory: 5219.75 MB
	Train Data (Original) Memory Usage: 181.30 MB (3.5% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators: Fitting AsTypeFeatureGenerator...
	Stage 2 Generators: Fitting FillNaFeatureGenerator...
	Stage 3 Generators: Fitting IdentityFeatureGenerator...
	Stage 4 Generators: Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators: Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 810 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]
		('int', [])   : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]
	Types of features in processed data: identical to the original (932 features in, 932 features out).
	5.1s = Fit runtime
	Train Data (Processed) Memory Usage: 181.30 MB (3.5% of available memory)
Data preprocessing and feature engineering runtime = 5.57s ...
AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500
User-specified model hyperparameters to be fit:
{
	'NN_TORCH': {},
	'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
	'FASTAI': {},
	'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
	'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
	'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 9 L1 models ... (all scores are validation -root_mean_squared_error)
Fitting model: KNeighborsUnif ...      -0.1421  (train 1.47s, val 2.66s)
Fitting model: KNeighborsDist ...      -0.1426  (train 1.45s, val 2.88s)
Fitting model: LightGBMXT ...          -0.1063  (train 863.4s, val 0.93s); valid rmse [1000] 0.10796, [2000] 0.107227, [3000] 0.106933, [4000] 0.106685, [5000] 0.106466, [6000] 0.106427, [7000] 0.106386, [8000] 0.106361, [9000] 0.106337, [10000] 0.106303
Fitting model: LightGBM ...            -0.1074  (train 1027.06s, val 0.83s); valid rmse [1000] 0.108342, [2000] 0.107862, [3000] 0.107599, [4000] 0.107513, [5000] 0.107464, [6000] 0.107424, [7000] 0.107404, [8000] 0.107379, [9000] 0.107371, [10000] 0.107365
Fitting model: RandomForestMSE ...     -0.112   (train 3077.41s, val 0.22s)
Fitting model: ExtraTreesMSE ...       -0.1119  (train 1255.77s, val 0.24s)
Fitting model: NeuralNetFastAI ...     -0.1104  (train 135.6s, val 0.28s; no improvement since epoch 2, early stopping)
Fitting model: NeuralNetTorch ...      -0.1095  (train 143.11s, val 0.32s)
Fitting model: LightGBMLarge ...       -0.1065  (train 2938.26s, val 1.38s); valid rmse [1000] 0.107068, [2000] 0.10661, [3000] 0.10653, [4000] 0.106503, [5000] 0.106497, then flat at 0.106495 through [10000]
Fitting model: WeightedEnsemble_L2 ... -0.1047  (train 0.03s); Ensemble Weights: {'LightGBMXT': 0.333, 'NeuralNetTorch': 0.238, 'LightGBMLarge': 0.238, 'NeuralNetFastAI': 0.095, 'KNeighborsDist': 0.048, 'LightGBM': 0.048}
AutoGluon training complete, total runtime = 9466.82s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 378.7 rows/s (2500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("multilabel_predictor_source\Predictor_X4_mean")
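Before the log continues with the remaining labels: the source of Cell 2 is not shown in this excerpt, but its printed output pins down the flow — embed the training images, join the embeddings to the ancillary tabular features, then fit the MultilabelPredictor. A hedged reconstruction follows; the file and directory names ('train.csv', 'train_images') are assumptions, while the save path and label names come from the log.

# Hedged reconstruction of Cell 2; 'train.csv' and 'train_images' are assumed names.
train = pd.read_csv('train.csv')

print("Extracting image embeddings for training data...")
image_embeddings = preprocess_images(train, 'train_images', image_column='id')

print("Combining ancillary data and image embeddings...")
train_full = pd.concat([train, image_embeddings.add_prefix('emb_')], axis=1)

labels = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean']
print("Training MultilabelPredictor...")
multi_predictor = MultilabelPredictor(labels=labels, path='multilabel_predictor_source')
multi_predictor.fit(train_full)  # no time_limit given, hence the long runtimes in the log

The growing column count per label in the log (932, 933, 934, ...) is consistent with the default `consider_labels_correlation=True`: each already-fitted label is kept as an input feature for the labels after it.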
Verbosity: 2 (Standard Logging); the System Info block, preset recommendations, time_limit warning, feature-generator stages, metric notice, holdout split, and hyperparameter dictionary repeat verbatim from the X4_mean run, up to resource numbers (Memory Avail: 5.24 GB (33.2%), Disk Space Avail: 77.84 GB, train_data 190.8 MB, Available Memory: 5340.17 MB, feature-generator fit 5.5s, preprocessing total 5.89s).

Fitting TabularPredictor for label: X11_mean ...

Beginning AutoGluon training ...
AutoGluon will save models to "multilabel_predictor_source\Predictor_X11_mean"
Train Data Rows: 43363 | Train Data Columns: 933 (811 float, 122 int) | Label Column: X11_mean | Problem Type: regression
Fitting 9 L1 models ...
Fitting model: KNeighborsUnif ...      -7.1893  (train 1.57s, val 2.38s)
Fitting model: KNeighborsDist ...      -7.2766  (train 1.58s, val 2.41s)
Fitting model: LightGBMXT ...          -5.2913  (train 831.77s, val 0.34s); valid rmse [1000] 5.34109, [2000] 5.3167, [3000] 5.29916, [4000] 5.29677, [5000] 5.29458, [6000] 5.29489, [7000] 5.29236, [8000] 5.29263, [9000] 5.29315
Fitting model: LightGBM ...            -5.2442  (train 1007.46s, val 0.8s); valid rmse [1000] 5.29744, [2000] 5.26782, [3000] 5.26091, [4000] 5.25295, [5000] 5.24923, [6000] 5.24709, [7000] 5.24592, [8000] 5.24511, [9000] 5.24443, [10000] 5.24422
Fitting model: RandomForestMSE ...     -5.466   (train 3405.54s, val 0.21s)
Fitting model: ExtraTreesMSE ...       -5.5053  (train 1100.81s, val 0.19s)
Fitting model: NeuralNetFastAI ...     -5.3575  (train 156.5s, val 0.26s; no improvement since epoch 8, early stopping)
Fitting model: NeuralNetTorch ...      -5.3648  (train 123.3s, val 0.3s)
Fitting model: LightGBMLarge ...       -5.2028  (train 2423.97s, val 1.28s); valid rmse [1000] 5.22467, [2000] 5.20862, [3000] 5.20477, [4000] 5.20326, [5000] 5.20295, [6000] 5.20281, [7000] 5.20276, then flat at 5.20275 through [10000]
Fitting model: WeightedEnsemble_L2 ... -5.0914  (train 0.02s); Ensemble Weights: {'LightGBMLarge': 0.417, 'NeuralNetFastAI': 0.375, 'LightGBM': 0.208}
AutoGluon training complete, total runtime = 9074.56s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1068.5 rows/s (2500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("multilabel_predictor_source\Predictor_X11_mean")
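As the logged weights make explicit, the WeightedEnsemble_L2 stage is simply a convex combination of the level-1 models' validation-tuned predictions. For X11_mean it computes

    y_hat = 0.417 * y_LightGBMLarge + 0.375 * y_NeuralNetFastAI + 0.208 * y_LightGBM

where the weights sum to 1.0; this improves validation RMSE to 5.0914 from 5.2028 for the best single model, LightGBMLarge.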
The same preamble repeats (Memory Avail: 7.64 GB (48.4%), Disk Space Avail: 75.99 GB, train_data 191.14 MB, Available Memory: 7901.67 MB, feature-generator fit 4.8s, preprocessing total 5.04s).

Fitting TabularPredictor for label: X18_mean ...

Beginning AutoGluon training ...
AutoGluon will save models to "multilabel_predictor_source\Predictor_X18_mean"
Train Data Rows: 43363 | Train Data Columns: 934 (812 float, 122 int) | Label Column: X18_mean | Problem Type: regression
Fitting 9 L1 models ...
Fitting model: KNeighborsUnif ...      -4.4719  (train 1.33s, val 2.34s)
Fitting model: KNeighborsDist ...      -4.4852  (train 1.35s, val 2.87s)
Fitting model: LightGBMXT ...          -2.7555  (train 722.76s, val 0.62s); valid rmse [1000] 2.7975, [2000] 2.77084, [3000] 2.76197, [4000] 2.76049, [5000] 2.75914, [6000] 2.75773, [7000] 2.75728, [8000] 2.75624, [9000] 2.75584, [10000] 2.75552
Fitting model: LightGBM ...            -2.7665  (train 455.92s, val 0.25s); valid rmse [1000] 2.79461, [2000] 2.77581, [3000] 2.76911, [4000] 2.76665, [5000] 2.76656
Fitting model: RandomForestMSE ...     -3.0041  (train 5707.16s, val 0.29s)
Fitting model: ExtraTreesMSE ...       -3.0281  (train 1414.74s, val 0.24s)
Fitting model: NeuralNetFastAI ...     -2.7646  (train 158.74s, val 0.24s)
Fitting model: NeuralNetTorch ...      -2.7368  (train 132.61s, val 0.27s)
Fitting model: LightGBMLarge ...       -2.7582  (train 2648.19s, val 1.43s); valid rmse [1000] 2.76306, [2000] 2.75877, [3000] 2.75837, [4000] 2.75822, [5000] 2.75819, [6000] 2.75819, then flat at 2.75818 through [10000]
Fitting model: WeightedEnsemble_L2 ... -2.6075  (train 0.03s); Ensemble Weights: {'NeuralNetTorch': 0.375, 'NeuralNetFastAI': 0.333, 'LightGBMLarge': 0.167, 'LightGBM': 0.125}
AutoGluon training complete, total runtime = 11264.22s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1140.4 rows/s (2500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("multilabel_predictor_source\Predictor_X18_mean")
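Each per-label predictor lands under multilabel_predictor_source\, and the wrapper pickles itself there as multilabel_predictor.pkl (see MultilabelPredictor.save() above), so inference in a later session would plausibly look like the sketch below. 'test.csv' and 'test_images' are assumed names; the load path comes from the log.

# Hedged inference sketch; 'test.csv' and 'test_images' are assumptions.
multi = MultilabelPredictor.load('multilabel_predictor_source')
test = pd.read_csv('test.csv')
test_embeddings = preprocess_images(test, 'test_images', image_column='id')
test_full = pd.concat([test, test_embeddings.add_prefix('emb_')], axis=1)
predictions = multi.predict(test_full)  # columns: X4_mean ... X50_mean, predicted in order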
+ "Verbosity: 2 (Standard Logging)\n",
821
+ "=================== System Info ===================\n",
822
+ "AutoGluon Version: 1.1.1\n",
823
+ "Python Version: 3.10.11\n",
824
+ "Operating System: Windows\n",
825
+ "Platform Machine: AMD64\n",
826
+ "Platform Version: 10.0.22631\n",
827
+ "CPU Count: 12\n",
828
+ "Memory Avail: 7.60 GB / 15.79 GB (48.1%)\n",
829
+ "Disk Space Avail: 74.16 GB / 150.79 GB (49.2%)\n",
830
+ "===================================================\n",
831
+ "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n",
832
+ "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n",
833
+ "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n",
834
+ "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n",
835
+ "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n",
836
+ "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n",
837
+ "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 191.49 MB).\n",
838
+ "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n",
839
+ "Beginning AutoGluon training ...\n",
840
+ "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X26_mean\"\n",
841
+ "Train Data Rows: 43363\n",
842
+ "Train Data Columns: 935\n",
843
+ "Label Column: X26_mean\n",
844
+ "Problem Type: regression\n",
845
+ "Preprocessing data ...\n",
846
+ "Using Feature Generators to preprocess the data ...\n"
847
+ ]
848
+ },
849
+ {
850
+ "name": "stdout",
851
+ "output_type": "stream",
852
+ "text": [
853
+ "Fitting TabularPredictor for label: X26_mean ...\n"
854
+ ]
855
+ },
856
+ {
857
+ "name": "stderr",
858
+ "output_type": "stream",
859
+ "text": [
860
+ "Fitting AutoMLPipelineFeatureGenerator...\n",
861
+ "\tAvailable Memory: 7763.00 MB\n",
862
+ "\tTrain Data (Original) Memory Usage: 182.29 MB (2.3% of available memory)\n",
863
+ "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n",
864
+ "\tStage 1 Generators:\n",
865
+ "\t\tFitting AsTypeFeatureGenerator...\n",
866
+ "\tStage 2 Generators:\n",
867
+ "\t\tFitting FillNaFeatureGenerator...\n",
868
+ "\tStage 3 Generators:\n",
869
+ "\t\tFitting IdentityFeatureGenerator...\n",
870
+ "\tStage 4 Generators:\n",
871
+ "\t\tFitting DropUniqueFeatureGenerator...\n",
872
+ "\tStage 5 Generators:\n",
873
+ "\t\tFitting DropDuplicatesFeatureGenerator...\n",
874
+ "\tTypes of features in original data (raw dtype, special dtypes):\n",
875
+ "\t\t('float', []) : 813 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n",
876
+ "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n",
877
+ "\tTypes of features in processed data (raw dtype, special dtypes):\n",
878
+ "\t\t('float', []) : 813 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n",
879
+ "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n",
880
+ "\t6.5s = Fit runtime\n",
881
+ "\t935 features in original data used to generate 935 features in processed data.\n",
882
+ "\tTrain Data (Processed) Memory Usage: 182.29 MB (2.4% of available memory)\n",
883
+ "Data preprocessing and feature engineering runtime = 6.81s ...\n",
884
+ "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n",
885
+ "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n",
886
+ "\tTo change this, specify the eval_metric parameter of Predictor()\n",
887
+ "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n",
888
+ "User-specified model hyperparameters to be fit:\n",
889
+ "{\n",
890
+ "\t'NN_TORCH': {},\n",
891
+ "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n",
892
+ "\t'FASTAI': {},\n",
893
+ "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n",
894
+ "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n",
895
+ "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n",
896
+ "}\n",
897
+ "Fitting 9 L1 models ...\n",
898
+ "Fitting model: KNeighborsUnif ...\n",
899
+ "\t-75.2345\t = Validation score (-root_mean_squared_error)\n",
900
+ "\t1.63s\t = Training runtime\n",
901
+ "\t2.42s\t = Validation runtime\n",
902
+ "Fitting model: KNeighborsDist ...\n",
903
+ "\t-77.2557\t = Validation score (-root_mean_squared_error)\n",
904
+ "\t1.57s\t = Training runtime\n",
905
+ "\t2.46s\t = Validation runtime\n",
906
+ "Fitting model: LightGBMXT ...\n",
907
+ "\t-56.0706\t = Validation score (-root_mean_squared_error)\n",
908
+ "\t45.17s\t = Training runtime\n",
909
+ "\t0.06s\t = Validation runtime\n",
910
+ "Fitting model: LightGBM ...\n",
911
+ "\t-54.6852\t = Validation score (-root_mean_squared_error)\n",
912
+ "\t41.69s\t = Training runtime\n",
913
+ "\t0.04s\t = Validation runtime\n",
914
+ "Fitting model: RandomForestMSE ...\n",
915
+ "\t-55.0949\t = Validation score (-root_mean_squared_error)\n",
916
+ "\t9653.14s\t = Training runtime\n",
917
+ "\t0.3s\t = Validation runtime\n",
+ "Fitting model: ExtraTreesMSE ...\n",
+ "\t-55.9584\t = Validation score (-root_mean_squared_error)\n",
+ "\t1874.15s\t = Training runtime\n",
+ "\t0.27s\t = Validation runtime\n",
+ "Fitting model: NeuralNetFastAI ...\n",
+ "\t-57.9006\t = Validation score (-root_mean_squared_error)\n",
+ "\t159.0s\t = Training runtime\n",
+ "\t0.22s\t = Validation runtime\n",
+ "Fitting model: NeuralNetTorch ...\n",
+ "\t-59.0582\t = Validation score (-root_mean_squared_error)\n",
+ "\t155.0s\t = Training runtime\n",
+ "\t0.27s\t = Validation runtime\n",
+ "Fitting model: LightGBMLarge ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1000]\tvalid_set's rmse: 53.3837\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\t-53.3795\t = Validation score (-root_mean_squared_error)\n",
+ "\t442.04s\t = Training runtime\n",
+ "\t0.13s\t = Validation runtime\n",
+ "Fitting model: WeightedEnsemble_L2 ...\n",
+ "\tEnsemble Weights: {'LightGBMLarge': 0.84, 'NeuralNetFastAI': 0.16}\n",
+ "\t-53.1964\t = Validation score (-root_mean_squared_error)\n",
+ "\t0.03s\t = Training runtime\n",
+ "\t0.0s\t = Validation runtime\n",
+ "AutoGluon training complete, total runtime = 12390.51s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 7137.6 rows/s (2500 batch size)\n",
+ "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X26_mean\")\n",
+ "Verbosity: 2 (Standard Logging)\n",
+ "=================== System Info ===================\n",
+ "AutoGluon Version: 1.1.1\n",
+ "Python Version: 3.10.11\n",
+ "Operating System: Windows\n",
+ "Platform Machine: AMD64\n",
+ "Platform Version: 10.0.22631\n",
+ "CPU Count: 12\n",
+ "Memory Avail: 7.35 GB / 15.79 GB (46.5%)\n",
+ "Disk Space Avail: 72.47 GB / 150.79 GB (48.1%)\n",
+ "===================================================\n",
+ "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n",
+ "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n",
+ "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n",
+ "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n",
+ "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n",
+ "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n",
+ "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 191.84 MB).\n",
+ "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n",
+ "Beginning AutoGluon training ...\n",
+ "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X50_mean\"\n",
+ "Train Data Rows: 43363\n",
+ "Train Data Columns: 936\n",
+ "Label Column: X50_mean\n",
+ "Problem Type: regression\n",
+ "Preprocessing data ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fitting TabularPredictor for label: X50_mean ...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using Feature Generators to preprocess the data ...\n",
+ "Fitting AutoMLPipelineFeatureGenerator...\n",
+ "\tAvailable Memory: 7495.31 MB\n",
+ "\tTrain Data (Original) Memory Usage: 182.62 MB (2.4% of available memory)\n",
+ "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n",
+ "\tStage 1 Generators:\n",
+ "\t\tFitting AsTypeFeatureGenerator...\n",
+ "\tStage 2 Generators:\n",
+ "\t\tFitting FillNaFeatureGenerator...\n",
+ "\tStage 3 Generators:\n",
+ "\t\tFitting IdentityFeatureGenerator...\n",
+ "\tStage 4 Generators:\n",
+ "\t\tFitting DropUniqueFeatureGenerator...\n",
+ "\tStage 5 Generators:\n",
+ "\t\tFitting DropDuplicatesFeatureGenerator...\n",
+ "\tTypes of features in original data (raw dtype, special dtypes):\n",
+ "\t\t('float', []) : 814 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n",
+ "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n",
+ "\tTypes of features in processed data (raw dtype, special dtypes):\n",
+ "\t\t('float', []) : 814 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n",
+ "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n",
+ "\t6.4s = Fit runtime\n",
+ "\t936 features in original data used to generate 936 features in processed data.\n",
+ "\tTrain Data (Processed) Memory Usage: 182.62 MB (2.4% of available memory)\n",
+ "Data preprocessing and feature engineering runtime = 6.79s ...\n",
+ "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n",
+ "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n",
+ "\tTo change this, specify the eval_metric parameter of Predictor()\n",
+ "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n",
+ "User-specified model hyperparameters to be fit:\n",
+ "{\n",
+ "\t'NN_TORCH': {},\n",
+ "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n",
+ "\t'FASTAI': {},\n",
+ "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n",
+ "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n",
+ "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n",
+ "}\n",
+ "Fitting 9 L1 models ...\n",
+ "Fitting model: KNeighborsUnif ...\n",
+ "\t-0.6334\t = Validation score (-root_mean_squared_error)\n",
+ "\t1.99s\t = Training runtime\n",
+ "\t2.73s\t = Validation runtime\n",
+ "Fitting model: KNeighborsDist ...\n",
+ "\t-0.6393\t = Validation score (-root_mean_squared_error)\n",
+ "\t1.95s\t = Training runtime\n",
+ "\t2.72s\t = Validation runtime\n",
+ "Fitting model: LightGBMXT ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1000]\tvalid_set's rmse: 0.361925\n",
+ "[2000]\tvalid_set's rmse: 0.357162\n",
+ "[3000]\tvalid_set's rmse: 0.355106\n",
+ "[4000]\tvalid_set's rmse: 0.353916\n",
+ "[5000]\tvalid_set's rmse: 0.353093\n",
+ "[6000]\tvalid_set's rmse: 0.352683\n",
+ "[7000]\tvalid_set's rmse: 0.352526\n",
+ "[8000]\tvalid_set's rmse: 0.352398\n",
+ "[9000]\tvalid_set's rmse: 0.352323\n",
+ "[10000]\tvalid_set's rmse: 0.352234\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\t-0.3522\t = Validation score (-root_mean_squared_error)\n",
+ "\t744.88s\t = Training runtime\n",
+ "\t0.8s\t = Validation runtime\n",
+ "Fitting model: LightGBM ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1000]\tvalid_set's rmse: 0.352549\n",
+ "[2000]\tvalid_set's rmse: 0.349969\n",
+ "[3000]\tvalid_set's rmse: 0.348952\n",
+ "[4000]\tvalid_set's rmse: 0.348591\n",
+ "[5000]\tvalid_set's rmse: 0.348339\n",
+ "[6000]\tvalid_set's rmse: 0.348147\n",
+ "[7000]\tvalid_set's rmse: 0.348034\n",
+ "[8000]\tvalid_set's rmse: 0.347988\n",
+ "[9000]\tvalid_set's rmse: 0.347937\n",
+ "[10000]\tvalid_set's rmse: 0.347919\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\t-0.3479\t = Validation score (-root_mean_squared_error)\n",
+ "\t921.95s\t = Training runtime\n",
+ "\t0.8s\t = Validation runtime\n",
+ "Fitting model: RandomForestMSE ...\n",
+ "\t-0.344\t = Validation score (-root_mean_squared_error)\n",
+ "\t3068.82s\t = Training runtime\n",
+ "\t0.21s\t = Validation runtime\n",
+ "Fitting model: ExtraTreesMSE ...\n",
+ "\t-0.3735\t = Validation score (-root_mean_squared_error)\n",
+ "\t1075.89s\t = Training runtime\n",
+ "\t0.21s\t = Validation runtime\n",
+ "Fitting model: NeuralNetFastAI ...\n",
+ "\t-0.397\t = Validation score (-root_mean_squared_error)\n",
+ "\t161.54s\t = Training runtime\n",
+ "\t0.25s\t = Validation runtime\n",
+ "Fitting model: NeuralNetTorch ...\n",
+ "\t-0.3914\t = Validation score (-root_mean_squared_error)\n",
+ "\t251.87s\t = Training runtime\n",
+ "\t0.53s\t = Validation runtime\n",
+ "Fitting model: LightGBMLarge ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1000]\tvalid_set's rmse: 0.330805\n",
+ "[2000]\tvalid_set's rmse: 0.329588\n",
+ "[3000]\tvalid_set's rmse: 0.329333\n",
+ "[4000]\tvalid_set's rmse: 0.329259\n",
+ "[5000]\tvalid_set's rmse: 0.329238\n",
+ "[6000]\tvalid_set's rmse: 0.329229\n",
+ "[7000]\tvalid_set's rmse: 0.329227\n",
+ "[8000]\tvalid_set's rmse: 0.329226\n",
+ "[9000]\tvalid_set's rmse: 0.329226\n",
+ "[10000]\tvalid_set's rmse: 0.329226\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\t-0.3292\t = Validation score (-root_mean_squared_error)\n",
+ "\t2505.43s\t = Training runtime\n",
+ "\t1.29s\t = Validation runtime\n",
+ "Fitting model: WeightedEnsemble_L2 ...\n",
+ "\tEnsemble Weights: {'LightGBMLarge': 0.857, 'NeuralNetFastAI': 0.095, 'RandomForestMSE': 0.048}\n",
+ "\t-0.3284\t = Validation score (-root_mean_squared_error)\n",
+ "\t0.02s\t = Training runtime\n",
+ "\t0.0s\t = Validation runtime\n",
+ "AutoGluon training complete, total runtime = 8758.55s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1436.0 rows/s (2500 batch size)\n",
+ "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X50_mean\")\n",
+ "Verbosity: 2 (Standard Logging)\n",
+ "=================== System Info ===================\n",
+ "AutoGluon Version: 1.1.1\n",
+ "Python Version: 3.10.11\n",
+ "Operating System: Windows\n",
+ "Platform Machine: AMD64\n",
+ "Platform Version: 10.0.22631\n",
+ "CPU Count: 12\n",
+ "Memory Avail: 6.87 GB / 15.79 GB (43.5%)\n",
+ "Disk Space Avail: 70.62 GB / 150.79 GB (46.8%)\n",
+ "===================================================\n",
+ "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n",
+ "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n",
+ "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n",
+ "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n",
+ "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n",
+ "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n",
+ "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 192.18 MB).\n",
+ "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n",
+ "Beginning AutoGluon training ...\n",
+ "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X3112_mean\"\n",
+ "Train Data Rows: 43363\n",
+ "Train Data Columns: 937\n",
+ "Label Column: X3112_mean\n",
+ "Problem Type: regression\n",
+ "Preprocessing data ...\n",
+ "Using Feature Generators to preprocess the data ...\n",
+ "Fitting AutoMLPipelineFeatureGenerator...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fitting TabularPredictor for label: X3112_mean ...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\tAvailable Memory: 7019.43 MB\n",
+ "\tTrain Data (Original) Memory Usage: 182.95 MB (2.6% of available memory)\n",
+ "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n",
+ "\tStage 1 Generators:\n",
+ "\t\tFitting AsTypeFeatureGenerator...\n",
+ "\tStage 2 Generators:\n",
+ "\t\tFitting FillNaFeatureGenerator...\n",
+ "\tStage 3 Generators:\n",
+ "\t\tFitting IdentityFeatureGenerator...\n",
+ "\tStage 4 Generators:\n",
+ "\t\tFitting DropUniqueFeatureGenerator...\n",
+ "\tStage 5 Generators:\n",
+ "\t\tFitting DropDuplicatesFeatureGenerator...\n",
+ "\tTypes of features in original data (raw dtype, special dtypes):\n",
+ "\t\t('float', []) : 815 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n",
+ "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n",
+ "\tTypes of features in processed data (raw dtype, special dtypes):\n",
+ "\t\t('float', []) : 815 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n",
+ "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n",
+ "\t5.0s = Fit runtime\n",
+ "\t937 features in original data used to generate 937 features in processed data.\n",
+ "\tTrain Data (Processed) Memory Usage: 182.95 MB (2.6% of available memory)\n",
+ "Data preprocessing and feature engineering runtime = 5.29s ...\n",
+ "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n",
+ "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n",
+ "\tTo change this, specify the eval_metric parameter of Predictor()\n",
+ "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n",
+ "User-specified model hyperparameters to be fit:\n",
+ "{\n",
+ "\t'NN_TORCH': {},\n",
+ "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n",
+ "\t'FASTAI': {},\n",
+ "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n",
+ "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n",
+ "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n",
+ "}\n",
+ "Fitting 9 L1 models ...\n",
+ "Fitting model: KNeighborsUnif ...\n",
+ "\t-2270.871\t = Validation score (-root_mean_squared_error)\n",
+ "\t1.37s\t = Training runtime\n",
+ "\t2.24s\t = Validation runtime\n",
+ "Fitting model: KNeighborsDist ...\n",
+ "\t-2230.0395\t = Validation score (-root_mean_squared_error)\n",
+ "\t1.34s\t = Training runtime\n",
+ "\t2.34s\t = Validation runtime\n",
+ "Fitting model: LightGBMXT ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1000]\tvalid_set's rmse: 1470.67\n",
+ "[2000]\tvalid_set's rmse: 1460.77\n",
+ "[3000]\tvalid_set's rmse: 1453.2\n",
+ "[4000]\tvalid_set's rmse: 1449.16\n",
+ "[5000]\tvalid_set's rmse: 1448\n",
+ "[6000]\tvalid_set's rmse: 1447.65\n",
+ "[7000]\tvalid_set's rmse: 1447.57\n",
+ "[8000]\tvalid_set's rmse: 1446.92\n",
+ "[9000]\tvalid_set's rmse: 1446.78\n",
+ "[10000]\tvalid_set's rmse: 1446.71\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\t-1446.6537\t = Validation score (-root_mean_squared_error)\n",
+ "\t680.41s\t = Training runtime\n",
+ "\t0.54s\t = Validation runtime\n",
+ "Fitting model: LightGBM ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1000]\tvalid_set's rmse: 1401.6\n",
+ "[2000]\tvalid_set's rmse: 1389.58\n",
+ "[3000]\tvalid_set's rmse: 1386.45\n",
+ "[4000]\tvalid_set's rmse: 1385.03\n",
+ "[5000]\tvalid_set's rmse: 1384.81\n",
+ "[6000]\tvalid_set's rmse: 1384.61\n",
+ "[7000]\tvalid_set's rmse: 1384.48\n",
+ "[8000]\tvalid_set's rmse: 1384.34\n",
+ "[9000]\tvalid_set's rmse: 1384.35\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\t-1384.3118\t = Validation score (-root_mean_squared_error)\n",
+ "\t820.56s\t = Training runtime\n",
+ "\t0.42s\t = Validation runtime\n",
+ "Fitting model: RandomForestMSE ...\n",
+ "\t-1349.2685\t = Validation score (-root_mean_squared_error)\n",
+ "\t4440.72s\t = Training runtime\n",
+ "\t0.21s\t = Validation runtime\n",
+ "Fitting model: ExtraTreesMSE ...\n",
+ "\t-1451.9243\t = Validation score (-root_mean_squared_error)\n",
+ "\t1308.72s\t = Training runtime\n",
+ "\t0.22s\t = Validation runtime\n",
+ "Fitting model: NeuralNetFastAI ...\n",
+ "\t-1514.4165\t = Validation score (-root_mean_squared_error)\n",
+ "\t158.34s\t = Training runtime\n",
+ "\t0.24s\t = Validation runtime\n",
+ "Fitting model: NeuralNetTorch ...\n",
+ "\t-1537.7455\t = Validation score (-root_mean_squared_error)\n",
+ "\t143.11s\t = Training runtime\n",
+ "\t0.53s\t = Validation runtime\n",
+ "Fitting model: LightGBMLarge ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1000]\tvalid_set's rmse: 1327.67\n",
+ "[2000]\tvalid_set's rmse: 1325.67\n",
+ "[3000]\tvalid_set's rmse: 1325.22\n",
+ "[4000]\tvalid_set's rmse: 1325.1\n",
+ "[5000]\tvalid_set's rmse: 1325.06\n",
+ "[6000]\tvalid_set's rmse: 1325.05\n",
+ "[7000]\tvalid_set's rmse: 1325.04\n",
+ "[8000]\tvalid_set's rmse: 1325.04\n",
+ "[9000]\tvalid_set's rmse: 1325.04\n",
+ "[10000]\tvalid_set's rmse: 1325.04\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\t-1325.0433\t = Validation score (-root_mean_squared_error)\n",
+ "\t2420.99s\t = Training runtime\n",
+ "\t1.04s\t = Validation runtime\n",
+ "Fitting model: WeightedEnsemble_L2 ...\n",
+ "\tEnsemble Weights: {'LightGBMLarge': 0.571, 'RandomForestMSE': 0.333, 'NeuralNetFastAI': 0.095}\n",
+ "\t-1313.9254\t = Validation score (-root_mean_squared_error)\n",
+ "\t0.03s\t = Training runtime\n",
+ "\t0.0s\t = Validation runtime\n",
+ "AutoGluon training complete, total runtime = 9995.55s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1683.5 rows/s (2500 batch size)\n",
+ "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X3112_mean\")\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('multilabel_predictor_source')\n"
+ ]
+ }
+ ],
+ "source": [
1338
+ "# Define paths\n",
1339
+ "train_csv_path = 'train.csv'\n",
1340
+ "train_image_dir = 'train_images'\n",
1341
+ "test_csv_path = 'test.csv'\n",
1342
+ "test_image_dir = 'test_images'\n",
1343
+ "output_path = 'prediction.csv'\n",
1344
+ "\n",
1345
+ "# Load train and test datasets\n",
1346
+ "train_df = pd.read_csv(train_csv_path)\n",
1347
+ "\n",
1348
+ "# Columns for ancillary data and target traits\n",
1349
+ "ancillary_columns = train_df.columns[:-6] # First 164 columns are ancillary data\n",
1350
+ "target_columns = train_df.columns[-6:] # Last 6 columns are target traits\n",
1351
+ "\n",
1352
+ "# Load Vision Transformer model and feature extractor\n",
1353
+ "# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
1354
+ "# vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k').to(device)\n",
1355
+ "# feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')\n",
1356
+ "\n",
+ "# Generate image embeddings for train and test datasets\n",
+ "print(\"Extracting image embeddings for training data...\")\n",
+ "# train_image_embeddings = preprocess_images(train_df, train_image_dir)\n",
+ "with open('train_image_embeddings.pkl', 'rb') as f:\n",
+ " train_image_embeddings = pickle.load(f)\n",
+ "\n",
+ "# Combine ancillary data and image embeddings\n",
+ "print(\"Combining ancillary data and image embeddings...\")\n",
+ "train_combined = pd.concat([train_df[ancillary_columns], train_image_embeddings, train_df[target_columns]], axis=1)\n",
+ "\n",
+ "# Initialize MultilabelPredictor\n",
+ "targets = list(target_columns)\n",
+ "problem_types = ['regression'] * len(targets)\n",
+ "eval_metrics = ['mean_absolute_percentage_error'] * len(targets)\n",
+ "hyperparameters = {\n",
+ "\t'NN_TORCH': {},\n",
+ "\t'GBM': ['GBMLarge'],\n",
+ "\t'FASTAI': {}\n",
+ "}\n",
+ "\n",
+ "multi_predictor = MultilabelPredictor(\n",
+ " labels=targets,\n",
+ " problem_types=problem_types,\n",
+ " # eval_metrics=eval_metrics,\n",
+ " path='multilabel_predictor_source'\n",
+ ")\n",
+ "\n",
+ "# Train MultilabelPredictor\n",
+ "print(\"Training MultilabelPredictor...\")\n",
+ "multi_predictor.fit(train_combined, hyperparameters=hyperparameters)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Extracting image embeddings for test data...\n",
+ "Making predictions on test data...\n",
+ "Predicting with TabularPredictor for label: X4_mean ...\n",
+ "Predicting with TabularPredictor for label: X11_mean ...\n",
+ "Predicting with TabularPredictor for label: X18_mean ...\n",
+ "Predicting with TabularPredictor for label: X26_mean ...\n",
+ "Predicting with TabularPredictor for label: X50_mean ...\n",
+ "Predicting with TabularPredictor for label: X3112_mean ...\n",
+ "Saving predictions to prediction.csv...\n",
+ "Predictions saved successfully!\n"
+ ]
+ }
+ ],
+ "source": [
+ "test_df = pd.read_csv(test_csv_path)\n",
+ "print(\"Extracting image embeddings for test data...\")\n",
+ "# test_image_embeddings = preprocess_images(test_df, test_image_dir)\n",
+ "with open('test_image_embeddings.pkl', 'rb') as f:\n",
+ " test_image_embeddings = pickle.load(f)\n",
+ "\n",
+ "test_combined = pd.concat([test_df[ancillary_columns], test_image_embeddings], axis=1)\n",
1419
+ "\n",
1420
+ "# Make predictions on test data\n",
1421
+ "print(\"Making predictions on test data...\")\n",
1422
+ "predictions = multi_predictor.predict(test_combined)\n",
+ "\n",
+ "# Save predictions to CSV\n",
+ "print(f\"Saving predictions to {output_path}...\")\n",
+ "predictions.insert(0, 'id', test_df['id'])\n",
+ "predictions.to_csv(output_path, index=False)\n",
+ "print(\"Predictions saved successfully!\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
test_image_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:374282b7d804cbc883af2699716a07dc8eda02ebf9a23a71069c7318a71dab86
+ size 19633747
train_image_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:056b88ebec27062a9bf88fd79d1a13738e5df95796201bc043095baa2728cfd8
+ size 133211731
train_test_data.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0976b8751e4bd592a26e3bd08fb52f4f743809f2bdac7732278af78e1efae32
+ size 301994252