naazahrani commited on
Commit
67393bf
·
verified ·
1 Parent(s): 71a4352

Delete evaluation

Browse files
evaluation/ar/acva_5_shot.json DELETED
@@ -1,119 +0,0 @@
1
- {
2
- "results": {
3
- "acva": {
4
- "alias": "acva",
5
- "acc,none": 0.7746268656716417,
6
- "acc_stderr,none": 0.004477269169728854,
7
- "acc_norm,none": 0.7632606199770379,
8
- "acc_norm_stderr,none": 0.004554991129754026
9
- }
10
- },
11
- "group_subtasks": {
12
- "acva": []
13
- },
14
- "configs": {
15
- "acva": {
16
- "task": "acva",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "FreedomIntelligence/ACVA-Arabic-Cultural-Value-Alignment",
21
- "dataset_kwargs": {
22
- "trust_remote_code": true
23
- },
24
- "test_split": "test",
25
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _format_subject(subject):\n \n arabic_words = subtasks_ar[subtasks.index(subject)]\n return arabic_words\n \n def _generate_subject(doc):\n subject = _format_subject(doc[\"id\"].split(\"-\")[0])\n\n return subject\n \n def _process_docs(doc):\n keys = [\"\u0635\u062d\",\n \"\u062e\u0637\u0623\"]\n subject = _generate_subject(doc)\n gold = keys.index(doc['answer'])\n out_doc = {\n \"id\": doc[\"id\"],\n \"query\": \"\\n\\n\\n\u0627\u0644\u0633\u0624\u0627\u0644:\" + doc[\"question\"]+\"\\n\u0625\u062c\u0627\u0628\u0629:'\",\n \"choices\": keys,\n \"gold\": gold,\n \"subject\": subject,\n }\n \n return out_doc\n\n return dataset.map(_process_docs)\n",
26
- "doc_to_text": "query",
27
- "doc_to_target": "gold",
28
- "doc_to_choice": "choices",
29
- "description": "\u0641\u064a\u0645\u0627 \u064a\u0644\u064a \u0639\u0628\u0627\u0631\u0627\u062a \u0625\u0645\u0627 \u0635\u062d\u064a\u062d\u0629 \u0623\u0648 \u062e\u0627\u0637\u0626\u0629 \u062d\u0648\u0644 {{subject}}\n \u0627\u0644\u0631\u062c\u0627\u0621 \u062a\u0635\u0646\u064a\u0641 \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u0625\u0644\u0649 '\u0635\u062d' \u0623\u0648 '\u062e\u0637\u0623' \u062f\u0648\u0646 \u0634\u0631\u062d",
30
- "target_delimiter": " ",
31
- "fewshot_delimiter": "\n\n",
32
- "num_fewshot": 5,
33
- "metric_list": [
34
- {
35
- "metric": "acc",
36
- "aggregation": "mean",
37
- "higher_is_better": true
38
- },
39
- {
40
- "metric": "acc_norm",
41
- "aggregation": "mean",
42
- "higher_is_better": true
43
- }
44
- ],
45
- "output_type": "multiple_choice",
46
- "repeats": 1,
47
- "should_decontaminate": false,
48
- "metadata": {
49
- "version": 0.0
50
- }
51
- }
52
- },
53
- "versions": {
54
- "acva": 0.0
55
- },
56
- "n-shot": {
57
- "acva": 5
58
- },
59
- "higher_is_better": {
60
- "acva": {
61
- "acc": true,
62
- "acc_norm": true
63
- }
64
- },
65
- "n-samples": {
66
- "acva": {
67
- "original": 8710,
68
- "effective": 8710
69
- }
70
- },
71
- "config": {
72
- "model": "vllm",
73
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
74
- "batch_size": 1,
75
- "batch_sizes": [],
76
- "device": null,
77
- "use_cache": null,
78
- "limit": null,
79
- "bootstrap_iters": 100000,
80
- "gen_kwargs": null,
81
- "random_seed": 0,
82
- "numpy_seed": 1234,
83
- "torch_seed": 1234,
84
- "fewshot_seed": 1234
85
- },
86
- "git_hash": "8e1bd48d",
87
- "date": 1735662713.7617116,
88
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
89
- "transformers_version": "4.47.1",
90
- "upper_git_hash": null,
91
- "tokenizer_pad_token": [
92
- "<unk>",
93
- "0"
94
- ],
95
- "tokenizer_eos_token": [
96
- "</s>",
97
- "2"
98
- ],
99
- "tokenizer_bos_token": [
100
- "<s>",
101
- "1"
102
- ],
103
- "eot_token_id": 2,
104
- "max_length": 4096,
105
- "task_hashes": {
106
- "acva": "d007c508f0accdd697f549d7cbe7f960f1470c8f86f1a0969355a6ef33108edb"
107
- },
108
- "model_source": "vllm",
109
- "model_name": "/ALLaM-7B-Instruct",
110
- "model_name_sanitized": "/ALLaM-7B-Instruct",
111
- "system_instruction": null,
112
- "system_instruction_sha": null,
113
- "fewshot_as_multiturn": false,
114
- "chat_template": null,
115
- "chat_template_sha": null,
116
- "start_time": 3374.021232778,
117
- "end_time": 3578.563943596,
118
- "total_evaluation_time_seconds": "204.54271081800016"
119
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/ar_ifeval_0_shot.json DELETED
@@ -1,142 +0,0 @@
1
- {
2
- "results": {
3
- "ar_ifeval": {
4
- "alias": "ar_ifeval",
5
- "prompt_level_strict_acc,none": 0.31343283582089554,
6
- "prompt_level_strict_acc_stderr,none": 0.020055655889994813,
7
- "inst_level_strict_acc,none": 0.6764505119453925,
8
- "inst_level_strict_acc_stderr,none": "N/A",
9
- "prompt_level_loose_acc,none": 0.3656716417910448,
10
- "prompt_level_loose_acc_stderr,none": 0.020822161638297296,
11
- "inst_level_loose_acc,none": 0.7051194539249147,
12
- "inst_level_loose_acc_stderr,none": "N/A"
13
- }
14
- },
15
- "group_subtasks": {
16
- "ar_ifeval": []
17
- },
18
- "configs": {
19
- "ar_ifeval": {
20
- "task": "ar_ifeval",
21
- "dataset_path": "lm_eval/tasks/ar_ifeval/ar_ifeval.py",
22
- "dataset_name": "ar_ifeval",
23
- "dataset_kwargs": {
24
- "trust_remote_code": true
25
- },
26
- "test_split": "test",
27
- "doc_to_text": "prompt",
28
- "doc_to_target": 0,
29
- "process_results": "def process_results(doc, results):\n\n response = results[0]\n out_strict = process_sample(doc, response, 'strict')\n out_loose = process_sample(doc, response, 'loose')\n\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
30
- "description": "",
31
- "target_delimiter": " ",
32
- "fewshot_delimiter": "\n\n",
33
- "num_fewshot": 0,
34
- "metric_list": [
35
- {
36
- "metric": "prompt_level_strict_acc",
37
- "aggregation": "mean",
38
- "higher_is_better": true
39
- },
40
- {
41
- "metric": "inst_level_strict_acc",
42
- "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
43
- "higher_is_better": true
44
- },
45
- {
46
- "metric": "prompt_level_loose_acc",
47
- "aggregation": "mean",
48
- "higher_is_better": true
49
- },
50
- {
51
- "metric": "inst_level_loose_acc",
52
- "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
53
- "higher_is_better": true
54
- }
55
- ],
56
- "output_type": "generate_until",
57
- "generation_kwargs": {
58
- "until": [],
59
- "do_sample": false,
60
- "temperature": 0.0,
61
- "max_gen_toks": 1280
62
- },
63
- "repeats": 1,
64
- "should_decontaminate": false,
65
- "metadata": {
66
- "version": 4.0
67
- }
68
- }
69
- },
70
- "versions": {
71
- "ar_ifeval": 4.0
72
- },
73
- "n-shot": {
74
- "ar_ifeval": 0
75
- },
76
- "higher_is_better": {
77
- "ar_ifeval": {
78
- "prompt_level_strict_acc": true,
79
- "inst_level_strict_acc": true,
80
- "prompt_level_loose_acc": true,
81
- "inst_level_loose_acc": true
82
- }
83
- },
84
- "n-samples": {
85
- "ar_ifeval": {
86
- "original": 536,
87
- "effective": 536
88
- }
89
- },
90
- "config": {
91
- "model": "hf",
92
- "model_args": "pretrained=/ALLaM-7B-Instruct,trust_remote_code=True,cache_dir=/tmp,parallelize=False",
93
- "model_num_parameters": 7000559616,
94
- "model_dtype": "torch.bfloat16",
95
- "model_revision": "main",
96
- "model_sha": "",
97
- "batch_size": 1,
98
- "batch_sizes": [],
99
- "device": null,
100
- "use_cache": null,
101
- "limit": null,
102
- "bootstrap_iters": 100000,
103
- "gen_kwargs": null,
104
- "random_seed": 0,
105
- "numpy_seed": 1234,
106
- "torch_seed": 1234,
107
- "fewshot_seed": 1234
108
- },
109
- "git_hash": "b955b2950",
110
- "date": 1739618378.981141,
111
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.88\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
112
- "transformers_version": "4.48.3",
113
- "upper_git_hash": null,
114
- "tokenizer_pad_token": [
115
- "<unk>",
116
- "0"
117
- ],
118
- "tokenizer_eos_token": [
119
- "</s>",
120
- "2"
121
- ],
122
- "tokenizer_bos_token": [
123
- "<s>",
124
- "1"
125
- ],
126
- "eot_token_id": 2,
127
- "max_length": 4096,
128
- "task_hashes": {
129
- "ar_ifeval": "d0db7903ef270d7dc54efe4e7713be0de9864fc3a36c901c6e5777a6a5f69aa9"
130
- },
131
- "model_source": "hf",
132
- "model_name": "/ALLaM-7B-Instruct",
133
- "model_name_sanitized": "/ALLaM-7B-Instruct",
134
- "system_instruction": null,
135
- "system_instruction_sha": null,
136
- "fewshot_as_multiturn": false,
137
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + ' [INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
138
- "chat_template_sha": "f1dff938141b507da4a409b6bb3431382088a97a963acd246a41f2f344ae831f",
139
- "start_time": 1393068.333905473,
140
- "end_time": 1397143.169266589,
141
- "total_evaluation_time_seconds": "4074.8353611161"
142
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/araMath_5_shot.json DELETED
@@ -1,126 +0,0 @@
1
- {
2
- "results": {
3
- "araMath": {
4
- "alias": "araMath",
5
- "acc,none": 0.6677685950413224,
6
- "acc_stderr,none": 0.019165266705090528,
7
- "acc_norm,none": 0.6677685950413224,
8
- "acc_norm_stderr,none": 0.019165266705090528
9
- }
10
- },
11
- "group_subtasks": {
12
- "araMath": []
13
- },
14
- "configs": {
15
- "araMath": {
16
- "task": "araMath",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/araMath/araMath.py",
21
- "dataset_name": "araMath",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "validation_split": "validation",
26
- "test_split": "test",
27
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc):\n def remove_prefix(choice):\n prefixes = [\"(A)\", \"(B)\", \"(C)\", \"(D)\"]\n for prefix in prefixes:\n if choice.startswith(prefix + \" \"):\n return choice[len(prefix) + 1:] \n return choice \n\n def format_example(doc, keys):\n question = doc[\"question\"].strip()\n choices = \"\".join(\n [f\"{key}. {remove_prefix(choice)}\\n\" for key, choice in zip(keys, doc[\"options\"])]\n )\n\n prompt = f\"\\n\\n\u0627\u0644\u0633\u0624\u0627\u0644: {question}\\n{choices}\\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n keys_en = [\"A\", \"B\", \"C\", \"D\"]\n out_doc = {\n \"query\": format_example(doc, keys_en),\n \"choices\": keys_en,\n \"gold\": keys_en.index(doc[\"label\"]),\n }\n return out_doc\n \n return dataset.map(_process_docs)\n",
28
- "doc_to_text": "query",
29
- "doc_to_target": "gold",
30
- "doc_to_choice": "{{choices}}",
31
- "description": "\u0645\u0646 \u0641\u0636\u0644\u0643 \u0627\u062e\u062a\u0631 \u0625\u062c\u0627\u0628\u0629 \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 \u0628\u064a\u0646 'A\u060c B\u060c C\u060c D' \u062f\u0648\u0646 \u0634\u0631\u062d",
32
- "target_delimiter": " ",
33
- "fewshot_delimiter": "\n\n",
34
- "num_fewshot": 5,
35
- "metric_list": [
36
- {
37
- "metric": "acc",
38
- "aggregation": "mean",
39
- "higher_is_better": true
40
- },
41
- {
42
- "metric": "acc_norm",
43
- "aggregation": "mean",
44
- "higher_is_better": true
45
- }
46
- ],
47
- "output_type": "multiple_choice",
48
- "repeats": 1,
49
- "should_decontaminate": true,
50
- "doc_to_decontamination_query": "query",
51
- "metadata": {
52
- "version": 0.0
53
- }
54
- }
55
- },
56
- "versions": {
57
- "araMath": 0.0
58
- },
59
- "n-shot": {
60
- "araMath": 5
61
- },
62
- "higher_is_better": {
63
- "araMath": {
64
- "acc": true,
65
- "acc_norm": true
66
- }
67
- },
68
- "n-samples": {
69
- "araMath": {
70
- "original": 605,
71
- "effective": 605
72
- }
73
- },
74
- "config": {
75
- "model": "hf",
76
- "model_args": "pretrained=/ALLaM-7B-Instruct,trust_remote_code=True,cache_dir=/tmp,parallelize=False",
77
- "model_num_parameters": 7000559616,
78
- "model_dtype": "torch.bfloat16",
79
- "model_revision": "main",
80
- "model_sha": "",
81
- "batch_size": 1,
82
- "batch_sizes": [],
83
- "device": null,
84
- "use_cache": null,
85
- "limit": null,
86
- "bootstrap_iters": 100000,
87
- "gen_kwargs": null,
88
- "random_seed": 0,
89
- "numpy_seed": 1234,
90
- "torch_seed": 1234,
91
- "fewshot_seed": 1234
92
- },
93
- "git_hash": "b955b2950",
94
- "date": 1739618269.6292942,
95
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.88\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
96
- "transformers_version": "4.48.3",
97
- "upper_git_hash": null,
98
- "tokenizer_pad_token": [
99
- "<unk>",
100
- "0"
101
- ],
102
- "tokenizer_eos_token": [
103
- "</s>",
104
- "2"
105
- ],
106
- "tokenizer_bos_token": [
107
- "<s>",
108
- "1"
109
- ],
110
- "eot_token_id": 2,
111
- "max_length": 4096,
112
- "task_hashes": {
113
- "araMath": "e7f60b63c44ee90c76a61f37207fa1f812622b6662200911fcfd7dabe78ada66"
114
- },
115
- "model_source": "hf",
116
- "model_name": "/ALLaM-7B-Instruct",
117
- "model_name_sanitized": "/ALLaM-7B-Instruct",
118
- "system_instruction": null,
119
- "system_instruction_sha": null,
120
- "fewshot_as_multiturn": false,
121
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + ' [INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
122
- "chat_template_sha": "f1dff938141b507da4a409b6bb3431382088a97a963acd246a41f2f344ae831f",
123
- "start_time": 1392959.193182268,
124
- "end_time": 1393012.133225703,
125
- "total_evaluation_time_seconds": "52.940043434966356"
126
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/araPro_0_shot.json DELETED
@@ -1,130 +0,0 @@
1
- {
2
- "results": {
3
- "araPro": {
4
- "alias": "araPro",
5
- "acc,none": 0.6970605878824235,
6
- "acc_stderr,none": 0.006498724870364006,
7
- "acc_norm,none": 0.6970605878824235,
8
- "acc_norm_stderr,none": 0.006498724870364006
9
- }
10
- },
11
- "group_subtasks": {
12
- "araPro": []
13
- },
14
- "configs": {
15
- "araPro": {
16
- "task": "araPro",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/araPro/araPro.py",
21
- "dataset_name": "araPro",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "validation_split": "validation",
26
- "test_split": "test",
27
- "fewshot_split": "validation",
28
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc): \n def remove_prefix(choice):\n return choice.replace('.', '') if '.' in choice[:2] else choice\n \n def format_example(doc, keys):\n question = doc[\"question\"].strip()\n \n choice_num = ['choice1', 'choice2', 'choice3', 'choice4']\n choices = \"\".join(\n [f\"{key}. {remove_prefix(doc[choice_num[index]])}\\n\" for index, key in enumerate(keys)]\n )\n\n prompt = f\"\\n\\n\\n\u0627\u0644\u0633\u0624\u0627\u0644: {question}\\n{choices} \\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n #keys = [\"1\", \"2\", \"3\", \"4\"]\n keys = [\"A\", \"B\", \"C\", \"D\"]\n out_doc = {\n \"query\": format_example(doc, keys), \n \"choices\": keys,\n \"gold\": doc[\"answer\"]-1,\n } \n\n return out_doc\n \n return dataset.map(_process_docs)\n",
29
- "doc_to_text": "query",
30
- "doc_to_target": "gold",
31
- "doc_to_choice": "{{choices}}",
32
- "description": "\u0641\u064a\u0645\u0627 \u064a\u0644\u064a \u0623\u0633\u0626\u0644\u0629 \u0627\u0644\u0627\u062e\u062a\u064a\u0627\u0631 \u0645\u0646 \u0645\u062a\u0639\u062f\u062f (\u0645\u0639 \u0627\u0644\u0625\u062c\u0627\u0628\u0627\u062a) \u0645\u0646 \u0641\u0636\u0644\u0643 \u0627\u062e\u062a\u0631 \u0625\u062c\u0627\u0628\u0629 \u0648\u0627\u062d\u062f\u0629 \u062f\u0648\u0646 \u0634\u0631\u062d",
33
- "target_delimiter": " ",
34
- "fewshot_delimiter": "\n\n",
35
- "fewshot_config": {
36
- "sampler": "balanced_cat"
37
- },
38
- "num_fewshot": 0,
39
- "metric_list": [
40
- {
41
- "metric": "acc",
42
- "aggregation": "mean",
43
- "higher_is_better": true
44
- },
45
- {
46
- "metric": "acc_norm",
47
- "aggregation": "mean",
48
- "higher_is_better": true
49
- }
50
- ],
51
- "output_type": "multiple_choice",
52
- "repeats": 1,
53
- "should_decontaminate": true,
54
- "doc_to_decontamination_query": "Question",
55
- "metadata": {
56
- "version": 2.0
57
- }
58
- }
59
- },
60
- "versions": {
61
- "araPro": 2.0
62
- },
63
- "n-shot": {
64
- "araPro": 0
65
- },
66
- "higher_is_better": {
67
- "araPro": {
68
- "acc": true,
69
- "acc_norm": true
70
- }
71
- },
72
- "n-samples": {
73
- "araPro": {
74
- "original": 5001,
75
- "effective": 5001
76
- }
77
- },
78
- "config": {
79
- "model": "hf",
80
- "model_args": "pretrained=/ALLaM-7B-Instruct,trust_remote_code=True,cache_dir=/tmp,parallelize=False",
81
- "model_num_parameters": 7000559616,
82
- "model_dtype": "torch.bfloat16",
83
- "model_revision": "main",
84
- "model_sha": "",
85
- "batch_size": 1,
86
- "batch_sizes": [],
87
- "device": null,
88
- "use_cache": null,
89
- "limit": null,
90
- "bootstrap_iters": 100000,
91
- "gen_kwargs": null,
92
- "random_seed": 0,
93
- "numpy_seed": 1234,
94
- "torch_seed": 1234,
95
- "fewshot_seed": 1234
96
- },
97
- "git_hash": "b955b2950",
98
- "date": 1739617164.0204737,
99
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.88\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
100
- "transformers_version": "4.48.3",
101
- "upper_git_hash": null,
102
- "tokenizer_pad_token": [
103
- "<unk>",
104
- "0"
105
- ],
106
- "tokenizer_eos_token": [
107
- "</s>",
108
- "2"
109
- ],
110
- "tokenizer_bos_token": [
111
- "<s>",
112
- "1"
113
- ],
114
- "eot_token_id": 2,
115
- "max_length": 4096,
116
- "task_hashes": {
117
- "araPro": "01340c360a1565c46298c4c24dd3fdfe1ea614c6eef6e4d4f021f1da83da2584"
118
- },
119
- "model_source": "hf",
120
- "model_name": "/ALLaM-7B-Instruct",
121
- "model_name_sanitized": "/ALLaM-7B-Instruct",
122
- "system_instruction": null,
123
- "system_instruction_sha": null,
124
- "fewshot_as_multiturn": false,
125
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + ' [INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
126
- "chat_template_sha": "f1dff938141b507da4a409b6bb3431382088a97a963acd246a41f2f344ae831f",
127
- "start_time": 1391853.516943726,
128
- "end_time": 1392050.054185297,
129
- "total_evaluation_time_seconds": "196.5372415711172"
130
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/arabicmmlu_0_shot.json DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/ar/etec_0_shot.json DELETED
@@ -1,126 +0,0 @@
1
- {
2
- "results": {
3
- "etec": {
4
- "alias": "etec",
5
- "acc,none": 0.6666666666666666,
6
- "acc_stderr,none": 0.010854826817097195,
7
- "acc_norm,none": 0.6666666666666666,
8
- "acc_norm_stderr,none": 0.010854826817097195
9
- }
10
- },
11
- "group_subtasks": {
12
- "etec": []
13
- },
14
- "configs": {
15
- "etec": {
16
- "task": "etec",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/etec/etec.py",
21
- "dataset_name": "etec",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "validation_split": "validation",
26
- "test_split": "test",
27
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc):\n def format_example(doc, keys):\n question = doc[\"question\"].strip()\n \n choices = \"\".join(\n [f\"{key}. {choice}\\n\" for key, choice in zip(keys, doc[\"choices\"])]\n )\n prompt = f\"\u0627\u0644\u0633\u0624\u0627\u0644: {question}\\n{choices}\\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n print(doc[\"label\"])\n keys_ar = [\"\u0623\", \"\u0628\", \"\u062c\", \"\u062f\"]\n keys_en = [\"A\", \"B\", \"C\", \"D\"]\n out_doc = {\n \"query\": format_example(doc, keys_en),\n \"choices\": keys_en,\n \"gold\": int(doc[\"label\"])-1,\n }\n return out_doc\n \n return dataset.map(_process_docs)\n",
28
- "doc_to_text": "query",
29
- "doc_to_target": "gold",
30
- "doc_to_choice": "choices",
31
- "description": "\u0641\u064a\u0645\u0627 \u064a\u0644\u064a \u0623\u0633\u0626\u0644\u0629 \u0627\u0644\u0627\u062e\u062a\u064a\u0627\u0631 \u0645\u0646 \u0645\u062a\u0639\u062f\u062f (\u0645\u0639 \u0627\u0644\u0625\u062c\u0627\u0628\u0627\u062a) \u0645\u0646 \u0641\u0636\u0644\u0643 \u0627\u062e\u062a\u0631 \u0625\u062c\u0627\u0628\u0629 \u0648\u0627\u062d\u062f\u0629 \u062f\u0648\u0646 \u0634\u0631\u062d\n ",
32
- "target_delimiter": " ",
33
- "fewshot_delimiter": "\n\n",
34
- "num_fewshot": 0,
35
- "metric_list": [
36
- {
37
- "metric": "acc",
38
- "aggregation": "mean",
39
- "higher_is_better": true
40
- },
41
- {
42
- "metric": "acc_norm",
43
- "aggregation": "mean",
44
- "higher_is_better": true
45
- }
46
- ],
47
- "output_type": "multiple_choice",
48
- "repeats": 1,
49
- "should_decontaminate": true,
50
- "doc_to_decontamination_query": "query",
51
- "metadata": {
52
- "version": 0.0
53
- }
54
- }
55
- },
56
- "versions": {
57
- "etec": 0.0
58
- },
59
- "n-shot": {
60
- "etec": 0
61
- },
62
- "higher_is_better": {
63
- "etec": {
64
- "acc": true,
65
- "acc_norm": true
66
- }
67
- },
68
- "n-samples": {
69
- "etec": {
70
- "original": 1887,
71
- "effective": 1887
72
- }
73
- },
74
- "config": {
75
- "model": "hf",
76
- "model_args": "pretrained=/ALLaM-7B-Instruct,trust_remote_code=True,cache_dir=/tmp,parallelize=False",
77
- "model_num_parameters": 7000559616,
78
- "model_dtype": "torch.bfloat16",
79
- "model_revision": "main",
80
- "model_sha": "",
81
- "batch_size": 1,
82
- "batch_sizes": [],
83
- "device": null,
84
- "use_cache": null,
85
- "limit": null,
86
- "bootstrap_iters": 100000,
87
- "gen_kwargs": null,
88
- "random_seed": 0,
89
- "numpy_seed": 1234,
90
- "torch_seed": 1234,
91
- "fewshot_seed": 1234
92
- },
93
- "git_hash": "b955b2950",
94
- "date": 1739617421.4265695,
95
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.88\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
96
- "transformers_version": "4.48.3",
97
- "upper_git_hash": null,
98
- "tokenizer_pad_token": [
99
- "<unk>",
100
- "0"
101
- ],
102
- "tokenizer_eos_token": [
103
- "</s>",
104
- "2"
105
- ],
106
- "tokenizer_bos_token": [
107
- "<s>",
108
- "1"
109
- ],
110
- "eot_token_id": 2,
111
- "max_length": 4096,
112
- "task_hashes": {
113
- "etec": "a0d87bf7eb82815b66ea544cb632aafb803526dee24b399f30fdc751be442b60"
114
- },
115
- "model_source": "hf",
116
- "model_name": "/ALLaM-7B-Instruct",
117
- "model_name_sanitized": "/ALLaM-7B-Instruct",
118
- "system_instruction": null,
119
- "system_instruction_sha": null,
120
- "fewshot_as_multiturn": false,
121
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + ' [INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
122
- "chat_template_sha": "f1dff938141b507da4a409b6bb3431382088a97a963acd246a41f2f344ae831f",
123
- "start_time": 1392110.980523203,
124
- "end_time": 1392198.883363127,
125
- "total_evaluation_time_seconds": "87.90283992397599"
126
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/exams_ar_5_shot.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "results": {
3
- "exams_ar": {
4
- "alias": "exams_ar",
5
- "acc,none": 0.515828677839851,
6
- "acc_stderr,none": 0.021585885942816244,
7
- "acc_norm,none": 0.515828677839851,
8
- "acc_norm_stderr,none": 0.021585885942816244
9
- }
10
- },
11
- "group_subtasks": {
12
- "exams_ar": []
13
- },
14
- "configs": {
15
- "exams_ar": {
16
- "task": "exams_ar",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/exams_ar",
21
- "dataset_name": "exams_ar",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "test_split": "test",
26
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n\n def _process_docs(doc):\n def format_example(doc, keys):\n \"\"\"\n <prompt>\n \u0633\u0624\u0627\u0644:\n A. <choice1>\n B. <choice2>\n C. <choice3>\n D. <choice4>\n \u0627\u062c\u0627\u0628\u0629:\n \"\"\"\n \n question = doc[\"question\"].strip()\n \n choices = \"\".join(\n [f\"{key}. {choice}\\n\" for key, choice in zip(keys, doc[\"choices\"])]\n )\n prompt = f\"\u0627\u0644\u0633\u0624\u0627\u0644: {question}\\n{choices} \\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n def _format_subject(subject):\n arabic_words = subtasks_ar[subtasks.index(subject)]\n return arabic_words\n\n keys = [\"A\", \"B\", \"C\", \"D\"]\n \n subject = doc['id'].split(\"-\")[0]\n description = f\"\ufed2\ufef4\ufee3\ufe8d \ufef2\ufee0\ufef3 \ufe84\ufeb4\ufe8c\ufedf\ufe93 \ufe8d\ufefc\ufea8\ufe98\ufef3\ufe8d\ufead \ufee2\ufee7 \ufee2\ufe98\ufecb\ufea9\ufea9 (\ufee2\ufecb \ufe8d\ufefa\ufe9f\ufe8e\ufe91\ufe8e\ufe97) \ufea1\ufeee\ufedf {_format_subject(subject)} \\n\" #\ufee2\ufee7 \ufed2\ufec0\ufee0\ufedb \ufe8e\ufea8\ufe97\ufead \ufe88\ufe9f\ufe8e\ufe91\ufe93 \ufeed\ufe8e\ufea3\ufea9\ufe93 \ufee2\ufee7 \ufe90\ufef4\ufee7 'A\u060c B\u060c C\u060c D' \ufea9\ufeee\ufee7 \ufeb5\ufeae\ufea3\\n\"\n\n out_doc = {\n \"idx\": doc[\"idx\"],\n \"id\": doc[\"id\"],\n 'dsecription': description,\n \"query\": format_example(doc, keys), # \"Question: \" + doc[\"question\"]['stem'] + \"\\nAnswer:\",\n \"choices\": keys,\n \"gold\": [\"A\", \"B\", \"C\", \"D\"].index(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_docs)\n",
27
- "doc_to_text": "query",
28
- "doc_to_target": "gold",
29
- "doc_to_choice": "choices",
30
- "description": "description",
31
- "target_delimiter": " ",
32
- "fewshot_delimiter": "\n\n",
33
- "num_fewshot": 5,
34
- "metric_list": [
35
- {
36
- "metric": "acc",
37
- "aggregation": "mean",
38
- "higher_is_better": true
39
- },
40
- {
41
- "metric": "acc_norm",
42
- "aggregation": "mean",
43
- "higher_is_better": true
44
- }
45
- ],
46
- "output_type": "multiple_choice",
47
- "repeats": 1,
48
- "should_decontaminate": true,
49
- "doc_to_decontamination_query": "query",
50
- "metadata": {
51
- "version": 0.0
52
- }
53
- }
54
- },
55
- "versions": {
56
- "exams_ar": 0.0
57
- },
58
- "n-shot": {
59
- "exams_ar": 5
60
- },
61
- "higher_is_better": {
62
- "exams_ar": {
63
- "acc": true,
64
- "acc_norm": true
65
- }
66
- },
67
- "n-samples": {
68
- "exams_ar": {
69
- "original": 537,
70
- "effective": 537
71
- }
72
- },
73
- "config": {
74
- "model": "vllm",
75
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
76
- "batch_size": 1,
77
- "batch_sizes": [],
78
- "device": null,
79
- "use_cache": null,
80
- "limit": null,
81
- "bootstrap_iters": 100000,
82
- "gen_kwargs": null,
83
- "random_seed": 0,
84
- "numpy_seed": 1234,
85
- "torch_seed": 1234,
86
- "fewshot_seed": 1234
87
- },
88
- "git_hash": "8e1bd48d",
89
- "date": 1735662207.0830526,
90
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
91
- "transformers_version": "4.47.1",
92
- "upper_git_hash": null,
93
- "tokenizer_pad_token": [
94
- "<unk>",
95
- "0"
96
- ],
97
- "tokenizer_eos_token": [
98
- "</s>",
99
- "2"
100
- ],
101
- "tokenizer_bos_token": [
102
- "<s>",
103
- "1"
104
- ],
105
- "eot_token_id": 2,
106
- "max_length": 4096,
107
- "task_hashes": {
108
- "exams_ar": "b1561abd56354d570ac16bf64163b0ee8dc6c507234b05f678576b09c26c644a"
109
- },
110
- "model_source": "vllm",
111
- "model_name": "/ALLaM-7B-Instruct",
112
- "model_name_sanitized": "/ALLaM-7B-Instruct",
113
- "system_instruction": null,
114
- "system_instruction_sha": null,
115
- "fewshot_as_multiturn": false,
116
- "chat_template": null,
117
- "chat_template_sha": null,
118
- "start_time": 2867.397536365,
119
- "end_time": 2948.510496752,
120
- "total_evaluation_time_seconds": "81.11296038699993"
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/gat_0_shot.json DELETED
@@ -1,549 +0,0 @@
1
- {
2
- "results": {
3
- "gat": {
4
- "acc,none": 0.4452527279568544,
5
- "acc_stderr,none": 0.0038711388833064567,
6
- "alias": "gat"
7
- },
8
- "gat_algebra": {
9
- "alias": " - gat_algebra",
10
- "acc,none": 0.40667903525046384,
11
- "acc_stderr,none": 0.009463939247454995
12
- },
13
- "gat_analogy": {
14
- "alias": " - gat_analogy",
15
- "acc,none": 0.35919854280510016,
16
- "acc_stderr,none": 0.009158766245747282
17
- },
18
- "gat_arithmetic": {
19
- "alias": " - gat_arithmetic",
20
- "acc,none": 0.40154582259845417,
21
- "acc_stderr,none": 0.009406284814832203
22
- },
23
- "gat_association": {
24
- "alias": " - gat_association",
25
- "acc,none": 0.5464114832535886,
26
- "acc_stderr,none": 0.015407801869520031
27
- },
28
- "gat_comparisons": {
29
- "alias": " - gat_comparisons",
30
- "acc,none": 0.34508196721311474,
31
- "acc_stderr,none": 0.013616100682624904
32
- },
33
- "gat_completion": {
34
- "alias": " - gat_completion",
35
- "acc,none": 0.6057851239669422,
36
- "acc_stderr,none": 0.014054411207805699
37
- },
38
- "gat_contextual": {
39
- "alias": " - gat_contextual",
40
- "acc,none": 0.3941717791411043,
41
- "acc_stderr,none": 0.013537713096332765
42
- },
43
- "gat_geometry": {
44
- "alias": " - gat_geometry",
45
- "acc,none": 0.473972602739726,
46
- "acc_stderr,none": 0.026171590093068537
47
- },
48
- "gat_reading": {
49
- "alias": " - gat_reading",
50
- "acc,none": 0.5727788279773157,
51
- "acc_stderr,none": 0.009620311542503682
52
- }
53
- },
54
- "groups": {
55
- "gat": {
56
- "acc,none": 0.4452527279568544,
57
- "acc_stderr,none": 0.0038711388833064567,
58
- "alias": "gat"
59
- }
60
- },
61
- "group_subtasks": {
62
- "gat": [
63
- "gat_analogy",
64
- "gat_association",
65
- "gat_completion",
66
- "gat_reading",
67
- "gat_algebra",
68
- "gat_arithmetic",
69
- "gat_comparisons",
70
- "gat_contextual",
71
- "gat_geometry"
72
- ]
73
- },
74
- "configs": {
75
- "gat_algebra": {
76
- "task": "gat_algebra",
77
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
78
- "dataset_name": "algebra",
79
- "dataset_kwargs": {
80
- "trust_remote_code": true
81
- },
82
- "test_split": "test",
83
- "fewshot_split": "validation",
84
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
85
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
86
- "doc_to_target": "{{label}}",
87
- "doc_to_choice": [
88
- "\u0623",
89
- "\u0628",
90
- "\u062c",
91
- "\u062f"
92
- ],
93
- "description": "",
94
- "target_delimiter": " ",
95
- "fewshot_delimiter": "\n\n",
96
- "num_fewshot": 0,
97
- "metric_list": [
98
- {
99
- "metric": "acc",
100
- "aggregation": "mean",
101
- "higher_is_better": true
102
- }
103
- ],
104
- "output_type": "multiple_choice",
105
- "repeats": 1,
106
- "should_decontaminate": false,
107
- "metadata": {
108
- "version": 0.0
109
- }
110
- },
111
- "gat_analogy": {
112
- "task": "gat_analogy",
113
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
114
- "dataset_name": "analogy",
115
- "dataset_kwargs": {
116
- "trust_remote_code": true
117
- },
118
- "test_split": "test",
119
- "fewshot_split": "validation",
120
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
121
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
122
- "doc_to_target": "{{label}}",
123
- "doc_to_choice": [
124
- "\u0623",
125
- "\u0628",
126
- "\u062c",
127
- "\u062f"
128
- ],
129
- "description": "",
130
- "target_delimiter": " ",
131
- "fewshot_delimiter": "\n\n",
132
- "num_fewshot": 0,
133
- "metric_list": [
134
- {
135
- "metric": "acc",
136
- "aggregation": "mean",
137
- "higher_is_better": true
138
- }
139
- ],
140
- "output_type": "multiple_choice",
141
- "repeats": 1,
142
- "should_decontaminate": false,
143
- "metadata": {
144
- "version": 0.0
145
- }
146
- },
147
- "gat_arithmetic": {
148
- "task": "gat_arithmetic",
149
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
150
- "dataset_name": "arithmetic",
151
- "dataset_kwargs": {
152
- "trust_remote_code": true
153
- },
154
- "test_split": "test",
155
- "fewshot_split": "validation",
156
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
157
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
158
- "doc_to_target": "{{label}}",
159
- "doc_to_choice": [
160
- "\u0623",
161
- "\u0628",
162
- "\u062c",
163
- "\u062f"
164
- ],
165
- "description": "",
166
- "target_delimiter": " ",
167
- "fewshot_delimiter": "\n\n",
168
- "num_fewshot": 0,
169
- "metric_list": [
170
- {
171
- "metric": "acc",
172
- "aggregation": "mean",
173
- "higher_is_better": true
174
- }
175
- ],
176
- "output_type": "multiple_choice",
177
- "repeats": 1,
178
- "should_decontaminate": false,
179
- "metadata": {
180
- "version": 0.0
181
- }
182
- },
183
- "gat_association": {
184
- "task": "gat_association",
185
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
186
- "dataset_name": "association",
187
- "dataset_kwargs": {
188
- "trust_remote_code": true
189
- },
190
- "test_split": "test",
191
- "fewshot_split": "validation",
192
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
193
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
194
- "doc_to_target": "{{label}}",
195
- "doc_to_choice": [
196
- "\u0623",
197
- "\u0628",
198
- "\u062c",
199
- "\u062f"
200
- ],
201
- "description": "",
202
- "target_delimiter": " ",
203
- "fewshot_delimiter": "\n\n",
204
- "num_fewshot": 0,
205
- "metric_list": [
206
- {
207
- "metric": "acc",
208
- "aggregation": "mean",
209
- "higher_is_better": true
210
- }
211
- ],
212
- "output_type": "multiple_choice",
213
- "repeats": 1,
214
- "should_decontaminate": false,
215
- "metadata": {
216
- "version": 0.0
217
- }
218
- },
219
- "gat_comparisons": {
220
- "task": "gat_comparisons",
221
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
222
- "dataset_name": "comparisons",
223
- "dataset_kwargs": {
224
- "trust_remote_code": true
225
- },
226
- "test_split": "test",
227
- "fewshot_split": "validation",
228
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
229
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
230
- "doc_to_target": "{{label}}",
231
- "doc_to_choice": [
232
- "\u0623",
233
- "\u0628",
234
- "\u062c",
235
- "\u062f"
236
- ],
237
- "description": "",
238
- "target_delimiter": " ",
239
- "fewshot_delimiter": "\n\n",
240
- "num_fewshot": 0,
241
- "metric_list": [
242
- {
243
- "metric": "acc",
244
- "aggregation": "mean",
245
- "higher_is_better": true
246
- }
247
- ],
248
- "output_type": "multiple_choice",
249
- "repeats": 1,
250
- "should_decontaminate": false,
251
- "metadata": {
252
- "version": 0.0
253
- }
254
- },
255
- "gat_completion": {
256
- "task": "gat_completion",
257
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
258
- "dataset_name": "completion",
259
- "dataset_kwargs": {
260
- "trust_remote_code": true
261
- },
262
- "test_split": "test",
263
- "fewshot_split": "validation",
264
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
265
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
266
- "doc_to_target": "{{label}}",
267
- "doc_to_choice": [
268
- "\u0623",
269
- "\u0628",
270
- "\u062c",
271
- "\u062f"
272
- ],
273
- "description": "",
274
- "target_delimiter": " ",
275
- "fewshot_delimiter": "\n\n",
276
- "num_fewshot": 0,
277
- "metric_list": [
278
- {
279
- "metric": "acc",
280
- "aggregation": "mean",
281
- "higher_is_better": true
282
- }
283
- ],
284
- "output_type": "multiple_choice",
285
- "repeats": 1,
286
- "should_decontaminate": false,
287
- "metadata": {
288
- "version": 0.0
289
- }
290
- },
291
- "gat_contextual": {
292
- "task": "gat_contextual",
293
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
294
- "dataset_name": "contextual",
295
- "dataset_kwargs": {
296
- "trust_remote_code": true
297
- },
298
- "test_split": "test",
299
- "fewshot_split": "validation",
300
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
301
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
302
- "doc_to_target": "{{label}}",
303
- "doc_to_choice": [
304
- "\u0623",
305
- "\u0628",
306
- "\u062c",
307
- "\u062f"
308
- ],
309
- "description": "\u0627\u0648\u062c\u062f \u0627\u0644\u062e\u0637\u0623 \u0627\u0644\u0633\u064a\u0627\u0642\u064a \u0641\u064a \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u0627\u0644\u062a\u0627\u0644\u064a\u0629 \u0645\u0646 \u0628\u064a\u0646 \u0627\u0644\u062e\u064a\u0627\u0631\u0627\u062a:",
310
- "target_delimiter": " ",
311
- "fewshot_delimiter": "\n\n",
312
- "num_fewshot": 0,
313
- "metric_list": [
314
- {
315
- "metric": "acc",
316
- "aggregation": "mean",
317
- "higher_is_better": true
318
- }
319
- ],
320
- "output_type": "multiple_choice",
321
- "repeats": 1,
322
- "should_decontaminate": false,
323
- "metadata": {
324
- "version": 0.0
325
- }
326
- },
327
- "gat_geometry": {
328
- "task": "gat_geometry",
329
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
330
- "dataset_name": "geometry",
331
- "dataset_kwargs": {
332
- "trust_remote_code": true
333
- },
334
- "test_split": "test",
335
- "fewshot_split": "validation",
336
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
337
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
338
- "doc_to_target": "{{label}}",
339
- "doc_to_choice": [
340
- "\u0623",
341
- "\u0628",
342
- "\u062c",
343
- "\u062f"
344
- ],
345
- "description": "",
346
- "target_delimiter": " ",
347
- "fewshot_delimiter": "\n\n",
348
- "num_fewshot": 0,
349
- "metric_list": [
350
- {
351
- "metric": "acc",
352
- "aggregation": "mean",
353
- "higher_is_better": true
354
- }
355
- ],
356
- "output_type": "multiple_choice",
357
- "repeats": 1,
358
- "should_decontaminate": false,
359
- "metadata": {
360
- "version": 0.0
361
- }
362
- },
363
- "gat_reading": {
364
- "task": "gat_reading",
365
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
366
- "dataset_name": "reading",
367
- "dataset_kwargs": {
368
- "trust_remote_code": true
369
- },
370
- "test_split": "test",
371
- "fewshot_split": "validation",
372
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
373
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
374
- "doc_to_target": "{{label}}",
375
- "doc_to_choice": [
376
- "\u0623",
377
- "\u0628",
378
- "\u062c",
379
- "\u062f"
380
- ],
381
- "description": "",
382
- "target_delimiter": " ",
383
- "fewshot_delimiter": "\n\n",
384
- "num_fewshot": 0,
385
- "metric_list": [
386
- {
387
- "metric": "acc",
388
- "aggregation": "mean",
389
- "higher_is_better": true
390
- }
391
- ],
392
- "output_type": "multiple_choice",
393
- "repeats": 1,
394
- "should_decontaminate": false,
395
- "metadata": {
396
- "version": 0.0
397
- }
398
- }
399
- },
400
- "versions": {
401
- "gat": 0,
402
- "gat_algebra": 0.0,
403
- "gat_analogy": 0.0,
404
- "gat_arithmetic": 0.0,
405
- "gat_association": 0.0,
406
- "gat_comparisons": 0.0,
407
- "gat_completion": 0.0,
408
- "gat_contextual": 0.0,
409
- "gat_geometry": 0.0,
410
- "gat_reading": 0.0
411
- },
412
- "n-shot": {
413
- "gat_algebra": 0,
414
- "gat_analogy": 0,
415
- "gat_arithmetic": 0,
416
- "gat_association": 0,
417
- "gat_comparisons": 0,
418
- "gat_completion": 0,
419
- "gat_contextual": 0,
420
- "gat_geometry": 0,
421
- "gat_reading": 0
422
- },
423
- "higher_is_better": {
424
- "gat": {
425
- "acc": true
426
- },
427
- "gat_algebra": {
428
- "acc": true
429
- },
430
- "gat_analogy": {
431
- "acc": true
432
- },
433
- "gat_arithmetic": {
434
- "acc": true
435
- },
436
- "gat_association": {
437
- "acc": true
438
- },
439
- "gat_comparisons": {
440
- "acc": true
441
- },
442
- "gat_completion": {
443
- "acc": true
444
- },
445
- "gat_contextual": {
446
- "acc": true
447
- },
448
- "gat_geometry": {
449
- "acc": true
450
- },
451
- "gat_reading": {
452
- "acc": true
453
- }
454
- },
455
- "n-samples": {
456
- "gat_analogy": {
457
- "original": 2745,
458
- "effective": 2745
459
- },
460
- "gat_association": {
461
- "original": 1045,
462
- "effective": 1045
463
- },
464
- "gat_completion": {
465
- "original": 1210,
466
- "effective": 1210
467
- },
468
- "gat_reading": {
469
- "original": 2645,
470
- "effective": 2645
471
- },
472
- "gat_algebra": {
473
- "original": 2695,
474
- "effective": 2695
475
- },
476
- "gat_arithmetic": {
477
- "original": 2717,
478
- "effective": 2717
479
- },
480
- "gat_comparisons": {
481
- "original": 1220,
482
- "effective": 1220
483
- },
484
- "gat_contextual": {
485
- "original": 1304,
486
- "effective": 1304
487
- },
488
- "gat_geometry": {
489
- "original": 365,
490
- "effective": 365
491
- }
492
- },
493
- "config": {
494
- "model": "vllm",
495
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
496
- "batch_size": 1,
497
- "batch_sizes": [],
498
- "device": null,
499
- "use_cache": null,
500
- "limit": null,
501
- "bootstrap_iters": 100000,
502
- "gen_kwargs": null,
503
- "random_seed": 0,
504
- "numpy_seed": 1234,
505
- "torch_seed": 1234,
506
- "fewshot_seed": 1234
507
- },
508
- "git_hash": "8e1bd48d",
509
- "date": 1735664096.2650902,
510
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
511
- "transformers_version": "4.47.1",
512
- "upper_git_hash": null,
513
- "tokenizer_pad_token": [
514
- "<unk>",
515
- "0"
516
- ],
517
- "tokenizer_eos_token": [
518
- "</s>",
519
- "2"
520
- ],
521
- "tokenizer_bos_token": [
522
- "<s>",
523
- "1"
524
- ],
525
- "eot_token_id": 2,
526
- "max_length": 4096,
527
- "task_hashes": {
528
- "gat_analogy": "ede28dec097bfebe8a85a19fa27d001696858276df66254bdb70fc63231f1a83",
529
- "gat_association": "5d82550d46c4f3cabf370185a8a23cc2eb5b08f1f0c5e210a8a712562a44bd08",
530
- "gat_completion": "fc3c19dd7f1896696fec1bffc21182804c9b2f1fb8d8c882428a6bb4bb61e370",
531
- "gat_reading": "93053b187a750d2e87f5488f2d0fda944f3da9195bb04d1c4dee9c4b56fa626a",
532
- "gat_algebra": "77832c595eaaf156775c3dbb27da0915ef600ebf46a7113ae32a202b0359e8a6",
533
- "gat_arithmetic": "6a498f75f5cc0ffd1b30f7a6293ba80d08f2a8876d5558d8e934bf57355ff0cc",
534
- "gat_comparisons": "acb80c0ed8dd07e916a471189aef3a546efc289824b2cc50a32c11dc4c97c9c1",
535
- "gat_contextual": "de063ed3b94011d74ee24a6532122c9d344fc15e42800db44f0849995a0bc37a",
536
- "gat_geometry": "3e482885559a4404ee9e97556edc6e49959770a499f4ae2c58f18ad85b91a363"
537
- },
538
- "model_source": "vllm",
539
- "model_name": "/ALLaM-7B-Instruct",
540
- "model_name_sanitized": "/ALLaM-7B-Instruct",
541
- "system_instruction": null,
542
- "system_instruction_sha": null,
543
- "fewshot_as_multiturn": false,
544
- "chat_template": null,
545
- "chat_template_sha": null,
546
- "start_time": 4756.376698655,
547
- "end_time": 5124.76942052,
548
- "total_evaluation_time_seconds": "368.39272186499966"
549
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/moe_ien_mcq_0_shot.json DELETED
@@ -1,127 +0,0 @@
1
- {
2
- "results": {
3
- "moe_ien_mcq": {
4
- "alias": "moe_ien_mcq",
5
- "acc,none": 0.9177177177177177,
6
- "acc_stderr,none": 0.002749455634736978,
7
- "acc_norm,none": 0.9177177177177177,
8
- "acc_norm_stderr,none": 0.002749455634736978
9
- }
10
- },
11
- "group_subtasks": {
12
- "moe_ien_mcq": []
13
- },
14
- "configs": {
15
- "moe_ien_mcq": {
16
- "task": "moe_ien_mcq",
17
- "dataset_path": "lm_eval/tasks/moe_ien_mcq/ien_moe_mcq.py",
18
- "dataset_name": "moe_ien_mcq",
19
- "dataset_kwargs": {
20
- "trust_remote_code": true
21
- },
22
- "validation_split": "validation",
23
- "test_split": "test",
24
- "fewshot_split": "validation",
25
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc): \n def remove_prefix(choice):\n return choice.split(\". \", 1)[1] if \". \" in choice else choice\n\n def format_example(doc, keys):\n question = doc[\"Question\"].strip()\n \n choices = \"\".join(\n [f\"{key}. {remove_prefix(choice)}\\n\" for key, choice in zip(keys, doc[\"Choices\"])]\n \n )\n prompt = f\"\\n\\n\u0633\u0624\u0627\u0644: {question}\\n{choices} \\n\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n keys = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"][0:len(doc[\"Choices\"])]\n out_doc = {\n \"Query\": format_example(doc, keys), \n \"Choices\": keys,\n \"gold\": int(doc[\"Answer\"])-1, ## \n } \n return out_doc\n \n return dataset.map(_process_docs)\n",
26
- "doc_to_text": "Query",
27
- "doc_to_target": "gold",
28
- "doc_to_choice": "{{Choices}}",
29
- "description": "\u0641\u064a\u0645\u0627\u202f\u064a\u0644\u064a\u202f\u0623\u0633\u0626\u0644\u0629\u202f\u0627\u0644\u0627\u062e\u062a\u064a\u0627\u0631\u202f\u0645\u0646\u202f\u0645\u062a\u0639\u062f\u062f\u202f(\u0645\u0639\u202f\u0627\u0644\u0625\u062c\u0627\u0628\u0627\u062a)\u202f\u0641\u064a\u202f{{Subject}}",
30
- "target_delimiter": " ",
31
- "fewshot_delimiter": "\n\n",
32
- "fewshot_config": {
33
- "sampler": "balanced_cat"
34
- },
35
- "num_fewshot": 0,
36
- "metric_list": [
37
- {
38
- "metric": "acc",
39
- "aggregation": "mean",
40
- "higher_is_better": true
41
- },
42
- {
43
- "metric": "acc_norm",
44
- "aggregation": "mean",
45
- "higher_is_better": true
46
- }
47
- ],
48
- "output_type": "multiple_choice",
49
- "repeats": 1,
50
- "should_decontaminate": true,
51
- "doc_to_decontamination_query": "Query",
52
- "metadata": {
53
- "version": 0.0
54
- }
55
- }
56
- },
57
- "versions": {
58
- "moe_ien_mcq": 0.0
59
- },
60
- "n-shot": {
61
- "moe_ien_mcq": 0
62
- },
63
- "higher_is_better": {
64
- "moe_ien_mcq": {
65
- "acc": true,
66
- "acc_norm": true
67
- }
68
- },
69
- "n-samples": {
70
- "moe_ien_mcq": {
71
- "original": 9990,
72
- "effective": 9990
73
- }
74
- },
75
- "config": {
76
- "model": "hf",
77
- "model_args": "pretrained=/ALLaM-7B-Instruct,trust_remote_code=True,cache_dir=/tmp,parallelize=False",
78
- "model_num_parameters": 7000559616,
79
- "model_dtype": "torch.bfloat16",
80
- "model_revision": "main",
81
- "model_sha": "",
82
- "batch_size": 1,
83
- "batch_sizes": [],
84
- "device": null,
85
- "use_cache": null,
86
- "limit": null,
87
- "bootstrap_iters": 100000,
88
- "gen_kwargs": null,
89
- "random_seed": 0,
90
- "numpy_seed": 1234,
91
- "torch_seed": 1234,
92
- "fewshot_seed": 1234
93
- },
94
- "git_hash": "b955b2950",
95
- "date": 1739617571.8184838,
96
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.88\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
97
- "transformers_version": "4.48.3",
98
- "upper_git_hash": null,
99
- "tokenizer_pad_token": [
100
- "<unk>",
101
- "0"
102
- ],
103
- "tokenizer_eos_token": [
104
- "</s>",
105
- "2"
106
- ],
107
- "tokenizer_bos_token": [
108
- "<s>",
109
- "1"
110
- ],
111
- "eot_token_id": 2,
112
- "max_length": 4096,
113
- "task_hashes": {
114
- "moe_ien_mcq": "504533b140426f12c89d975ef421328fc89d69af8719c420a1bf897ed4724191"
115
- },
116
- "model_source": "hf",
117
- "model_name": "/ALLaM-7B-Instruct",
118
- "model_name_sanitized": "/ALLaM-7B-Instruct",
119
- "system_instruction": null,
120
- "system_instruction_sha": null,
121
- "fewshot_as_multiturn": false,
122
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + ' [INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
123
- "chat_template_sha": "f1dff938141b507da4a409b6bb3431382088a97a963acd246a41f2f344ae831f",
124
- "start_time": 1392261.292633723,
125
- "end_time": 1392626.942167409,
126
- "total_evaluation_time_seconds": "365.64953368599527"
127
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/moe_ien_tf_0_shot.json DELETED
@@ -1,129 +0,0 @@
1
- {
2
- "results": {
3
- "moe_ien_tf": {
4
- "alias": "moe_ien_tf",
5
- "acc,none": 0.8294693456980937,
6
- "acc_stderr,none": 0.004929073554117403,
7
- "acc_norm,none": 0.8294693456980937,
8
- "acc_norm_stderr,none": 0.004929073554117403
9
- }
10
- },
11
- "group_subtasks": {
12
- "moe_ien_tf": []
13
- },
14
- "configs": {
15
- "moe_ien_tf": {
16
- "task": "moe_ien_tf",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/moe_ien_tf/moe_ien_tf.py",
21
- "dataset_name": "moe_ien_tf",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "validation_split": "validation",
26
- "test_split": "test",
27
- "fewshot_split": "validation",
28
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc):\n keys=[\"\u0635\u062d\u064a\u062d\u0629\",\n \"\u062e\u0627\u0637\u0626\u0629\"\n ]\n #keys =[\"\u0635\u0648\u0627\u0628\",\n # \"\u062e\u0637\u0623\"]\n target_key = int(doc[\"Answer\"])-1\n\n out_doc = {\n \"query\": \"\\n\\n\u0627\u0644\u0633\u0624\u0627\u0644:\" +doc[\"Question\"]+\"\\n\u0625\u062c\u0627\u0628\u0629:'\", \n \"choices\": keys,\n \"gold\": target_key,\n }\n return out_doc\n return dataset.map(_process_docs)\n",
29
- "doc_to_text": "query",
30
- "doc_to_target": "gold",
31
- "doc_to_choice": "choices",
32
- "description": "\u0641\u064a\u0645\u0627 \u064a\u0644\u064a \u0639\u0628\u0627\u0631\u0627\u062a \u0625\u0645\u0627 \u0635\u062d\u064a\u062d\u0629 \u0623\u0648 \u062e\u0627\u0637\u0626\u0629 \u062d\u0648\u0644 {{Subject}}\n \u0627\u0644\u0631\u062c\u0627\u0621 \u062a\u0635\u0646\u064a\u0641 \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u0625\u0644\u0649 '\u0635\u062d\u064a\u062d\u0629' \u0623\u0648 '\u062e\u0627\u0637\u0626\u0629' \u062f\u0648\u0646 \u0634\u0631\u062d ",
33
- "target_delimiter": " ",
34
- "fewshot_delimiter": "\n\n",
35
- "fewshot_config": {
36
- "sampler": "balanced_cat"
37
- },
38
- "num_fewshot": 0,
39
- "metric_list": [
40
- {
41
- "metric": "acc",
42
- "aggregation": "mean",
43
- "higher_is_better": true
44
- },
45
- {
46
- "metric": "acc_norm",
47
- "aggregation": "mean",
48
- "higher_is_better": true
49
- }
50
- ],
51
- "output_type": "multiple_choice",
52
- "repeats": 1,
53
- "should_decontaminate": false,
54
- "metadata": {
55
- "version": 2.0
56
- }
57
- }
58
- },
59
- "versions": {
60
- "moe_ien_tf": 2.0
61
- },
62
- "n-shot": {
63
- "moe_ien_tf": 0
64
- },
65
- "higher_is_better": {
66
- "moe_ien_tf": {
67
- "acc": true,
68
- "acc_norm": true
69
- }
70
- },
71
- "n-samples": {
72
- "moe_ien_tf": {
73
- "original": 5823,
74
- "effective": 5823
75
- }
76
- },
77
- "config": {
78
- "model": "hf",
79
- "model_args": "pretrained=/ALLaM-7B-Instruct,trust_remote_code=True,cache_dir=/tmp,parallelize=False",
80
- "model_num_parameters": 7000559616,
81
- "model_dtype": "torch.bfloat16",
82
- "model_revision": "main",
83
- "model_sha": "",
84
- "batch_size": 1,
85
- "batch_sizes": [],
86
- "device": null,
87
- "use_cache": null,
88
- "limit": null,
89
- "bootstrap_iters": 100000,
90
- "gen_kwargs": null,
91
- "random_seed": 0,
92
- "numpy_seed": 1234,
93
- "torch_seed": 1234,
94
- "fewshot_seed": 1234
95
- },
96
- "git_hash": "b955b2950",
97
- "date": 1739617995.3462336,
98
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.88\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
99
- "transformers_version": "4.48.3",
100
- "upper_git_hash": null,
101
- "tokenizer_pad_token": [
102
- "<unk>",
103
- "0"
104
- ],
105
- "tokenizer_eos_token": [
106
- "</s>",
107
- "2"
108
- ],
109
- "tokenizer_bos_token": [
110
- "<s>",
111
- "1"
112
- ],
113
- "eot_token_id": 2,
114
- "max_length": 4096,
115
- "task_hashes": {
116
- "moe_ien_tf": "8701a646f6ea8b9bb96c028f817fbeabfb9031580f5054368b43d14d4a5a1270"
117
- },
118
- "model_source": "hf",
119
- "model_name": "/ALLaM-7B-Instruct",
120
- "model_name_sanitized": "/ALLaM-7B-Instruct",
121
- "system_instruction": null,
122
- "system_instruction_sha": null,
123
- "fewshot_as_multiturn": false,
124
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + ' [INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
125
- "chat_template_sha": "f1dff938141b507da4a409b6bb3431382088a97a963acd246a41f2f344ae831f",
126
- "start_time": 1392684.818305694,
127
- "end_time": 1392900.218863064,
128
- "total_evaluation_time_seconds": "215.40055736992508"
129
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/openaimmlu_0_shot.json DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/en/agieval_0_shot.json DELETED
@@ -1,1108 +0,0 @@
1
- {
2
- "results": {
3
- "agieval": {
4
- "acc,none": 0.4175133043057571,
5
- "acc_stderr,none": 0.0050080978184310855,
6
- "alias": "agieval"
7
- },
8
- "agieval_aqua_rat": {
9
- "alias": " - agieval_aqua_rat",
10
- "acc,none": 0.28346456692913385,
11
- "acc_stderr,none": 0.028334004921307634,
12
- "acc_norm,none": 0.28346456692913385,
13
- "acc_norm_stderr,none": 0.02833400492130763
14
- },
15
- "agieval_gaokao_biology": {
16
- "alias": " - agieval_gaokao_biology",
17
- "acc,none": 0.319047619047619,
18
- "acc_stderr,none": 0.03224133248962465,
19
- "acc_norm,none": 0.3619047619047619,
20
- "acc_norm_stderr,none": 0.03324043951593503
21
- },
22
- "agieval_gaokao_chemistry": {
23
- "alias": " - agieval_gaokao_chemistry",
24
- "acc,none": 0.33816425120772947,
25
- "acc_stderr,none": 0.03296137710480074,
26
- "acc_norm,none": 0.32367149758454106,
27
- "acc_norm_stderr,none": 0.03259848850179343
28
- },
29
- "agieval_gaokao_chinese": {
30
- "alias": " - agieval_gaokao_chinese",
31
- "acc,none": 0.3089430894308943,
32
- "acc_stderr,none": 0.02951977938940491,
33
- "acc_norm,none": 0.3048780487804878,
34
- "acc_norm_stderr,none": 0.029411050550756265
35
- },
36
- "agieval_gaokao_english": {
37
- "alias": " - agieval_gaokao_english",
38
- "acc,none": 0.7352941176470589,
39
- "acc_stderr,none": 0.025261691219729494,
40
- "acc_norm,none": 0.7516339869281046,
41
- "acc_norm_stderr,none": 0.02473998135511359
42
- },
43
- "agieval_gaokao_geography": {
44
- "alias": " - agieval_gaokao_geography",
45
- "acc,none": 0.4472361809045226,
46
- "acc_stderr,none": 0.035335047084973224,
47
- "acc_norm,none": 0.4472361809045226,
48
- "acc_norm_stderr,none": 0.035335047084973224
49
- },
50
- "agieval_gaokao_history": {
51
- "alias": " - agieval_gaokao_history",
52
- "acc,none": 0.43829787234042555,
53
- "acc_stderr,none": 0.03243618636108102,
54
- "acc_norm,none": 0.39574468085106385,
55
- "acc_norm_stderr,none": 0.03196758697835362
56
- },
57
- "agieval_gaokao_mathcloze": {
58
- "alias": " - agieval_gaokao_mathcloze",
59
- "acc,none": 0.0423728813559322,
60
- "acc_stderr,none": 0.018622984668462274
61
- },
62
- "agieval_gaokao_mathqa": {
63
- "alias": " - agieval_gaokao_mathqa",
64
- "acc,none": 0.27635327635327633,
65
- "acc_stderr,none": 0.02390350500312722,
66
- "acc_norm,none": 0.2678062678062678,
67
- "acc_norm_stderr,none": 0.023669514493780283
68
- },
69
- "agieval_gaokao_physics": {
70
- "alias": " - agieval_gaokao_physics",
71
- "acc,none": 0.36,
72
- "acc_stderr,none": 0.034026297840400156,
73
- "acc_norm,none": 0.355,
74
- "acc_norm_stderr,none": 0.03392091008070853
75
- },
76
- "agieval_jec_qa_ca": {
77
- "alias": " - agieval_jec_qa_ca",
78
- "acc,none": 0.5025025025025025,
79
- "acc_stderr,none": 0.015827025208013587,
80
- "acc_norm,none": 0.4924924924924925,
81
- "acc_norm_stderr,none": 0.015825439216141556
82
- },
83
- "agieval_jec_qa_kd": {
84
- "alias": " - agieval_jec_qa_kd",
85
- "acc,none": 0.568,
86
- "acc_stderr,none": 0.01567232023733621,
87
- "acc_norm,none": 0.518,
88
- "acc_norm_stderr,none": 0.015809045699406728
89
- },
90
- "agieval_logiqa_en": {
91
- "alias": " - agieval_logiqa_en",
92
- "acc,none": 0.42242703533026116,
93
- "acc_stderr,none": 0.01937414753071922,
94
- "acc_norm,none": 0.42857142857142855,
95
- "acc_norm_stderr,none": 0.01941046344247875
96
- },
97
- "agieval_logiqa_zh": {
98
- "alias": " - agieval_logiqa_zh",
99
- "acc,none": 0.38095238095238093,
100
- "acc_stderr,none": 0.01904761904761897,
101
- "acc_norm,none": 0.3717357910906298,
102
- "acc_norm_stderr,none": 0.01895534398822881
103
- },
104
- "agieval_lsat_ar": {
105
- "alias": " - agieval_lsat_ar",
106
- "acc,none": 0.17391304347826086,
107
- "acc_stderr,none": 0.02504731738604971,
108
- "acc_norm,none": 0.1826086956521739,
109
- "acc_norm_stderr,none": 0.02553042195273417
110
- },
111
- "agieval_lsat_lr": {
112
- "alias": " - agieval_lsat_lr",
113
- "acc,none": 0.696078431372549,
114
- "acc_stderr,none": 0.0203868890006473,
115
- "acc_norm,none": 0.6647058823529411,
116
- "acc_norm_stderr,none": 0.020925162390233513
117
- },
118
- "agieval_lsat_rc": {
119
- "alias": " - agieval_lsat_rc",
120
- "acc,none": 0.5836431226765799,
121
- "acc_stderr,none": 0.030111969407536524,
122
- "acc_norm,none": 0.5464684014869888,
123
- "acc_norm_stderr,none": 0.03041017404275444
124
- },
125
- "agieval_math": {
126
- "alias": " - agieval_math",
127
- "acc,none": 0.086,
128
- "acc_stderr,none": 0.008870325962594766
129
- },
130
- "agieval_sat_en": {
131
- "alias": " - agieval_sat_en",
132
- "acc,none": 0.8155339805825242,
133
- "acc_stderr,none": 0.02708958103176961,
134
- "acc_norm,none": 0.7912621359223301,
135
- "acc_norm_stderr,none": 0.028384671935185523
136
- },
137
- "agieval_sat_en_without_passage": {
138
- "alias": " - agieval_sat_en_without_passage",
139
- "acc,none": 0.44660194174757284,
140
- "acc_stderr,none": 0.03472179658263948,
141
- "acc_norm,none": 0.4174757281553398,
142
- "acc_norm_stderr,none": 0.034442581739193366
143
- },
144
- "agieval_sat_math": {
145
- "alias": " - agieval_sat_math",
146
- "acc,none": 0.38636363636363635,
147
- "acc_stderr,none": 0.03290270539316666,
148
- "acc_norm,none": 0.37272727272727274,
149
- "acc_norm_stderr,none": 0.0326739568483895
150
- }
151
- },
152
- "groups": {
153
- "agieval": {
154
- "acc,none": 0.4175133043057571,
155
- "acc_stderr,none": 0.0050080978184310855,
156
- "alias": "agieval"
157
- }
158
- },
159
- "group_subtasks": {
160
- "agieval": [
161
- "agieval_gaokao_biology",
162
- "agieval_gaokao_chemistry",
163
- "agieval_gaokao_chinese",
164
- "agieval_gaokao_geography",
165
- "agieval_gaokao_history",
166
- "agieval_gaokao_mathcloze",
167
- "agieval_gaokao_mathqa",
168
- "agieval_gaokao_physics",
169
- "agieval_jec_qa_ca",
170
- "agieval_jec_qa_kd",
171
- "agieval_logiqa_zh",
172
- "agieval_aqua_rat",
173
- "agieval_gaokao_english",
174
- "agieval_logiqa_en",
175
- "agieval_lsat_ar",
176
- "agieval_lsat_lr",
177
- "agieval_lsat_rc",
178
- "agieval_math",
179
- "agieval_sat_en_without_passage",
180
- "agieval_sat_en",
181
- "agieval_sat_math"
182
- ]
183
- },
184
- "configs": {
185
- "agieval_aqua_rat": {
186
- "task": "agieval_aqua_rat",
187
- "dataset_path": "hails/agieval-aqua-rat",
188
- "test_split": "test",
189
- "doc_to_text": "{{query}}",
190
- "doc_to_target": "{{gold}}",
191
- "doc_to_choice": "{{choices}}",
192
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
193
- "description": "",
194
- "target_delimiter": " ",
195
- "fewshot_delimiter": "\n\n",
196
- "num_fewshot": 0,
197
- "metric_list": [
198
- {
199
- "metric": "acc",
200
- "aggregation": "mean",
201
- "higher_is_better": true
202
- },
203
- {
204
- "metric": "acc_norm",
205
- "aggregation": "mean",
206
- "higher_is_better": true
207
- }
208
- ],
209
- "output_type": "multiple_choice",
210
- "repeats": 1,
211
- "should_decontaminate": false,
212
- "metadata": {
213
- "version": 1.0
214
- }
215
- },
216
- "agieval_gaokao_biology": {
217
- "task": "agieval_gaokao_biology",
218
- "dataset_path": "hails/agieval-gaokao-biology",
219
- "test_split": "test",
220
- "doc_to_text": "{{query}}",
221
- "doc_to_target": "{{gold}}",
222
- "doc_to_choice": "{{choices}}",
223
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
224
- "description": "",
225
- "target_delimiter": " ",
226
- "fewshot_delimiter": "\n\n",
227
- "num_fewshot": 0,
228
- "metric_list": [
229
- {
230
- "metric": "acc",
231
- "aggregation": "mean",
232
- "higher_is_better": true
233
- },
234
- {
235
- "metric": "acc_norm",
236
- "aggregation": "mean",
237
- "higher_is_better": true
238
- }
239
- ],
240
- "output_type": "multiple_choice",
241
- "repeats": 1,
242
- "should_decontaminate": false,
243
- "metadata": {
244
- "version": 1.0
245
- }
246
- },
247
- "agieval_gaokao_chemistry": {
248
- "task": "agieval_gaokao_chemistry",
249
- "dataset_path": "hails/agieval-gaokao-chemistry",
250
- "test_split": "test",
251
- "doc_to_text": "{{query}}",
252
- "doc_to_target": "{{gold}}",
253
- "doc_to_choice": "{{choices}}",
254
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
255
- "description": "",
256
- "target_delimiter": " ",
257
- "fewshot_delimiter": "\n\n",
258
- "num_fewshot": 0,
259
- "metric_list": [
260
- {
261
- "metric": "acc",
262
- "aggregation": "mean",
263
- "higher_is_better": true
264
- },
265
- {
266
- "metric": "acc_norm",
267
- "aggregation": "mean",
268
- "higher_is_better": true
269
- }
270
- ],
271
- "output_type": "multiple_choice",
272
- "repeats": 1,
273
- "should_decontaminate": false,
274
- "metadata": {
275
- "version": 1.0
276
- }
277
- },
278
- "agieval_gaokao_chinese": {
279
- "task": "agieval_gaokao_chinese",
280
- "dataset_path": "hails/agieval-gaokao-chinese",
281
- "test_split": "test",
282
- "doc_to_text": "{{query}}",
283
- "doc_to_target": "{{gold}}",
284
- "doc_to_choice": "{{choices}}",
285
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
286
- "description": "",
287
- "target_delimiter": " ",
288
- "fewshot_delimiter": "\n\n",
289
- "num_fewshot": 0,
290
- "metric_list": [
291
- {
292
- "metric": "acc",
293
- "aggregation": "mean",
294
- "higher_is_better": true
295
- },
296
- {
297
- "metric": "acc_norm",
298
- "aggregation": "mean",
299
- "higher_is_better": true
300
- }
301
- ],
302
- "output_type": "multiple_choice",
303
- "repeats": 1,
304
- "should_decontaminate": false,
305
- "metadata": {
306
- "version": 1.0
307
- }
308
- },
309
- "agieval_gaokao_english": {
310
- "task": "agieval_gaokao_english",
311
- "dataset_path": "hails/agieval-gaokao-english",
312
- "test_split": "test",
313
- "doc_to_text": "{{query}}",
314
- "doc_to_target": "{{gold}}",
315
- "doc_to_choice": "{{choices}}",
316
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
317
- "description": "",
318
- "target_delimiter": " ",
319
- "fewshot_delimiter": "\n\n",
320
- "num_fewshot": 0,
321
- "metric_list": [
322
- {
323
- "metric": "acc",
324
- "aggregation": "mean",
325
- "higher_is_better": true
326
- },
327
- {
328
- "metric": "acc_norm",
329
- "aggregation": "mean",
330
- "higher_is_better": true
331
- }
332
- ],
333
- "output_type": "multiple_choice",
334
- "repeats": 1,
335
- "should_decontaminate": false,
336
- "metadata": {
337
- "version": 1.0
338
- }
339
- },
340
- "agieval_gaokao_geography": {
341
- "task": "agieval_gaokao_geography",
342
- "dataset_path": "hails/agieval-gaokao-geography",
343
- "test_split": "test",
344
- "doc_to_text": "{{query}}",
345
- "doc_to_target": "{{gold}}",
346
- "doc_to_choice": "{{choices}}",
347
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
348
- "description": "",
349
- "target_delimiter": " ",
350
- "fewshot_delimiter": "\n\n",
351
- "num_fewshot": 0,
352
- "metric_list": [
353
- {
354
- "metric": "acc",
355
- "aggregation": "mean",
356
- "higher_is_better": true
357
- },
358
- {
359
- "metric": "acc_norm",
360
- "aggregation": "mean",
361
- "higher_is_better": true
362
- }
363
- ],
364
- "output_type": "multiple_choice",
365
- "repeats": 1,
366
- "should_decontaminate": false,
367
- "metadata": {
368
- "version": 1.0
369
- }
370
- },
371
- "agieval_gaokao_history": {
372
- "task": "agieval_gaokao_history",
373
- "dataset_path": "hails/agieval-gaokao-history",
374
- "test_split": "test",
375
- "doc_to_text": "{{query}}",
376
- "doc_to_target": "{{gold}}",
377
- "doc_to_choice": "{{choices}}",
378
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
379
- "description": "",
380
- "target_delimiter": " ",
381
- "fewshot_delimiter": "\n\n",
382
- "num_fewshot": 0,
383
- "metric_list": [
384
- {
385
- "metric": "acc",
386
- "aggregation": "mean",
387
- "higher_is_better": true
388
- },
389
- {
390
- "metric": "acc_norm",
391
- "aggregation": "mean",
392
- "higher_is_better": true
393
- }
394
- ],
395
- "output_type": "multiple_choice",
396
- "repeats": 1,
397
- "should_decontaminate": false,
398
- "metadata": {
399
- "version": 1.0
400
- }
401
- },
402
- "agieval_gaokao_mathcloze": {
403
- "task": "agieval_gaokao_mathcloze",
404
- "dataset_path": "hails/agieval-gaokao-mathcloze",
405
- "test_split": "test",
406
- "doc_to_text": "{{query}}",
407
- "doc_to_target": "{{answer}}",
408
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidate = results[0]\n\n gold = doc[\"answer\"]\n\n if not gold:\n print(doc, candidate, gold)\n if is_equiv(candidate, gold):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"acc\": retval,\n }\n return results\n",
409
- "description": "",
410
- "target_delimiter": " ",
411
- "fewshot_delimiter": "\n\n",
412
- "num_fewshot": 0,
413
- "metric_list": [
414
- {
415
- "metric": "acc",
416
- "aggregation": "mean",
417
- "higher_is_better": true
418
- }
419
- ],
420
- "output_type": "generate_until",
421
- "generation_kwargs": {
422
- "max_gen_toks": 32,
423
- "do_sample": false,
424
- "temperature": 0.0,
425
- "until": [
426
- "Q:"
427
- ]
428
- },
429
- "repeats": 1,
430
- "should_decontaminate": false,
431
- "metadata": {
432
- "version": 1.0
433
- }
434
- },
435
- "agieval_gaokao_mathqa": {
436
- "task": "agieval_gaokao_mathqa",
437
- "dataset_path": "hails/agieval-gaokao-mathqa",
438
- "test_split": "test",
439
- "doc_to_text": "{{query}}",
440
- "doc_to_target": "{{gold}}",
441
- "doc_to_choice": "{{choices}}",
442
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
443
- "description": "",
444
- "target_delimiter": " ",
445
- "fewshot_delimiter": "\n\n",
446
- "num_fewshot": 0,
447
- "metric_list": [
448
- {
449
- "metric": "acc",
450
- "aggregation": "mean",
451
- "higher_is_better": true
452
- },
453
- {
454
- "metric": "acc_norm",
455
- "aggregation": "mean",
456
- "higher_is_better": true
457
- }
458
- ],
459
- "output_type": "multiple_choice",
460
- "repeats": 1,
461
- "should_decontaminate": false,
462
- "metadata": {
463
- "version": 1.0
464
- }
465
- },
466
- "agieval_gaokao_physics": {
467
- "task": "agieval_gaokao_physics",
468
- "dataset_path": "hails/agieval-gaokao-physics",
469
- "test_split": "test",
470
- "doc_to_text": "{{query}}",
471
- "doc_to_target": "{{gold}}",
472
- "doc_to_choice": "{{choices}}",
473
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
474
- "description": "",
475
- "target_delimiter": " ",
476
- "fewshot_delimiter": "\n\n",
477
- "num_fewshot": 0,
478
- "metric_list": [
479
- {
480
- "metric": "acc",
481
- "aggregation": "mean",
482
- "higher_is_better": true
483
- },
484
- {
485
- "metric": "acc_norm",
486
- "aggregation": "mean",
487
- "higher_is_better": true
488
- }
489
- ],
490
- "output_type": "multiple_choice",
491
- "repeats": 1,
492
- "should_decontaminate": false,
493
- "metadata": {
494
- "version": 1.0
495
- }
496
- },
497
- "agieval_jec_qa_ca": {
498
- "task": "agieval_jec_qa_ca",
499
- "dataset_path": "hails/agieval-jec-qa-ca",
500
- "test_split": "test",
501
- "doc_to_text": "{{query}}",
502
- "doc_to_target": "{{gold}}",
503
- "doc_to_choice": "{{choices}}",
504
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
505
- "description": "",
506
- "target_delimiter": " ",
507
- "fewshot_delimiter": "\n\n",
508
- "num_fewshot": 0,
509
- "metric_list": [
510
- {
511
- "metric": "acc",
512
- "aggregation": "mean",
513
- "higher_is_better": true
514
- },
515
- {
516
- "metric": "acc_norm",
517
- "aggregation": "mean",
518
- "higher_is_better": true
519
- }
520
- ],
521
- "output_type": "multiple_choice",
522
- "repeats": 1,
523
- "should_decontaminate": false,
524
- "metadata": {
525
- "version": 1.0
526
- }
527
- },
528
- "agieval_jec_qa_kd": {
529
- "task": "agieval_jec_qa_kd",
530
- "dataset_path": "hails/agieval-jec-qa-kd",
531
- "test_split": "test",
532
- "doc_to_text": "{{query}}",
533
- "doc_to_target": "{{gold}}",
534
- "doc_to_choice": "{{choices}}",
535
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
536
- "description": "",
537
- "target_delimiter": " ",
538
- "fewshot_delimiter": "\n\n",
539
- "num_fewshot": 0,
540
- "metric_list": [
541
- {
542
- "metric": "acc",
543
- "aggregation": "mean",
544
- "higher_is_better": true
545
- },
546
- {
547
- "metric": "acc_norm",
548
- "aggregation": "mean",
549
- "higher_is_better": true
550
- }
551
- ],
552
- "output_type": "multiple_choice",
553
- "repeats": 1,
554
- "should_decontaminate": false,
555
- "metadata": {
556
- "version": 1.0
557
- }
558
- },
559
- "agieval_logiqa_en": {
560
- "task": "agieval_logiqa_en",
561
- "dataset_path": "hails/agieval-logiqa-en",
562
- "test_split": "test",
563
- "doc_to_text": "{{query}}",
564
- "doc_to_target": "{{gold}}",
565
- "doc_to_choice": "{{choices}}",
566
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
567
- "description": "",
568
- "target_delimiter": " ",
569
- "fewshot_delimiter": "\n\n",
570
- "num_fewshot": 0,
571
- "metric_list": [
572
- {
573
- "metric": "acc",
574
- "aggregation": "mean",
575
- "higher_is_better": true
576
- },
577
- {
578
- "metric": "acc_norm",
579
- "aggregation": "mean",
580
- "higher_is_better": true
581
- }
582
- ],
583
- "output_type": "multiple_choice",
584
- "repeats": 1,
585
- "should_decontaminate": false,
586
- "metadata": {
587
- "version": 1.0
588
- }
589
- },
590
- "agieval_logiqa_zh": {
591
- "task": "agieval_logiqa_zh",
592
- "dataset_path": "hails/agieval-logiqa-zh",
593
- "test_split": "test",
594
- "doc_to_text": "{{query}}",
595
- "doc_to_target": "{{gold}}",
596
- "doc_to_choice": "{{choices}}",
597
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
598
- "description": "",
599
- "target_delimiter": " ",
600
- "fewshot_delimiter": "\n\n",
601
- "num_fewshot": 0,
602
- "metric_list": [
603
- {
604
- "metric": "acc",
605
- "aggregation": "mean",
606
- "higher_is_better": true
607
- },
608
- {
609
- "metric": "acc_norm",
610
- "aggregation": "mean",
611
- "higher_is_better": true
612
- }
613
- ],
614
- "output_type": "multiple_choice",
615
- "repeats": 1,
616
- "should_decontaminate": false,
617
- "metadata": {
618
- "version": 1.0
619
- }
620
- },
621
- "agieval_lsat_ar": {
622
- "task": "agieval_lsat_ar",
623
- "dataset_path": "hails/agieval-lsat-ar",
624
- "test_split": "test",
625
- "doc_to_text": "{{query}}",
626
- "doc_to_target": "{{gold}}",
627
- "doc_to_choice": "{{choices}}",
628
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
629
- "description": "",
630
- "target_delimiter": " ",
631
- "fewshot_delimiter": "\n\n",
632
- "num_fewshot": 0,
633
- "metric_list": [
634
- {
635
- "metric": "acc",
636
- "aggregation": "mean",
637
- "higher_is_better": true
638
- },
639
- {
640
- "metric": "acc_norm",
641
- "aggregation": "mean",
642
- "higher_is_better": true
643
- }
644
- ],
645
- "output_type": "multiple_choice",
646
- "repeats": 1,
647
- "should_decontaminate": false,
648
- "metadata": {
649
- "version": 1.0
650
- }
651
- },
652
- "agieval_lsat_lr": {
653
- "task": "agieval_lsat_lr",
654
- "dataset_path": "hails/agieval-lsat-lr",
655
- "test_split": "test",
656
- "doc_to_text": "{{query}}",
657
- "doc_to_target": "{{gold}}",
658
- "doc_to_choice": "{{choices}}",
659
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
660
- "description": "",
661
- "target_delimiter": " ",
662
- "fewshot_delimiter": "\n\n",
663
- "num_fewshot": 0,
664
- "metric_list": [
665
- {
666
- "metric": "acc",
667
- "aggregation": "mean",
668
- "higher_is_better": true
669
- },
670
- {
671
- "metric": "acc_norm",
672
- "aggregation": "mean",
673
- "higher_is_better": true
674
- }
675
- ],
676
- "output_type": "multiple_choice",
677
- "repeats": 1,
678
- "should_decontaminate": false,
679
- "metadata": {
680
- "version": 1.0
681
- }
682
- },
683
- "agieval_lsat_rc": {
684
- "task": "agieval_lsat_rc",
685
- "dataset_path": "hails/agieval-lsat-rc",
686
- "test_split": "test",
687
- "doc_to_text": "{{query}}",
688
- "doc_to_target": "{{gold}}",
689
- "doc_to_choice": "{{choices}}",
690
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
691
- "description": "",
692
- "target_delimiter": " ",
693
- "fewshot_delimiter": "\n\n",
694
- "num_fewshot": 0,
695
- "metric_list": [
696
- {
697
- "metric": "acc",
698
- "aggregation": "mean",
699
- "higher_is_better": true
700
- },
701
- {
702
- "metric": "acc_norm",
703
- "aggregation": "mean",
704
- "higher_is_better": true
705
- }
706
- ],
707
- "output_type": "multiple_choice",
708
- "repeats": 1,
709
- "should_decontaminate": false,
710
- "metadata": {
711
- "version": 1.0
712
- }
713
- },
714
- "agieval_math": {
715
- "task": "agieval_math",
716
- "dataset_path": "hails/agieval-math",
717
- "test_split": "test",
718
- "doc_to_text": "{{query}}",
719
- "doc_to_target": "{{answer}}",
720
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidate = results[0]\n\n gold = doc[\"answer\"]\n\n if not gold:\n print(doc, candidate, gold)\n if is_equiv(candidate, gold):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"acc\": retval,\n }\n return results\n",
721
- "description": "",
722
- "target_delimiter": " ",
723
- "fewshot_delimiter": "\n\n",
724
- "num_fewshot": 0,
725
- "metric_list": [
726
- {
727
- "metric": "acc",
728
- "aggregation": "mean",
729
- "higher_is_better": true
730
- }
731
- ],
732
- "output_type": "generate_until",
733
- "generation_kwargs": {
734
- "max_gen_toks": 32,
735
- "do_sample": false,
736
- "temperature": 0.0,
737
- "until": [
738
- "Q:"
739
- ]
740
- },
741
- "repeats": 1,
742
- "should_decontaminate": false,
743
- "metadata": {
744
- "version": 1.0
745
- }
746
- },
747
- "agieval_sat_en": {
748
- "task": "agieval_sat_en",
749
- "dataset_path": "hails/agieval-sat-en",
750
- "test_split": "test",
751
- "doc_to_text": "{{query}}",
752
- "doc_to_target": "{{gold}}",
753
- "doc_to_choice": "{{choices}}",
754
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
755
- "description": "",
756
- "target_delimiter": " ",
757
- "fewshot_delimiter": "\n\n",
758
- "num_fewshot": 0,
759
- "metric_list": [
760
- {
761
- "metric": "acc",
762
- "aggregation": "mean",
763
- "higher_is_better": true
764
- },
765
- {
766
- "metric": "acc_norm",
767
- "aggregation": "mean",
768
- "higher_is_better": true
769
- }
770
- ],
771
- "output_type": "multiple_choice",
772
- "repeats": 1,
773
- "should_decontaminate": false,
774
- "metadata": {
775
- "version": 1.0
776
- }
777
- },
778
- "agieval_sat_en_without_passage": {
779
- "task": "agieval_sat_en_without_passage",
780
- "dataset_path": "hails/agieval-sat-en-without-passage",
781
- "test_split": "test",
782
- "doc_to_text": "{{query}}",
783
- "doc_to_target": "{{gold}}",
784
- "doc_to_choice": "{{choices}}",
785
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
786
- "description": "",
787
- "target_delimiter": " ",
788
- "fewshot_delimiter": "\n\n",
789
- "num_fewshot": 0,
790
- "metric_list": [
791
- {
792
- "metric": "acc",
793
- "aggregation": "mean",
794
- "higher_is_better": true
795
- },
796
- {
797
- "metric": "acc_norm",
798
- "aggregation": "mean",
799
- "higher_is_better": true
800
- }
801
- ],
802
- "output_type": "multiple_choice",
803
- "repeats": 1,
804
- "should_decontaminate": false,
805
- "metadata": {
806
- "version": 1.0
807
- }
808
- },
809
- "agieval_sat_math": {
810
- "task": "agieval_sat_math",
811
- "dataset_path": "hails/agieval-sat-math",
812
- "test_split": "test",
813
- "doc_to_text": "{{query}}",
814
- "doc_to_target": "{{gold}}",
815
- "doc_to_choice": "{{choices}}",
816
- "process_results": "def process_results_mcqa(doc, results):\n results = [result[0] for result in results]\n\n gold = doc[\"gold\"]\n\n acc = 1.0 if int(np.argmax(results)) in gold else 0.0\n completion_len = np.array([float(len(i)) for i in doc[\"choices\"]])\n acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0\n\n return {\n \"acc\": acc,\n \"acc_norm\": acc_norm,\n }\n",
817
- "description": "",
818
- "target_delimiter": " ",
819
- "fewshot_delimiter": "\n\n",
820
- "num_fewshot": 0,
821
- "metric_list": [
822
- {
823
- "metric": "acc",
824
- "aggregation": "mean",
825
- "higher_is_better": true
826
- },
827
- {
828
- "metric": "acc_norm",
829
- "aggregation": "mean",
830
- "higher_is_better": true
831
- }
832
- ],
833
- "output_type": "multiple_choice",
834
- "repeats": 1,
835
- "should_decontaminate": false,
836
- "metadata": {
837
- "version": 1.0
838
- }
839
- }
840
- },
841
- "versions": {
842
- "agieval": 0.0,
843
- "agieval_aqua_rat": 1.0,
844
- "agieval_gaokao_biology": 1.0,
845
- "agieval_gaokao_chemistry": 1.0,
846
- "agieval_gaokao_chinese": 1.0,
847
- "agieval_gaokao_english": 1.0,
848
- "agieval_gaokao_geography": 1.0,
849
- "agieval_gaokao_history": 1.0,
850
- "agieval_gaokao_mathcloze": 1.0,
851
- "agieval_gaokao_mathqa": 1.0,
852
- "agieval_gaokao_physics": 1.0,
853
- "agieval_jec_qa_ca": 1.0,
854
- "agieval_jec_qa_kd": 1.0,
855
- "agieval_logiqa_en": 1.0,
856
- "agieval_logiqa_zh": 1.0,
857
- "agieval_lsat_ar": 1.0,
858
- "agieval_lsat_lr": 1.0,
859
- "agieval_lsat_rc": 1.0,
860
- "agieval_math": 1.0,
861
- "agieval_sat_en": 1.0,
862
- "agieval_sat_en_without_passage": 1.0,
863
- "agieval_sat_math": 1.0
864
- },
865
- "n-shot": {
866
- "agieval_aqua_rat": 0,
867
- "agieval_gaokao_biology": 0,
868
- "agieval_gaokao_chemistry": 0,
869
- "agieval_gaokao_chinese": 0,
870
- "agieval_gaokao_english": 0,
871
- "agieval_gaokao_geography": 0,
872
- "agieval_gaokao_history": 0,
873
- "agieval_gaokao_mathcloze": 0,
874
- "agieval_gaokao_mathqa": 0,
875
- "agieval_gaokao_physics": 0,
876
- "agieval_jec_qa_ca": 0,
877
- "agieval_jec_qa_kd": 0,
878
- "agieval_logiqa_en": 0,
879
- "agieval_logiqa_zh": 0,
880
- "agieval_lsat_ar": 0,
881
- "agieval_lsat_lr": 0,
882
- "agieval_lsat_rc": 0,
883
- "agieval_math": 0,
884
- "agieval_sat_en": 0,
885
- "agieval_sat_en_without_passage": 0,
886
- "agieval_sat_math": 0
887
- },
888
- "higher_is_better": {
889
- "agieval": {
890
- "acc": true,
891
- "acc_norm": true
892
- },
893
- "agieval_aqua_rat": {
894
- "acc": true,
895
- "acc_norm": true
896
- },
897
- "agieval_gaokao_biology": {
898
- "acc": true,
899
- "acc_norm": true
900
- },
901
- "agieval_gaokao_chemistry": {
902
- "acc": true,
903
- "acc_norm": true
904
- },
905
- "agieval_gaokao_chinese": {
906
- "acc": true,
907
- "acc_norm": true
908
- },
909
- "agieval_gaokao_english": {
910
- "acc": true,
911
- "acc_norm": true
912
- },
913
- "agieval_gaokao_geography": {
914
- "acc": true,
915
- "acc_norm": true
916
- },
917
- "agieval_gaokao_history": {
918
- "acc": true,
919
- "acc_norm": true
920
- },
921
- "agieval_gaokao_mathcloze": {
922
- "acc": true
923
- },
924
- "agieval_gaokao_mathqa": {
925
- "acc": true,
926
- "acc_norm": true
927
- },
928
- "agieval_gaokao_physics": {
929
- "acc": true,
930
- "acc_norm": true
931
- },
932
- "agieval_jec_qa_ca": {
933
- "acc": true,
934
- "acc_norm": true
935
- },
936
- "agieval_jec_qa_kd": {
937
- "acc": true,
938
- "acc_norm": true
939
- },
940
- "agieval_logiqa_en": {
941
- "acc": true,
942
- "acc_norm": true
943
- },
944
- "agieval_logiqa_zh": {
945
- "acc": true,
946
- "acc_norm": true
947
- },
948
- "agieval_lsat_ar": {
949
- "acc": true,
950
- "acc_norm": true
951
- },
952
- "agieval_lsat_lr": {
953
- "acc": true,
954
- "acc_norm": true
955
- },
956
- "agieval_lsat_rc": {
957
- "acc": true,
958
- "acc_norm": true
959
- },
960
- "agieval_math": {
961
- "acc": true
962
- },
963
- "agieval_sat_en": {
964
- "acc": true,
965
- "acc_norm": true
966
- },
967
- "agieval_sat_en_without_passage": {
968
- "acc": true,
969
- "acc_norm": true
970
- },
971
- "agieval_sat_math": {
972
- "acc": true,
973
- "acc_norm": true
974
- }
975
- },
976
- "n-samples": {
977
- "agieval_gaokao_biology": {
978
- "original": 210,
979
- "effective": 210
980
- },
981
- "agieval_gaokao_chemistry": {
982
- "original": 207,
983
- "effective": 207
984
- },
985
- "agieval_gaokao_chinese": {
986
- "original": 246,
987
- "effective": 246
988
- },
989
- "agieval_gaokao_geography": {
990
- "original": 199,
991
- "effective": 199
992
- },
993
- "agieval_gaokao_history": {
994
- "original": 235,
995
- "effective": 235
996
- },
997
- "agieval_gaokao_mathcloze": {
998
- "original": 118,
999
- "effective": 118
1000
- },
1001
- "agieval_gaokao_mathqa": {
1002
- "original": 351,
1003
- "effective": 351
1004
- },
1005
- "agieval_gaokao_physics": {
1006
- "original": 200,
1007
- "effective": 200
1008
- },
1009
- "agieval_jec_qa_ca": {
1010
- "original": 999,
1011
- "effective": 999
1012
- },
1013
- "agieval_jec_qa_kd": {
1014
- "original": 1000,
1015
- "effective": 1000
1016
- },
1017
- "agieval_logiqa_zh": {
1018
- "original": 651,
1019
- "effective": 651
1020
- },
1021
- "agieval_aqua_rat": {
1022
- "original": 254,
1023
- "effective": 254
1024
- },
1025
- "agieval_gaokao_english": {
1026
- "original": 306,
1027
- "effective": 306
1028
- },
1029
- "agieval_logiqa_en": {
1030
- "original": 651,
1031
- "effective": 651
1032
- },
1033
- "agieval_lsat_ar": {
1034
- "original": 230,
1035
- "effective": 230
1036
- },
1037
- "agieval_lsat_lr": {
1038
- "original": 510,
1039
- "effective": 510
1040
- },
1041
- "agieval_lsat_rc": {
1042
- "original": 269,
1043
- "effective": 269
1044
- },
1045
- "agieval_math": {
1046
- "original": 1000,
1047
- "effective": 1000
1048
- },
1049
- "agieval_sat_en_without_passage": {
1050
- "original": 206,
1051
- "effective": 206
1052
- },
1053
- "agieval_sat_en": {
1054
- "original": 206,
1055
- "effective": 206
1056
- },
1057
- "agieval_sat_math": {
1058
- "original": 220,
1059
- "effective": 220
1060
- }
1061
- },
1062
- "config": {
1063
- "model": "vllm",
1064
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
1065
- "batch_size": 1,
1066
- "batch_sizes": [],
1067
- "device": null,
1068
- "use_cache": null,
1069
- "limit": null,
1070
- "bootstrap_iters": 100000,
1071
- "gen_kwargs": null,
1072
- "random_seed": 0,
1073
- "numpy_seed": 1234,
1074
- "torch_seed": 1234,
1075
- "fewshot_seed": 1234
1076
- },
1077
- "git_hash": "8e1bd48d",
1078
- "date": 1735956443.5467572,
1079
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
1080
- "transformers_version": "4.47.1",
1081
- "upper_git_hash": null,
1082
- "tokenizer_pad_token": [
1083
- "<unk>",
1084
- "0"
1085
- ],
1086
- "tokenizer_eos_token": [
1087
- "</s>",
1088
- "2"
1089
- ],
1090
- "tokenizer_bos_token": [
1091
- "<s>",
1092
- "1"
1093
- ],
1094
- "eot_token_id": 2,
1095
- "max_length": 4096,
1096
- "task_hashes": {},
1097
- "model_source": "vllm",
1098
- "model_name": "/ALLaM-7B-Instruct",
1099
- "model_name_sanitized": "/ALLaM-7B-Instruct",
1100
- "system_instruction": null,
1101
- "system_instruction_sha": null,
1102
- "fewshot_as_multiturn": false,
1103
- "chat_template": null,
1104
- "chat_template_sha": null,
1105
- "start_time": 23113.003334144,
1106
- "end_time": 23735.631059832,
1107
- "total_evaluation_time_seconds": "622.6277256880021"
1108
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/gpqa_main_n_shot_0_shot.json DELETED
@@ -1,123 +0,0 @@
1
- {
2
- "results": {
3
- "gpqa_main_n_shot": {
4
- "alias": "gpqa_main_n_shot",
5
- "acc,none": 0.22098214285714285,
6
- "acc_stderr,none": 0.01962449705224272,
7
- "acc_norm,none": 0.22098214285714285,
8
- "acc_norm_stderr,none": 0.01962449705224272
9
- }
10
- },
11
- "group_subtasks": {
12
- "gpqa_main_n_shot": []
13
- },
14
- "configs": {
15
- "gpqa_main_n_shot": {
16
- "task": "gpqa_main_n_shot",
17
- "tag": "gpqa",
18
- "dataset_path": "Idavidrein/gpqa",
19
- "dataset_name": "gpqa_main",
20
- "training_split": "train",
21
- "validation_split": "train",
22
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n rng.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
23
- "doc_to_text": "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:",
24
- "doc_to_target": "answer",
25
- "doc_to_choice": [
26
- "(A)",
27
- "(B)",
28
- "(C)",
29
- "(D)"
30
- ],
31
- "description": "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n",
32
- "target_delimiter": " ",
33
- "fewshot_delimiter": "\n\n",
34
- "num_fewshot": 0,
35
- "metric_list": [
36
- {
37
- "metric": "acc",
38
- "aggregation": "mean",
39
- "higher_is_better": true
40
- },
41
- {
42
- "metric": "acc_norm",
43
- "aggregation": "mean",
44
- "higher_is_better": true
45
- }
46
- ],
47
- "output_type": "multiple_choice",
48
- "repeats": 1,
49
- "should_decontaminate": false,
50
- "metadata": {
51
- "version": 2.0
52
- }
53
- }
54
- },
55
- "versions": {
56
- "gpqa_main_n_shot": 2.0
57
- },
58
- "n-shot": {
59
- "gpqa_main_n_shot": 0
60
- },
61
- "higher_is_better": {
62
- "gpqa_main_n_shot": {
63
- "acc": true,
64
- "acc_norm": true
65
- }
66
- },
67
- "n-samples": {
68
- "gpqa_main_n_shot": {
69
- "original": 448,
70
- "effective": 448
71
- }
72
- },
73
- "config": {
74
- "model": "hf",
75
- "model_args": "pretrained=/ALLaM-7B-Instruct,trust_remote_code=True,cache_dir=/tmp,parallelize=True",
76
- "model_num_parameters": 7000559616,
77
- "model_dtype": "torch.bfloat16",
78
- "model_revision": "main",
79
- "model_sha": "",
80
- "batch_size": 1,
81
- "batch_sizes": [],
82
- "device": null,
83
- "use_cache": null,
84
- "limit": null,
85
- "bootstrap_iters": 100000,
86
- "gen_kwargs": null,
87
- "random_seed": 0,
88
- "numpy_seed": 1234,
89
- "torch_seed": 1234,
90
- "fewshot_seed": 1234
91
- },
92
- "git_hash": "8e1bd48d",
93
- "date": 1734941625.7186382,
94
- "pretty_env_info": "PyTorch version: 2.1.0a0+29c30b1\nIs debug build: False\nCUDA used to build PyTorch: 12.2\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.22.2\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.1.0a0+29c30b1\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.16.0a0\n[pip3] triton==2.0.0.dev20221202\n[conda] Could not collect",
95
- "transformers_version": "4.47.1",
96
- "upper_git_hash": "18b53334e0494773088a01c543e721a58f958e0d",
97
- "tokenizer_pad_token": [
98
- "<unk>",
99
- "0"
100
- ],
101
- "tokenizer_eos_token": [
102
- "</s>",
103
- "2"
104
- ],
105
- "tokenizer_bos_token": [
106
- "<s>",
107
- "1"
108
- ],
109
- "eot_token_id": 2,
110
- "max_length": 4096,
111
- "task_hashes": {},
112
- "model_source": "hf",
113
- "model_name": "/ALLaM-7B-Instruct",
114
- "model_name_sanitized": "/ALLaM-7B-Instruct",
115
- "system_instruction": null,
116
- "system_instruction_sha": null,
117
- "fewshot_as_multiturn": false,
118
- "chat_template": null,
119
- "chat_template_sha": null,
120
- "start_time": 66386.780938561,
121
- "end_time": 66441.200832346,
122
- "total_evaluation_time_seconds": "54.41989378500148"
123
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/gsm8k_5_shot.json DELETED
@@ -1,153 +0,0 @@
1
- {
2
- "results": {
3
- "gsm8k": {
4
- "alias": "gsm8k",
5
- "exact_match,strict-match": 0.620166793025019,
6
- "exact_match_stderr,strict-match": 0.013368818096960501,
7
- "exact_match,flexible-extract": 0.623199393479909,
8
- "exact_match_stderr,flexible-extract": 0.01334785875782916
9
- }
10
- },
11
- "group_subtasks": {
12
- "gsm8k": []
13
- },
14
- "configs": {
15
- "gsm8k": {
16
- "task": "gsm8k",
17
- "tag": [
18
- "math_word_problems"
19
- ],
20
- "dataset_path": "gsm8k",
21
- "dataset_name": "main",
22
- "training_split": "train",
23
- "test_split": "test",
24
- "fewshot_split": "train",
25
- "doc_to_text": "Question: {{question}}\nAnswer:",
26
- "doc_to_target": "{{answer}}",
27
- "description": "",
28
- "target_delimiter": " ",
29
- "fewshot_delimiter": "\n\n",
30
- "num_fewshot": 5,
31
- "metric_list": [
32
- {
33
- "metric": "exact_match",
34
- "aggregation": "mean",
35
- "higher_is_better": true,
36
- "ignore_case": true,
37
- "ignore_punctuation": false,
38
- "regexes_to_ignore": [
39
- ",",
40
- "\\$",
41
- "(?s).*#### ",
42
- "\\.$"
43
- ]
44
- }
45
- ],
46
- "output_type": "generate_until",
47
- "generation_kwargs": {
48
- "until": [
49
- "Question:",
50
- "</s>",
51
- "<|im_end|>"
52
- ],
53
- "do_sample": false,
54
- "temperature": 0.0
55
- },
56
- "repeats": 1,
57
- "filter_list": [
58
- {
59
- "name": "strict-match",
60
- "filter": [
61
- {
62
- "function": "regex",
63
- "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
64
- },
65
- {
66
- "function": "take_first"
67
- }
68
- ]
69
- },
70
- {
71
- "name": "flexible-extract",
72
- "filter": [
73
- {
74
- "function": "regex",
75
- "group_select": -1,
76
- "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
77
- },
78
- {
79
- "function": "take_first"
80
- }
81
- ]
82
- }
83
- ],
84
- "should_decontaminate": false,
85
- "metadata": {
86
- "version": 3.0
87
- }
88
- }
89
- },
90
- "versions": {
91
- "gsm8k": 3.0
92
- },
93
- "n-shot": {
94
- "gsm8k": 5
95
- },
96
- "higher_is_better": {
97
- "gsm8k": {
98
- "exact_match": true
99
- }
100
- },
101
- "n-samples": {
102
- "gsm8k": {
103
- "original": 1319,
104
- "effective": 1319
105
- }
106
- },
107
- "config": {
108
- "model": "vllm",
109
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
110
- "batch_size": 1,
111
- "batch_sizes": [],
112
- "device": null,
113
- "use_cache": null,
114
- "limit": null,
115
- "bootstrap_iters": 100000,
116
- "gen_kwargs": null,
117
- "random_seed": 0,
118
- "numpy_seed": 1234,
119
- "torch_seed": 1234,
120
- "fewshot_seed": 1234
121
- },
122
- "git_hash": "8e1bd48d",
123
- "date": 1735956272.5546186,
124
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
125
- "transformers_version": "4.47.1",
126
- "upper_git_hash": null,
127
- "tokenizer_pad_token": [
128
- "<unk>",
129
- "0"
130
- ],
131
- "tokenizer_eos_token": [
132
- "</s>",
133
- "2"
134
- ],
135
- "tokenizer_bos_token": [
136
- "<s>",
137
- "1"
138
- ],
139
- "eot_token_id": 2,
140
- "max_length": 4096,
141
- "task_hashes": {},
142
- "model_source": "vllm",
143
- "model_name": "/ALLaM-7B-Instruct",
144
- "model_name_sanitized": "/ALLaM-7B-Instruct",
145
- "system_instruction": null,
146
- "system_instruction_sha": null,
147
- "fewshot_as_multiturn": false,
148
- "chat_template": null,
149
- "chat_template_sha": null,
150
- "start_time": 22942.105525776,
151
- "end_time": 23057.183463458,
152
- "total_evaluation_time_seconds": "115.07793768199917"
153
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/hellaswag_0_shot.json DELETED
@@ -1,118 +0,0 @@
1
- {
2
- "results": {
3
- "hellaswag": {
4
- "alias": "hellaswag",
5
- "acc,none": 0.5771758613821948,
6
- "acc_stderr,none": 0.00492998369279507,
7
- "acc_norm,none": 0.7625970922127067,
8
- "acc_norm_stderr,none": 0.0042462162299898715
9
- }
10
- },
11
- "group_subtasks": {
12
- "hellaswag": []
13
- },
14
- "configs": {
15
- "hellaswag": {
16
- "task": "hellaswag",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "hellaswag",
21
- "dataset_kwargs": {
22
- "trust_remote_code": true
23
- },
24
- "training_split": "train",
25
- "validation_split": "validation",
26
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
- "doc_to_text": "{{query}}",
28
- "doc_to_target": "{{label}}",
29
- "doc_to_choice": "choices",
30
- "description": "",
31
- "target_delimiter": " ",
32
- "fewshot_delimiter": "\n\n",
33
- "num_fewshot": 0,
34
- "metric_list": [
35
- {
36
- "metric": "acc",
37
- "aggregation": "mean",
38
- "higher_is_better": true
39
- },
40
- {
41
- "metric": "acc_norm",
42
- "aggregation": "mean",
43
- "higher_is_better": true
44
- }
45
- ],
46
- "output_type": "multiple_choice",
47
- "repeats": 1,
48
- "should_decontaminate": false,
49
- "metadata": {
50
- "version": 1.0
51
- }
52
- }
53
- },
54
- "versions": {
55
- "hellaswag": 1.0
56
- },
57
- "n-shot": {
58
- "hellaswag": 0
59
- },
60
- "higher_is_better": {
61
- "hellaswag": {
62
- "acc": true,
63
- "acc_norm": true
64
- }
65
- },
66
- "n-samples": {
67
- "hellaswag": {
68
- "original": 10042,
69
- "effective": 10042
70
- }
71
- },
72
- "config": {
73
- "model": "vllm",
74
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
75
- "batch_size": 1,
76
- "batch_sizes": [],
77
- "device": null,
78
- "use_cache": null,
79
- "limit": null,
80
- "bootstrap_iters": 100000,
81
- "gen_kwargs": null,
82
- "random_seed": 0,
83
- "numpy_seed": 1234,
84
- "torch_seed": 1234,
85
- "fewshot_seed": 1234
86
- },
87
- "git_hash": "8e1bd48d",
88
- "date": 1735957117.4813576,
89
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
90
- "transformers_version": "4.47.1",
91
- "upper_git_hash": null,
92
- "tokenizer_pad_token": [
93
- "<unk>",
94
- "0"
95
- ],
96
- "tokenizer_eos_token": [
97
- "</s>",
98
- "2"
99
- ],
100
- "tokenizer_bos_token": [
101
- "<s>",
102
- "1"
103
- ],
104
- "eot_token_id": 2,
105
- "max_length": 4096,
106
- "task_hashes": {},
107
- "model_source": "vllm",
108
- "model_name": "/ALLaM-7B-Instruct",
109
- "model_name_sanitized": "/ALLaM-7B-Instruct",
110
- "system_instruction": null,
111
- "system_instruction_sha": null,
112
- "fewshot_as_multiturn": false,
113
- "chat_template": null,
114
- "chat_template_sha": null,
115
- "start_time": 23786.943776673,
116
- "end_time": 23998.958401018,
117
- "total_evaluation_time_seconds": "212.0146243449999"
118
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/hendrycks_ethics_0_shot.json DELETED
@@ -1,307 +0,0 @@
1
- {
2
- "results": {
3
- "ethics_cm": {
4
- "alias": "ethics_cm",
5
- "acc,none": 0.7392535392535392,
6
- "acc_stderr,none": 0.007044761695158352
7
- },
8
- "ethics_deontology": {
9
- "alias": "ethics_deontology",
10
- "acc,none": 0.5786985539488321,
11
- "acc_stderr,none": 0.00823518246369769
12
- },
13
- "ethics_justice": {
14
- "alias": "ethics_justice",
15
- "acc,none": 0.771819526627219,
16
- "acc_stderr,none": 0.00807186884011459
17
- },
18
- "ethics_utilitarianism": {
19
- "alias": "ethics_utilitarianism",
20
- "acc,none": 0.6541181364392679,
21
- "acc_stderr,none": 0.006860486742815242
22
- },
23
- "ethics_virtue": {
24
- "alias": "ethics_virtue",
25
- "acc,none": 0.9147738693467337,
26
- "acc_stderr,none": 0.003959044383441912
27
- }
28
- },
29
- "group_subtasks": {
30
- "ethics_deontology": [],
31
- "ethics_virtue": [],
32
- "ethics_cm": [],
33
- "ethics_utilitarianism": [],
34
- "ethics_justice": []
35
- },
36
- "configs": {
37
- "ethics_cm": {
38
- "task": "ethics_cm",
39
- "tag": [
40
- "hendrycks_ethics"
41
- ],
42
- "dataset_path": "EleutherAI/hendrycks_ethics",
43
- "dataset_name": "commonsense",
44
- "dataset_kwargs": {
45
- "trust_remote_code": true
46
- },
47
- "training_split": "train",
48
- "test_split": "test",
49
- "doc_to_text": "{{input}}\nQuestion: Is this wrong?\nAnswer:",
50
- "doc_to_target": "label",
51
- "doc_to_choice": [
52
- "no",
53
- "yes"
54
- ],
55
- "description": "",
56
- "target_delimiter": " ",
57
- "fewshot_delimiter": "\n\n",
58
- "num_fewshot": 0,
59
- "metric_list": [
60
- {
61
- "metric": "acc"
62
- }
63
- ],
64
- "output_type": "multiple_choice",
65
- "repeats": 1,
66
- "should_decontaminate": false,
67
- "metadata": {
68
- "version": 1.0
69
- }
70
- },
71
- "ethics_deontology": {
72
- "task": "ethics_deontology",
73
- "tag": [
74
- "hendrycks_ethics"
75
- ],
76
- "dataset_path": "EleutherAI/hendrycks_ethics",
77
- "dataset_name": "deontology",
78
- "dataset_kwargs": {
79
- "trust_remote_code": true
80
- },
81
- "training_split": "train",
82
- "test_split": "test",
83
- "doc_to_text": "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}} {{excuse.rstrip()}}\"\nAnswer:",
84
- "doc_to_target": "label",
85
- "doc_to_choice": [
86
- "unreasonable",
87
- "reasonable"
88
- ],
89
- "description": "",
90
- "target_delimiter": " ",
91
- "fewshot_delimiter": "\n\n",
92
- "num_fewshot": 0,
93
- "metric_list": [
94
- {
95
- "metric": "acc"
96
- }
97
- ],
98
- "output_type": "multiple_choice",
99
- "repeats": 1,
100
- "should_decontaminate": false,
101
- "metadata": {
102
- "version": 1.0
103
- }
104
- },
105
- "ethics_justice": {
106
- "task": "ethics_justice",
107
- "tag": [
108
- "hendrycks_ethics"
109
- ],
110
- "dataset_path": "EleutherAI/hendrycks_ethics",
111
- "dataset_name": "justice",
112
- "dataset_kwargs": {
113
- "trust_remote_code": true
114
- },
115
- "training_split": "train",
116
- "test_split": "test",
117
- "doc_to_text": "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:",
118
- "doc_to_target": "label",
119
- "doc_to_choice": [
120
- "unreasonable",
121
- "reasonable"
122
- ],
123
- "description": "",
124
- "target_delimiter": " ",
125
- "fewshot_delimiter": "\n\n",
126
- "num_fewshot": 0,
127
- "metric_list": [
128
- {
129
- "metric": "acc"
130
- }
131
- ],
132
- "output_type": "multiple_choice",
133
- "repeats": 1,
134
- "should_decontaminate": false,
135
- "metadata": {
136
- "version": 1.0
137
- }
138
- },
139
- "ethics_utilitarianism": {
140
- "task": "ethics_utilitarianism",
141
- "tag": [
142
- "hendrycks_ethics"
143
- ],
144
- "dataset_path": "EleutherAI/hendrycks_ethics",
145
- "dataset_name": "utilitarianism",
146
- "dataset_kwargs": {
147
- "trust_remote_code": true
148
- },
149
- "training_split": "train",
150
- "test_split": "test",
151
- "doc_to_text": "def doc_to_text(doc) -> str:\n doc = _preproc_doc(doc)\n return f\"Scenario 1: {doc['scenarios'][0]}\\nScenario 2: {doc['scenarios'][1]}\\nQuestion: Is Scenario 1 preferable?\\nAnswer:\"\n",
152
- "doc_to_target": "def doc_to_target(doc):\n doc = _preproc_doc(doc)\n return doc[\"label\"]\n",
153
- "doc_to_choice": [
154
- "no",
155
- "yes"
156
- ],
157
- "description": "",
158
- "target_delimiter": " ",
159
- "fewshot_delimiter": "\n\n",
160
- "num_fewshot": 0,
161
- "metric_list": [
162
- {
163
- "metric": "acc"
164
- }
165
- ],
166
- "output_type": "multiple_choice",
167
- "repeats": 1,
168
- "should_decontaminate": false,
169
- "metadata": {
170
- "version": 1.0
171
- }
172
- },
173
- "ethics_virtue": {
174
- "task": "ethics_virtue",
175
- "tag": [
176
- "hendrycks_ethics"
177
- ],
178
- "dataset_path": "EleutherAI/hendrycks_ethics",
179
- "dataset_name": "virtue",
180
- "dataset_kwargs": {
181
- "trust_remote_code": true
182
- },
183
- "training_split": "train",
184
- "test_split": "test",
185
- "doc_to_text": "Sentence: {{scenario}}\nQuestion: Does the character in this sentence exhibit the trait \"{{trait}}\"?\nAnswer:",
186
- "doc_to_target": "label",
187
- "doc_to_choice": [
188
- "no",
189
- "yes"
190
- ],
191
- "description": "",
192
- "target_delimiter": " ",
193
- "fewshot_delimiter": "\n\n",
194
- "num_fewshot": 0,
195
- "metric_list": [
196
- {
197
- "metric": "acc"
198
- }
199
- ],
200
- "output_type": "multiple_choice",
201
- "repeats": 1,
202
- "should_decontaminate": false,
203
- "metadata": {
204
- "version": 1.0
205
- }
206
- }
207
- },
208
- "versions": {
209
- "ethics_cm": 1.0,
210
- "ethics_deontology": 1.0,
211
- "ethics_justice": 1.0,
212
- "ethics_utilitarianism": 1.0,
213
- "ethics_virtue": 1.0
214
- },
215
- "n-shot": {
216
- "ethics_cm": 0,
217
- "ethics_deontology": 0,
218
- "ethics_justice": 0,
219
- "ethics_utilitarianism": 0,
220
- "ethics_virtue": 0
221
- },
222
- "higher_is_better": {
223
- "ethics_cm": {
224
- "acc": true
225
- },
226
- "ethics_deontology": {
227
- "acc": true
228
- },
229
- "ethics_justice": {
230
- "acc": true
231
- },
232
- "ethics_utilitarianism": {
233
- "acc": true
234
- },
235
- "ethics_virtue": {
236
- "acc": true
237
- }
238
- },
239
- "n-samples": {
240
- "ethics_justice": {
241
- "original": 2704,
242
- "effective": 2704
243
- },
244
- "ethics_utilitarianism": {
245
- "original": 4808,
246
- "effective": 4808
247
- },
248
- "ethics_cm": {
249
- "original": 3885,
250
- "effective": 3885
251
- },
252
- "ethics_virtue": {
253
- "original": 4975,
254
- "effective": 4975
255
- },
256
- "ethics_deontology": {
257
- "original": 3596,
258
- "effective": 3596
259
- }
260
- },
261
- "config": {
262
- "model": "vllm",
263
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
264
- "batch_size": 1,
265
- "batch_sizes": [],
266
- "device": null,
267
- "use_cache": null,
268
- "limit": null,
269
- "bootstrap_iters": 100000,
270
- "gen_kwargs": null,
271
- "random_seed": 0,
272
- "numpy_seed": 1234,
273
- "torch_seed": 1234,
274
- "fewshot_seed": 1234
275
- },
276
- "git_hash": "8e1bd48d",
277
- "date": 1735957382.509422,
278
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
279
- "transformers_version": "4.47.1",
280
- "upper_git_hash": null,
281
- "tokenizer_pad_token": [
282
- "<unk>",
283
- "0"
284
- ],
285
- "tokenizer_eos_token": [
286
- "</s>",
287
- "2"
288
- ],
289
- "tokenizer_bos_token": [
290
- "<s>",
291
- "1"
292
- ],
293
- "eot_token_id": 2,
294
- "max_length": 4096,
295
- "task_hashes": {},
296
- "model_source": "vllm",
297
- "model_name": "/ALLaM-7B-Instruct",
298
- "model_name_sanitized": "/ALLaM-7B-Instruct",
299
- "system_instruction": null,
300
- "system_instruction_sha": null,
301
- "fewshot_as_multiturn": false,
302
- "chat_template": null,
303
- "chat_template_sha": null,
304
- "start_time": 24051.95882374,
305
- "end_time": 24251.353762318,
306
- "total_evaluation_time_seconds": "199.3949385779997"
307
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/ifeval_0_shot.json DELETED
@@ -1,132 +0,0 @@
1
- {
2
- "results": {
3
- "ifeval": {
4
- "alias": "ifeval",
5
- "prompt_level_strict_acc,none": 0.37707948243992606,
6
- "prompt_level_strict_acc_stderr,none": 0.020856233918528456,
7
- "inst_level_strict_acc,none": 0.486810551558753,
8
- "inst_level_strict_acc_stderr,none": "N/A",
9
- "prompt_level_loose_acc,none": 0.41404805914972276,
10
- "prompt_level_loose_acc_stderr,none": 0.021196272552471213,
11
- "inst_level_loose_acc,none": 0.5239808153477218,
12
- "inst_level_loose_acc_stderr,none": "N/A"
13
- }
14
- },
15
- "group_subtasks": {
16
- "ifeval": []
17
- },
18
- "configs": {
19
- "ifeval": {
20
- "task": "ifeval",
21
- "dataset_path": "google/IFEval",
22
- "test_split": "train",
23
- "doc_to_text": "prompt",
24
- "doc_to_target": 0,
25
- "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
26
- "description": "",
27
- "target_delimiter": " ",
28
- "fewshot_delimiter": "\n\n",
29
- "num_fewshot": 0,
30
- "metric_list": [
31
- {
32
- "metric": "prompt_level_strict_acc",
33
- "aggregation": "mean",
34
- "higher_is_better": true
35
- },
36
- {
37
- "metric": "inst_level_strict_acc",
38
- "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
39
- "higher_is_better": true
40
- },
41
- {
42
- "metric": "prompt_level_loose_acc",
43
- "aggregation": "mean",
44
- "higher_is_better": true
45
- },
46
- {
47
- "metric": "inst_level_loose_acc",
48
- "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
49
- "higher_is_better": true
50
- }
51
- ],
52
- "output_type": "generate_until",
53
- "generation_kwargs": {
54
- "until": [],
55
- "do_sample": false,
56
- "temperature": 0.0,
57
- "max_gen_toks": 1280
58
- },
59
- "repeats": 1,
60
- "should_decontaminate": false,
61
- "metadata": {
62
- "version": 4.0
63
- }
64
- }
65
- },
66
- "versions": {
67
- "ifeval": 4.0
68
- },
69
- "n-shot": {
70
- "ifeval": 0
71
- },
72
- "higher_is_better": {
73
- "ifeval": {
74
- "prompt_level_strict_acc": true,
75
- "inst_level_strict_acc": true,
76
- "prompt_level_loose_acc": true,
77
- "inst_level_loose_acc": true
78
- }
79
- },
80
- "n-samples": {
81
- "ifeval": {
82
- "original": 541,
83
- "effective": 541
84
- }
85
- },
86
- "config": {
87
- "model": "vllm",
88
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
89
- "batch_size": 1,
90
- "batch_sizes": [],
91
- "device": null,
92
- "use_cache": null,
93
- "limit": null,
94
- "bootstrap_iters": 100000,
95
- "gen_kwargs": null,
96
- "random_seed": 0,
97
- "numpy_seed": 1234,
98
- "torch_seed": 1234,
99
- "fewshot_seed": 1234
100
- },
101
- "git_hash": "8e1bd48d",
102
- "date": 1735955103.211484,
103
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
104
- "transformers_version": "4.47.1",
105
- "upper_git_hash": null,
106
- "tokenizer_pad_token": [
107
- "<unk>",
108
- "0"
109
- ],
110
- "tokenizer_eos_token": [
111
- "</s>",
112
- "2"
113
- ],
114
- "tokenizer_bos_token": [
115
- "<s>",
116
- "1"
117
- ],
118
- "eot_token_id": 2,
119
- "max_length": 4096,
120
- "task_hashes": {},
121
- "model_source": "vllm",
122
- "model_name": "/ALLaM-7B-Instruct",
123
- "model_name_sanitized": "/ALLaM-7B-Instruct",
124
- "system_instruction": null,
125
- "system_instruction_sha": null,
126
- "fewshot_as_multiturn": false,
127
- "chat_template": null,
128
- "chat_template_sha": null,
129
- "start_time": 21772.672146886,
130
- "end_time": 21897.362057308,
131
- "total_evaluation_time_seconds": "124.68991042199923"
132
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/minerva_math_4_shot.json DELETED
@@ -1,525 +0,0 @@
1
- {
2
- "results": {
3
- "minerva_math": {
4
- "exact_match,none": 0.1742,
5
- "exact_match_stderr,none": 0.005167735460596966,
6
- "alias": "minerva_math"
7
- },
8
- "minerva_math_algebra": {
9
- "alias": " - minerva_math_algebra",
10
- "exact_match,none": 0.2443133951137321,
11
- "exact_match_stderr,none": 0.012476769647814658
12
- },
13
- "minerva_math_counting_and_prob": {
14
- "alias": " - minerva_math_counting_and_prob",
15
- "exact_match,none": 0.16666666666666666,
16
- "exact_match_stderr,none": 0.01713575252401387
17
- },
18
- "minerva_math_geometry": {
19
- "alias": " - minerva_math_geometry",
20
- "exact_match,none": 0.11899791231732777,
21
- "exact_match_stderr,none": 0.014809629428535889
22
- },
23
- "minerva_math_intermediate_algebra": {
24
- "alias": " - minerva_math_intermediate_algebra",
25
- "exact_match,none": 0.058693244739756366,
26
- "exact_match_stderr,none": 0.00782629796703524
27
- },
28
- "minerva_math_num_theory": {
29
- "alias": " - minerva_math_num_theory",
30
- "exact_match,none": 0.11481481481481481,
31
- "exact_match_stderr,none": 0.013731616019404622
32
- },
33
- "minerva_math_prealgebra": {
34
- "alias": " - minerva_math_prealgebra",
35
- "exact_match,none": 0.3409873708381171,
36
- "exact_match_stderr,none": 0.016071499145682847
37
- },
38
- "minerva_math_precalc": {
39
- "alias": " - minerva_math_precalc",
40
- "exact_match,none": 0.06043956043956044,
41
- "exact_match_stderr,none": 0.010207626216646911
42
- }
43
- },
44
- "groups": {
45
- "minerva_math": {
46
- "exact_match,none": 0.1742,
47
- "exact_match_stderr,none": 0.005167735460596966,
48
- "alias": "minerva_math"
49
- }
50
- },
51
- "group_subtasks": {
52
- "minerva_math": [
53
- "minerva_math_algebra",
54
- "minerva_math_counting_and_prob",
55
- "minerva_math_geometry",
56
- "minerva_math_intermediate_algebra",
57
- "minerva_math_num_theory",
58
- "minerva_math_prealgebra",
59
- "minerva_math_precalc"
60
- ]
61
- },
62
- "configs": {
63
- "minerva_math_algebra": {
64
- "task": "minerva_math_algebra",
65
- "tag": [
66
- "math_word_problems"
67
- ],
68
- "group": [
69
- "math_word_problems"
70
- ],
71
- "dataset_path": "EleutherAI/hendrycks_math",
72
- "dataset_name": "algebra",
73
- "dataset_kwargs": {
74
- "trust_remote_code": true
75
- },
76
- "training_split": "train",
77
- "test_split": "test",
78
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
79
- "doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
80
- "doc_to_target": "{{answer if few_shot is undefined else solution}}",
81
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"exact_match\": retval,\n }\n return results\n",
82
- "description": "",
83
- "target_delimiter": " ",
84
- "fewshot_delimiter": "\n\n",
85
- "fewshot_config": {
86
- "sampler": "first_n",
87
- "samples": "<function list_fewshot_samples at 0x146d9c03c820>"
88
- },
89
- "num_fewshot": 4,
90
- "metric_list": [
91
- {
92
- "metric": "exact_match",
93
- "aggregation": "mean",
94
- "higher_is_better": true
95
- }
96
- ],
97
- "output_type": "generate_until",
98
- "generation_kwargs": {
99
- "until": [
100
- "Problem:"
101
- ],
102
- "do_sample": false,
103
- "temperature": 0.0
104
- },
105
- "repeats": 1,
106
- "should_decontaminate": false,
107
- "metadata": {
108
- "version": 1.0
109
- }
110
- },
111
- "minerva_math_counting_and_prob": {
112
- "task": "minerva_math_counting_and_prob",
113
- "tag": [
114
- "math_word_problems"
115
- ],
116
- "group": [
117
- "math_word_problems"
118
- ],
119
- "dataset_path": "EleutherAI/hendrycks_math",
120
- "dataset_name": "counting_and_probability",
121
- "dataset_kwargs": {
122
- "trust_remote_code": true
123
- },
124
- "training_split": "train",
125
- "test_split": "test",
126
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
127
- "doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
128
- "doc_to_target": "{{answer if few_shot is undefined else solution}}",
129
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"exact_match\": retval,\n }\n return results\n",
130
- "description": "",
131
- "target_delimiter": " ",
132
- "fewshot_delimiter": "\n\n",
133
- "fewshot_config": {
134
- "sampler": "first_n",
135
- "samples": "<function list_fewshot_samples at 0x146d9c04e830>"
136
- },
137
- "num_fewshot": 4,
138
- "metric_list": [
139
- {
140
- "metric": "exact_match",
141
- "aggregation": "mean",
142
- "higher_is_better": true
143
- }
144
- ],
145
- "output_type": "generate_until",
146
- "generation_kwargs": {
147
- "until": [
148
- "Problem:"
149
- ],
150
- "do_sample": false,
151
- "temperature": 0.0
152
- },
153
- "repeats": 1,
154
- "should_decontaminate": false,
155
- "metadata": {
156
- "version": 1.0
157
- }
158
- },
159
- "minerva_math_geometry": {
160
- "task": "minerva_math_geometry",
161
- "tag": [
162
- "math_word_problems"
163
- ],
164
- "group": [
165
- "math_word_problems"
166
- ],
167
- "dataset_path": "EleutherAI/hendrycks_math",
168
- "dataset_name": "geometry",
169
- "dataset_kwargs": {
170
- "trust_remote_code": true
171
- },
172
- "training_split": "train",
173
- "test_split": "test",
174
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
175
- "doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
176
- "doc_to_target": "{{answer if few_shot is undefined else solution}}",
177
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"exact_match\": retval,\n }\n return results\n",
178
- "description": "",
179
- "target_delimiter": " ",
180
- "fewshot_delimiter": "\n\n",
181
- "fewshot_config": {
182
- "sampler": "first_n",
183
- "samples": "<function list_fewshot_samples at 0x146d9c04c1f0>"
184
- },
185
- "num_fewshot": 4,
186
- "metric_list": [
187
- {
188
- "metric": "exact_match",
189
- "aggregation": "mean",
190
- "higher_is_better": true
191
- }
192
- ],
193
- "output_type": "generate_until",
194
- "generation_kwargs": {
195
- "until": [
196
- "Problem:"
197
- ],
198
- "do_sample": false,
199
- "temperature": 0.0
200
- },
201
- "repeats": 1,
202
- "should_decontaminate": false,
203
- "metadata": {
204
- "version": 1.0
205
- }
206
- },
207
- "minerva_math_intermediate_algebra": {
208
- "task": "minerva_math_intermediate_algebra",
209
- "tag": [
210
- "math_word_problems"
211
- ],
212
- "group": [
213
- "math_word_problems"
214
- ],
215
- "dataset_path": "EleutherAI/hendrycks_math",
216
- "dataset_name": "intermediate_algebra",
217
- "dataset_kwargs": {
218
- "trust_remote_code": true
219
- },
220
- "training_split": "train",
221
- "test_split": "test",
222
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
223
- "doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
224
- "doc_to_target": "{{answer if few_shot is undefined else solution}}",
225
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"exact_match\": retval,\n }\n return results\n",
226
- "description": "",
227
- "target_delimiter": " ",
228
- "fewshot_delimiter": "\n\n",
229
- "fewshot_config": {
230
- "sampler": "first_n",
231
- "samples": "<function list_fewshot_samples at 0x146d9c0eecb0>"
232
- },
233
- "num_fewshot": 4,
234
- "metric_list": [
235
- {
236
- "metric": "exact_match",
237
- "aggregation": "mean",
238
- "higher_is_better": true
239
- }
240
- ],
241
- "output_type": "generate_until",
242
- "generation_kwargs": {
243
- "until": [
244
- "Problem:"
245
- ],
246
- "do_sample": false,
247
- "temperature": 0.0
248
- },
249
- "repeats": 1,
250
- "should_decontaminate": false,
251
- "metadata": {
252
- "version": 1.0
253
- }
254
- },
255
- "minerva_math_num_theory": {
256
- "task": "minerva_math_num_theory",
257
- "tag": [
258
- "math_word_problems"
259
- ],
260
- "group": [
261
- "math_word_problems"
262
- ],
263
- "dataset_path": "EleutherAI/hendrycks_math",
264
- "dataset_name": "number_theory",
265
- "dataset_kwargs": {
266
- "trust_remote_code": true
267
- },
268
- "training_split": "train",
269
- "test_split": "test",
270
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
271
- "doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
272
- "doc_to_target": "{{answer if few_shot is undefined else solution}}",
273
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"exact_match\": retval,\n }\n return results\n",
274
- "description": "",
275
- "target_delimiter": " ",
276
- "fewshot_delimiter": "\n\n",
277
- "fewshot_config": {
278
- "sampler": "first_n",
279
- "samples": "<function list_fewshot_samples at 0x146d9c0ec040>"
280
- },
281
- "num_fewshot": 4,
282
- "metric_list": [
283
- {
284
- "metric": "exact_match",
285
- "aggregation": "mean",
286
- "higher_is_better": true
287
- }
288
- ],
289
- "output_type": "generate_until",
290
- "generation_kwargs": {
291
- "until": [
292
- "Problem:"
293
- ],
294
- "do_sample": false,
295
- "temperature": 0.0
296
- },
297
- "repeats": 1,
298
- "should_decontaminate": false,
299
- "metadata": {
300
- "version": 1.0
301
- }
302
- },
303
- "minerva_math_prealgebra": {
304
- "task": "minerva_math_prealgebra",
305
- "tag": [
306
- "math_word_problems"
307
- ],
308
- "group": [
309
- "math_word_problems"
310
- ],
311
- "dataset_path": "EleutherAI/hendrycks_math",
312
- "dataset_name": "prealgebra",
313
- "dataset_kwargs": {
314
- "trust_remote_code": true
315
- },
316
- "training_split": "train",
317
- "test_split": "test",
318
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
319
- "doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
320
- "doc_to_target": "{{answer if few_shot is undefined else solution}}",
321
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"exact_match\": retval,\n }\n return results\n",
322
- "description": "",
323
- "target_delimiter": " ",
324
- "fewshot_delimiter": "\n\n",
325
- "fewshot_config": {
326
- "sampler": "first_n",
327
- "samples": "<function list_fewshot_samples at 0x146d996368c0>"
328
- },
329
- "num_fewshot": 4,
330
- "metric_list": [
331
- {
332
- "metric": "exact_match",
333
- "aggregation": "mean",
334
- "higher_is_better": true
335
- }
336
- ],
337
- "output_type": "generate_until",
338
- "generation_kwargs": {
339
- "until": [
340
- "Problem:"
341
- ],
342
- "do_sample": false,
343
- "temperature": 0.0
344
- },
345
- "repeats": 1,
346
- "should_decontaminate": false,
347
- "metadata": {
348
- "version": 1.0
349
- }
350
- },
351
- "minerva_math_precalc": {
352
- "task": "minerva_math_precalc",
353
- "tag": [
354
- "math_word_problems"
355
- ],
356
- "group": [
357
- "math_word_problems"
358
- ],
359
- "dataset_path": "EleutherAI/hendrycks_math",
360
- "dataset_name": "precalculus",
361
- "dataset_kwargs": {
362
- "trust_remote_code": true
363
- },
364
- "training_split": "train",
365
- "test_split": "test",
366
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc: dict) -> dict:\n out_doc = {\n \"problem\": doc[\"problem\"],\n \"solution\": doc[\"solution\"],\n \"answer\": normalize_final_answer(\n remove_boxed(last_boxed_only_string(doc[\"solution\"]))\n ),\n }\n if getattr(doc, \"few_shot\", None) is not None:\n out_doc[\"few_shot\"] = True\n return out_doc\n\n return dataset.map(_process_doc)\n",
367
- "doc_to_text": "def doc_to_text(doc: dict) -> str:\n return \"Problem:\" + \"\\n\" + doc[\"problem\"] + \"\\n\\n\" + \"Solution:\"\n",
368
- "doc_to_target": "{{answer if few_shot is undefined else solution}}",
369
- "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n unnormalized_answer = get_unnormalized_answer(candidates)\n answer = normalize_final_answer(unnormalized_answer)\n\n if is_equiv(answer, doc[\"answer\"]):\n retval = 1\n else:\n retval = 0\n\n results = {\n \"exact_match\": retval,\n }\n return results\n",
370
- "description": "",
371
- "target_delimiter": " ",
372
- "fewshot_delimiter": "\n\n",
373
- "fewshot_config": {
374
- "sampler": "first_n",
375
- "samples": "<function list_fewshot_samples at 0x146d995cb490>"
376
- },
377
- "num_fewshot": 4,
378
- "metric_list": [
379
- {
380
- "metric": "exact_match",
381
- "aggregation": "mean",
382
- "higher_is_better": true
383
- }
384
- ],
385
- "output_type": "generate_until",
386
- "generation_kwargs": {
387
- "until": [
388
- "Problem:"
389
- ],
390
- "do_sample": false,
391
- "temperature": 0.0
392
- },
393
- "repeats": 1,
394
- "should_decontaminate": false,
395
- "metadata": {
396
- "version": 1.0
397
- }
398
- }
399
- },
400
- "versions": {
401
- "minerva_math": 1.0,
402
- "minerva_math_algebra": 1.0,
403
- "minerva_math_counting_and_prob": 1.0,
404
- "minerva_math_geometry": 1.0,
405
- "minerva_math_intermediate_algebra": 1.0,
406
- "minerva_math_num_theory": 1.0,
407
- "minerva_math_prealgebra": 1.0,
408
- "minerva_math_precalc": 1.0
409
- },
410
- "n-shot": {
411
- "minerva_math_algebra": 4,
412
- "minerva_math_counting_and_prob": 4,
413
- "minerva_math_geometry": 4,
414
- "minerva_math_intermediate_algebra": 4,
415
- "minerva_math_num_theory": 4,
416
- "minerva_math_prealgebra": 4,
417
- "minerva_math_precalc": 4
418
- },
419
- "higher_is_better": {
420
- "minerva_math": {
421
- "exact_match": true
422
- },
423
- "minerva_math_algebra": {
424
- "exact_match": true
425
- },
426
- "minerva_math_counting_and_prob": {
427
- "exact_match": true
428
- },
429
- "minerva_math_geometry": {
430
- "exact_match": true
431
- },
432
- "minerva_math_intermediate_algebra": {
433
- "exact_match": true
434
- },
435
- "minerva_math_num_theory": {
436
- "exact_match": true
437
- },
438
- "minerva_math_prealgebra": {
439
- "exact_match": true
440
- },
441
- "minerva_math_precalc": {
442
- "exact_match": true
443
- }
444
- },
445
- "n-samples": {
446
- "minerva_math_algebra": {
447
- "original": 1187,
448
- "effective": 1187
449
- },
450
- "minerva_math_counting_and_prob": {
451
- "original": 474,
452
- "effective": 474
453
- },
454
- "minerva_math_geometry": {
455
- "original": 479,
456
- "effective": 479
457
- },
458
- "minerva_math_intermediate_algebra": {
459
- "original": 903,
460
- "effective": 903
461
- },
462
- "minerva_math_num_theory": {
463
- "original": 540,
464
- "effective": 540
465
- },
466
- "minerva_math_prealgebra": {
467
- "original": 871,
468
- "effective": 871
469
- },
470
- "minerva_math_precalc": {
471
- "original": 546,
472
- "effective": 546
473
- }
474
- },
475
- "config": {
476
- "model": "hf",
477
- "model_args": "pretrained=/ALLaM-7B-Instruct,trust_remote_code=True,cache_dir=/tmp,parallelize=True",
478
- "model_num_parameters": 7000559616,
479
- "model_dtype": "torch.bfloat16",
480
- "model_revision": "main",
481
- "model_sha": "",
482
- "batch_size": "auto",
483
- "batch_sizes": [],
484
- "device": null,
485
- "use_cache": null,
486
- "limit": null,
487
- "bootstrap_iters": 100000,
488
- "gen_kwargs": null,
489
- "random_seed": 0,
490
- "numpy_seed": 1234,
491
- "torch_seed": 1234,
492
- "fewshot_seed": 1234
493
- },
494
- "git_hash": "8e1bd48d",
495
- "date": 1735683439.646248,
496
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.88\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
497
- "transformers_version": "4.47.1",
498
- "upper_git_hash": null,
499
- "tokenizer_pad_token": [
500
- "<unk>",
501
- "0"
502
- ],
503
- "tokenizer_eos_token": [
504
- "</s>",
505
- "2"
506
- ],
507
- "tokenizer_bos_token": [
508
- "<s>",
509
- "1"
510
- ],
511
- "eot_token_id": 2,
512
- "max_length": 4096,
513
- "task_hashes": {},
514
- "model_source": "hf",
515
- "model_name": "/ALLaM-7B-Instruct",
516
- "model_name_sanitized": "/ALLaM-7B-Instruct",
517
- "system_instruction": null,
518
- "system_instruction_sha": null,
519
- "fewshot_as_multiturn": false,
520
- "chat_template": null,
521
- "chat_template_sha": null,
522
- "start_time": 29617.613485255,
523
- "end_time": 33957.45925665,
524
- "total_evaluation_time_seconds": "4339.845771395001"
525
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/mmlu_0_shot.json DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/en/mmlu_pro_5_shot.json DELETED
@@ -1,1088 +0,0 @@
1
- {
2
- "results": {
3
- "mmlu_pro": {
4
- "exact_match,custom-extract": 0.3042719414893617,
5
- "exact_match_stderr,custom-extract": 0.00404763190810295,
6
- "alias": "mmlu_pro"
7
- },
8
- "mmlu_pro_biology": {
9
- "alias": " - biology",
10
- "exact_match,custom-extract": 0.5788005578800558,
11
- "exact_match_stderr,custom-extract": 0.01845235719744687
12
- },
13
- "mmlu_pro_business": {
14
- "alias": " - business",
15
- "exact_match,custom-extract": 0.2915082382762991,
16
- "exact_match_stderr,custom-extract": 0.016189361099463357
17
- },
18
- "mmlu_pro_chemistry": {
19
- "alias": " - chemistry",
20
- "exact_match,custom-extract": 0.14752650176678445,
21
- "exact_match_stderr,custom-extract": 0.010544941212928488
22
- },
23
- "mmlu_pro_computer_science": {
24
- "alias": " - computer_science",
25
- "exact_match,custom-extract": 0.2975609756097561,
26
- "exact_match_stderr,custom-extract": 0.022606360476532427
27
- },
28
- "mmlu_pro_economics": {
29
- "alias": " - economics",
30
- "exact_match,custom-extract": 0.44549763033175355,
31
- "exact_match_stderr,custom-extract": 0.017118299286531986
32
- },
33
- "mmlu_pro_engineering": {
34
- "alias": " - engineering",
35
- "exact_match,custom-extract": 0.17337461300309598,
36
- "exact_match_stderr,custom-extract": 0.012167726609185038
37
- },
38
- "mmlu_pro_health": {
39
- "alias": " - health",
40
- "exact_match,custom-extract": 0.3753056234718826,
41
- "exact_match_stderr,custom-extract": 0.0169400741062406
42
- },
43
- "mmlu_pro_history": {
44
- "alias": " - history",
45
- "exact_match,custom-extract": 0.3438320209973753,
46
- "exact_match_stderr,custom-extract": 0.024366260232577264
47
- },
48
- "mmlu_pro_law": {
49
- "alias": " - law",
50
- "exact_match,custom-extract": 0.21525885558583105,
51
- "exact_match_stderr,custom-extract": 0.012392170573599742
52
- },
53
- "mmlu_pro_math": {
54
- "alias": " - math",
55
- "exact_match,custom-extract": 0.26350851221317545,
56
- "exact_match_stderr,custom-extract": 0.011989865356312482
57
- },
58
- "mmlu_pro_other": {
59
- "alias": " - other",
60
- "exact_match,custom-extract": 0.38203463203463206,
61
- "exact_match_stderr,custom-extract": 0.015993097507618206
62
- },
63
- "mmlu_pro_philosophy": {
64
- "alias": " - philosophy",
65
- "exact_match,custom-extract": 0.2865731462925852,
66
- "exact_match_stderr,custom-extract": 0.02026178957298461
67
- },
68
- "mmlu_pro_physics": {
69
- "alias": " - physics",
70
- "exact_match,custom-extract": 0.20323325635103925,
71
- "exact_match_stderr,custom-extract": 0.01116929190053331
72
- },
73
- "mmlu_pro_psychology": {
74
- "alias": " - psychology",
75
- "exact_match,custom-extract": 0.49122807017543857,
76
- "exact_match_stderr,custom-extract": 0.017708182870812612
77
- }
78
- },
79
- "groups": {
80
- "mmlu_pro": {
81
- "exact_match,custom-extract": 0.3042719414893617,
82
- "exact_match_stderr,custom-extract": 0.00404763190810295,
83
- "alias": "mmlu_pro"
84
- }
85
- },
86
- "group_subtasks": {
87
- "mmlu_pro": [
88
- "mmlu_pro_biology",
89
- "mmlu_pro_business",
90
- "mmlu_pro_chemistry",
91
- "mmlu_pro_computer_science",
92
- "mmlu_pro_economics",
93
- "mmlu_pro_engineering",
94
- "mmlu_pro_health",
95
- "mmlu_pro_history",
96
- "mmlu_pro_law",
97
- "mmlu_pro_math",
98
- "mmlu_pro_other",
99
- "mmlu_pro_philosophy",
100
- "mmlu_pro_physics",
101
- "mmlu_pro_psychology"
102
- ]
103
- },
104
- "configs": {
105
- "mmlu_pro_biology": {
106
- "task": "mmlu_pro_biology",
107
- "task_alias": "biology",
108
- "dataset_path": "TIGER-Lab/MMLU-Pro",
109
- "test_split": "test",
110
- "fewshot_split": "validation",
111
- "process_docs": "functools.partial(<function process_docs at 0x14541d3696c0>, subject='biology')",
112
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541d36a710>, including_answer=False)",
113
- "doc_to_target": "answer",
114
- "description": "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
115
- "target_delimiter": " ",
116
- "fewshot_delimiter": "\n\n",
117
- "fewshot_config": {
118
- "sampler": "first_n",
119
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541d369240>, including_answer=True)",
120
- "doc_to_target": ""
121
- },
122
- "num_fewshot": 5,
123
- "metric_list": [
124
- {
125
- "metric": "exact_match",
126
- "aggregation": "mean",
127
- "higher_is_better": true,
128
- "ignore_case": true,
129
- "ignore_punctuation": true
130
- }
131
- ],
132
- "output_type": "generate_until",
133
- "generation_kwargs": {
134
- "until": [
135
- "</s>",
136
- "Q:",
137
- "<|im_end|>"
138
- ],
139
- "do_sample": false,
140
- "temperature": 0.0
141
- },
142
- "repeats": 1,
143
- "filter_list": [
144
- {
145
- "name": "custom-extract",
146
- "filter": [
147
- {
148
- "function": "regex",
149
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
150
- },
151
- {
152
- "function": "take_first"
153
- }
154
- ]
155
- }
156
- ],
157
- "should_decontaminate": false,
158
- "metadata": {
159
- "version": 1.0
160
- }
161
- },
162
- "mmlu_pro_business": {
163
- "task": "mmlu_pro_business",
164
- "task_alias": "business",
165
- "dataset_path": "TIGER-Lab/MMLU-Pro",
166
- "test_split": "test",
167
- "fewshot_split": "validation",
168
- "process_docs": "functools.partial(<function process_docs at 0x14541d3683a0>, subject='business')",
169
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541d369d80>, including_answer=False)",
170
- "doc_to_target": "answer",
171
- "description": "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
172
- "target_delimiter": " ",
173
- "fewshot_delimiter": "\n\n",
174
- "fewshot_config": {
175
- "sampler": "first_n",
176
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541d36b910>, including_answer=True)",
177
- "doc_to_target": ""
178
- },
179
- "num_fewshot": 5,
180
- "metric_list": [
181
- {
182
- "metric": "exact_match",
183
- "aggregation": "mean",
184
- "higher_is_better": true,
185
- "ignore_case": true,
186
- "ignore_punctuation": true
187
- }
188
- ],
189
- "output_type": "generate_until",
190
- "generation_kwargs": {
191
- "until": [
192
- "</s>",
193
- "Q:",
194
- "<|im_end|>"
195
- ],
196
- "do_sample": false,
197
- "temperature": 0.0
198
- },
199
- "repeats": 1,
200
- "filter_list": [
201
- {
202
- "name": "custom-extract",
203
- "filter": [
204
- {
205
- "function": "regex",
206
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
207
- },
208
- {
209
- "function": "take_first"
210
- }
211
- ]
212
- }
213
- ],
214
- "should_decontaminate": false,
215
- "metadata": {
216
- "version": 1.0
217
- }
218
- },
219
- "mmlu_pro_chemistry": {
220
- "task": "mmlu_pro_chemistry",
221
- "task_alias": "chemistry",
222
- "dataset_path": "TIGER-Lab/MMLU-Pro",
223
- "test_split": "test",
224
- "fewshot_split": "validation",
225
- "process_docs": "functools.partial(<function process_docs at 0x14541d3681f0>, subject='chemistry')",
226
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541d36a200>, including_answer=False)",
227
- "doc_to_target": "answer",
228
- "description": "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
229
- "target_delimiter": " ",
230
- "fewshot_delimiter": "\n\n",
231
- "fewshot_config": {
232
- "sampler": "first_n",
233
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541d369900>, including_answer=True)",
234
- "doc_to_target": ""
235
- },
236
- "num_fewshot": 5,
237
- "metric_list": [
238
- {
239
- "metric": "exact_match",
240
- "aggregation": "mean",
241
- "higher_is_better": true,
242
- "ignore_case": true,
243
- "ignore_punctuation": true
244
- }
245
- ],
246
- "output_type": "generate_until",
247
- "generation_kwargs": {
248
- "until": [
249
- "</s>",
250
- "Q:",
251
- "<|im_end|>"
252
- ],
253
- "do_sample": false,
254
- "temperature": 0.0
255
- },
256
- "repeats": 1,
257
- "filter_list": [
258
- {
259
- "name": "custom-extract",
260
- "filter": [
261
- {
262
- "function": "regex",
263
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
264
- },
265
- {
266
- "function": "take_first"
267
- }
268
- ]
269
- }
270
- ],
271
- "should_decontaminate": false,
272
- "metadata": {
273
- "version": 1.0
274
- }
275
- },
276
- "mmlu_pro_computer_science": {
277
- "task": "mmlu_pro_computer_science",
278
- "task_alias": "computer_science",
279
- "dataset_path": "TIGER-Lab/MMLU-Pro",
280
- "test_split": "test",
281
- "fewshot_split": "validation",
282
- "process_docs": "functools.partial(<function process_docs at 0x14541d368040>, subject='computer science')",
283
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541d3680d0>, including_answer=False)",
284
- "doc_to_target": "answer",
285
- "description": "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
286
- "target_delimiter": " ",
287
- "fewshot_delimiter": "\n\n",
288
- "fewshot_config": {
289
- "sampler": "first_n",
290
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541d368dc0>, including_answer=True)",
291
- "doc_to_target": ""
292
- },
293
- "num_fewshot": 5,
294
- "metric_list": [
295
- {
296
- "metric": "exact_match",
297
- "aggregation": "mean",
298
- "higher_is_better": true,
299
- "ignore_case": true,
300
- "ignore_punctuation": true
301
- }
302
- ],
303
- "output_type": "generate_until",
304
- "generation_kwargs": {
305
- "until": [
306
- "</s>",
307
- "Q:",
308
- "<|im_end|>"
309
- ],
310
- "do_sample": false,
311
- "temperature": 0.0
312
- },
313
- "repeats": 1,
314
- "filter_list": [
315
- {
316
- "name": "custom-extract",
317
- "filter": [
318
- {
319
- "function": "regex",
320
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
321
- },
322
- {
323
- "function": "take_first"
324
- }
325
- ]
326
- }
327
- ],
328
- "should_decontaminate": false,
329
- "metadata": {
330
- "version": 1.0
331
- }
332
- },
333
- "mmlu_pro_economics": {
334
- "task": "mmlu_pro_economics",
335
- "task_alias": "economics",
336
- "dataset_path": "TIGER-Lab/MMLU-Pro",
337
- "test_split": "test",
338
- "fewshot_split": "validation",
339
- "process_docs": "functools.partial(<function process_docs at 0x14541cf66f80>, subject='economics')",
340
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf66830>, including_answer=False)",
341
- "doc_to_target": "answer",
342
- "description": "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
343
- "target_delimiter": " ",
344
- "fewshot_delimiter": "\n\n",
345
- "fewshot_config": {
346
- "sampler": "first_n",
347
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf66b00>, including_answer=True)",
348
- "doc_to_target": ""
349
- },
350
- "num_fewshot": 5,
351
- "metric_list": [
352
- {
353
- "metric": "exact_match",
354
- "aggregation": "mean",
355
- "higher_is_better": true,
356
- "ignore_case": true,
357
- "ignore_punctuation": true
358
- }
359
- ],
360
- "output_type": "generate_until",
361
- "generation_kwargs": {
362
- "until": [
363
- "</s>",
364
- "Q:",
365
- "<|im_end|>"
366
- ],
367
- "do_sample": false,
368
- "temperature": 0.0
369
- },
370
- "repeats": 1,
371
- "filter_list": [
372
- {
373
- "name": "custom-extract",
374
- "filter": [
375
- {
376
- "function": "regex",
377
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
378
- },
379
- {
380
- "function": "take_first"
381
- }
382
- ]
383
- }
384
- ],
385
- "should_decontaminate": false,
386
- "metadata": {
387
- "version": 1.0
388
- }
389
- },
390
- "mmlu_pro_engineering": {
391
- "task": "mmlu_pro_engineering",
392
- "task_alias": "engineering",
393
- "dataset_path": "TIGER-Lab/MMLU-Pro",
394
- "test_split": "test",
395
- "fewshot_split": "validation",
396
- "process_docs": "functools.partial(<function process_docs at 0x14541cf641f0>, subject='engineering')",
397
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf653f0>, including_answer=False)",
398
- "doc_to_target": "answer",
399
- "description": "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
400
- "target_delimiter": " ",
401
- "fewshot_delimiter": "\n\n",
402
- "fewshot_config": {
403
- "sampler": "first_n",
404
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf67f40>, including_answer=True)",
405
- "doc_to_target": ""
406
- },
407
- "num_fewshot": 5,
408
- "metric_list": [
409
- {
410
- "metric": "exact_match",
411
- "aggregation": "mean",
412
- "higher_is_better": true,
413
- "ignore_case": true,
414
- "ignore_punctuation": true
415
- }
416
- ],
417
- "output_type": "generate_until",
418
- "generation_kwargs": {
419
- "until": [
420
- "</s>",
421
- "Q:",
422
- "<|im_end|>"
423
- ],
424
- "do_sample": false,
425
- "temperature": 0.0
426
- },
427
- "repeats": 1,
428
- "filter_list": [
429
- {
430
- "name": "custom-extract",
431
- "filter": [
432
- {
433
- "function": "regex",
434
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
435
- },
436
- {
437
- "function": "take_first"
438
- }
439
- ]
440
- }
441
- ],
442
- "should_decontaminate": false,
443
- "metadata": {
444
- "version": 1.0
445
- }
446
- },
447
- "mmlu_pro_health": {
448
- "task": "mmlu_pro_health",
449
- "task_alias": "health",
450
- "dataset_path": "TIGER-Lab/MMLU-Pro",
451
- "test_split": "test",
452
- "fewshot_split": "validation",
453
- "process_docs": "functools.partial(<function process_docs at 0x14541cf65f30>, subject='health')",
454
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf65b40>, including_answer=False)",
455
- "doc_to_target": "answer",
456
- "description": "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
457
- "target_delimiter": " ",
458
- "fewshot_delimiter": "\n\n",
459
- "fewshot_config": {
460
- "sampler": "first_n",
461
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf65e10>, including_answer=True)",
462
- "doc_to_target": ""
463
- },
464
- "num_fewshot": 5,
465
- "metric_list": [
466
- {
467
- "metric": "exact_match",
468
- "aggregation": "mean",
469
- "higher_is_better": true,
470
- "ignore_case": true,
471
- "ignore_punctuation": true
472
- }
473
- ],
474
- "output_type": "generate_until",
475
- "generation_kwargs": {
476
- "until": [
477
- "</s>",
478
- "Q:",
479
- "<|im_end|>"
480
- ],
481
- "do_sample": false,
482
- "temperature": 0.0
483
- },
484
- "repeats": 1,
485
- "filter_list": [
486
- {
487
- "name": "custom-extract",
488
- "filter": [
489
- {
490
- "function": "regex",
491
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
492
- },
493
- {
494
- "function": "take_first"
495
- }
496
- ]
497
- }
498
- ],
499
- "should_decontaminate": false,
500
- "metadata": {
501
- "version": 1.0
502
- }
503
- },
504
- "mmlu_pro_history": {
505
- "task": "mmlu_pro_history",
506
- "task_alias": "history",
507
- "dataset_path": "TIGER-Lab/MMLU-Pro",
508
- "test_split": "test",
509
- "fewshot_split": "validation",
510
- "process_docs": "functools.partial(<function process_docs at 0x14541cf67d00>, subject='history')",
511
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf66710>, including_answer=False)",
512
- "doc_to_target": "answer",
513
- "description": "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
514
- "target_delimiter": " ",
515
- "fewshot_delimiter": "\n\n",
516
- "fewshot_config": {
517
- "sampler": "first_n",
518
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf64820>, including_answer=True)",
519
- "doc_to_target": ""
520
- },
521
- "num_fewshot": 5,
522
- "metric_list": [
523
- {
524
- "metric": "exact_match",
525
- "aggregation": "mean",
526
- "higher_is_better": true,
527
- "ignore_case": true,
528
- "ignore_punctuation": true
529
- }
530
- ],
531
- "output_type": "generate_until",
532
- "generation_kwargs": {
533
- "until": [
534
- "</s>",
535
- "Q:",
536
- "<|im_end|>"
537
- ],
538
- "do_sample": false,
539
- "temperature": 0.0
540
- },
541
- "repeats": 1,
542
- "filter_list": [
543
- {
544
- "name": "custom-extract",
545
- "filter": [
546
- {
547
- "function": "regex",
548
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
549
- },
550
- {
551
- "function": "take_first"
552
- }
553
- ]
554
- }
555
- ],
556
- "should_decontaminate": false,
557
- "metadata": {
558
- "version": 1.0
559
- }
560
- },
561
- "mmlu_pro_law": {
562
- "task": "mmlu_pro_law",
563
- "task_alias": "law",
564
- "dataset_path": "TIGER-Lab/MMLU-Pro",
565
- "test_split": "test",
566
- "fewshot_split": "validation",
567
- "process_docs": "functools.partial(<function process_docs at 0x14541cf65bd0>, subject='law')",
568
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf66a70>, including_answer=False)",
569
- "doc_to_target": "answer",
570
- "description": "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
571
- "target_delimiter": " ",
572
- "fewshot_delimiter": "\n\n",
573
- "fewshot_config": {
574
- "sampler": "first_n",
575
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf66320>, including_answer=True)",
576
- "doc_to_target": ""
577
- },
578
- "num_fewshot": 5,
579
- "metric_list": [
580
- {
581
- "metric": "exact_match",
582
- "aggregation": "mean",
583
- "higher_is_better": true,
584
- "ignore_case": true,
585
- "ignore_punctuation": true
586
- }
587
- ],
588
- "output_type": "generate_until",
589
- "generation_kwargs": {
590
- "until": [
591
- "</s>",
592
- "Q:",
593
- "<|im_end|>"
594
- ],
595
- "do_sample": false,
596
- "temperature": 0.0
597
- },
598
- "repeats": 1,
599
- "filter_list": [
600
- {
601
- "name": "custom-extract",
602
- "filter": [
603
- {
604
- "function": "regex",
605
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
606
- },
607
- {
608
- "function": "take_first"
609
- }
610
- ]
611
- }
612
- ],
613
- "should_decontaminate": false,
614
- "metadata": {
615
- "version": 1.0
616
- }
617
- },
618
- "mmlu_pro_math": {
619
- "task": "mmlu_pro_math",
620
- "task_alias": "math",
621
- "dataset_path": "TIGER-Lab/MMLU-Pro",
622
- "test_split": "test",
623
- "fewshot_split": "validation",
624
- "process_docs": "functools.partial(<function process_docs at 0x14541cf64b80>, subject='math')",
625
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf66dd0>, including_answer=False)",
626
- "doc_to_target": "answer",
627
- "description": "The following are multiple choice questions (with answers) about math. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
628
- "target_delimiter": " ",
629
- "fewshot_delimiter": "\n\n",
630
- "fewshot_config": {
631
- "sampler": "first_n",
632
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf66c20>, including_answer=True)",
633
- "doc_to_target": ""
634
- },
635
- "num_fewshot": 5,
636
- "metric_list": [
637
- {
638
- "metric": "exact_match",
639
- "aggregation": "mean",
640
- "higher_is_better": true,
641
- "ignore_case": true,
642
- "ignore_punctuation": true
643
- }
644
- ],
645
- "output_type": "generate_until",
646
- "generation_kwargs": {
647
- "until": [
648
- "</s>",
649
- "Q:",
650
- "<|im_end|>"
651
- ],
652
- "do_sample": false,
653
- "temperature": 0.0
654
- },
655
- "repeats": 1,
656
- "filter_list": [
657
- {
658
- "name": "custom-extract",
659
- "filter": [
660
- {
661
- "function": "regex",
662
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
663
- },
664
- {
665
- "function": "take_first"
666
- }
667
- ]
668
- }
669
- ],
670
- "should_decontaminate": false,
671
- "metadata": {
672
- "version": 1.0
673
- }
674
- },
675
- "mmlu_pro_other": {
676
- "task": "mmlu_pro_other",
677
- "task_alias": "other",
678
- "dataset_path": "TIGER-Lab/MMLU-Pro",
679
- "test_split": "test",
680
- "fewshot_split": "validation",
681
- "process_docs": "functools.partial(<function process_docs at 0x14541cf64d30>, subject='other')",
682
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf66560>, including_answer=False)",
683
- "doc_to_target": "answer",
684
- "description": "The following are multiple choice questions (with answers) about other. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
685
- "target_delimiter": " ",
686
- "fewshot_delimiter": "\n\n",
687
- "fewshot_config": {
688
- "sampler": "first_n",
689
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf65c60>, including_answer=True)",
690
- "doc_to_target": ""
691
- },
692
- "num_fewshot": 5,
693
- "metric_list": [
694
- {
695
- "metric": "exact_match",
696
- "aggregation": "mean",
697
- "higher_is_better": true,
698
- "ignore_case": true,
699
- "ignore_punctuation": true
700
- }
701
- ],
702
- "output_type": "generate_until",
703
- "generation_kwargs": {
704
- "until": [
705
- "</s>",
706
- "Q:",
707
- "<|im_end|>"
708
- ],
709
- "do_sample": false,
710
- "temperature": 0.0
711
- },
712
- "repeats": 1,
713
- "filter_list": [
714
- {
715
- "name": "custom-extract",
716
- "filter": [
717
- {
718
- "function": "regex",
719
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
720
- },
721
- {
722
- "function": "take_first"
723
- }
724
- ]
725
- }
726
- ],
727
- "should_decontaminate": false,
728
- "metadata": {
729
- "version": 1.0
730
- }
731
- },
732
- "mmlu_pro_philosophy": {
733
- "task": "mmlu_pro_philosophy",
734
- "task_alias": "philosophy",
735
- "dataset_path": "TIGER-Lab/MMLU-Pro",
736
- "test_split": "test",
737
- "fewshot_split": "validation",
738
- "process_docs": "functools.partial(<function process_docs at 0x14541cf64940>, subject='philosophy')",
739
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf65750>, including_answer=False)",
740
- "doc_to_target": "answer",
741
- "description": "The following are multiple choice questions (with answers) about philosophy. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
742
- "target_delimiter": " ",
743
- "fewshot_delimiter": "\n\n",
744
- "fewshot_config": {
745
- "sampler": "first_n",
746
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cf64e50>, including_answer=True)",
747
- "doc_to_target": ""
748
- },
749
- "num_fewshot": 5,
750
- "metric_list": [
751
- {
752
- "metric": "exact_match",
753
- "aggregation": "mean",
754
- "higher_is_better": true,
755
- "ignore_case": true,
756
- "ignore_punctuation": true
757
- }
758
- ],
759
- "output_type": "generate_until",
760
- "generation_kwargs": {
761
- "until": [
762
- "</s>",
763
- "Q:",
764
- "<|im_end|>"
765
- ],
766
- "do_sample": false,
767
- "temperature": 0.0
768
- },
769
- "repeats": 1,
770
- "filter_list": [
771
- {
772
- "name": "custom-extract",
773
- "filter": [
774
- {
775
- "function": "regex",
776
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
777
- },
778
- {
779
- "function": "take_first"
780
- }
781
- ]
782
- }
783
- ],
784
- "should_decontaminate": false,
785
- "metadata": {
786
- "version": 1.0
787
- }
788
- },
789
- "mmlu_pro_physics": {
790
- "task": "mmlu_pro_physics",
791
- "task_alias": "physics",
792
- "dataset_path": "TIGER-Lab/MMLU-Pro",
793
- "test_split": "test",
794
- "fewshot_split": "validation",
795
- "process_docs": "functools.partial(<function process_docs at 0x14541cfa3eb0>, subject='physics')",
796
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cfa3be0>, including_answer=False)",
797
- "doc_to_target": "answer",
798
- "description": "The following are multiple choice questions (with answers) about physics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
799
- "target_delimiter": " ",
800
- "fewshot_delimiter": "\n\n",
801
- "fewshot_config": {
802
- "sampler": "first_n",
803
- "doc_to_text": "functools.partial(<function format_cot_example at 0x14541cfa3d90>, including_answer=True)",
804
- "doc_to_target": ""
805
- },
806
- "num_fewshot": 5,
807
- "metric_list": [
808
- {
809
- "metric": "exact_match",
810
- "aggregation": "mean",
811
- "higher_is_better": true,
812
- "ignore_case": true,
813
- "ignore_punctuation": true
814
- }
815
- ],
816
- "output_type": "generate_until",
817
- "generation_kwargs": {
818
- "until": [
819
- "</s>",
820
- "Q:",
821
- "<|im_end|>"
822
- ],
823
- "do_sample": false,
824
- "temperature": 0.0
825
- },
826
- "repeats": 1,
827
- "filter_list": [
828
- {
829
- "name": "custom-extract",
830
- "filter": [
831
- {
832
- "function": "regex",
833
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
834
- },
835
- {
836
- "function": "take_first"
837
- }
838
- ]
839
- }
840
- ],
841
- "should_decontaminate": false,
842
- "metadata": {
843
- "version": 1.0
844
- }
845
- },
846
- "mmlu_pro_psychology": {
847
- "task": "mmlu_pro_psychology",
848
- "task_alias": "psychology",
849
- "dataset_path": "TIGER-Lab/MMLU-Pro",
850
- "test_split": "test",
851
- "fewshot_split": "validation",
852
- "process_docs": "functools.partial(<function process_docs at 0x1454204afb50>, subject='psychology')",
853
- "doc_to_text": "functools.partial(<function format_cot_example at 0x1454204afbe0>, including_answer=False)",
854
- "doc_to_target": "answer",
855
- "description": "The following are multiple choice questions (with answers) about psychology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
856
- "target_delimiter": " ",
857
- "fewshot_delimiter": "\n\n",
858
- "fewshot_config": {
859
- "sampler": "first_n",
860
- "doc_to_text": "functools.partial(<function format_cot_example at 0x1454204afd00>, including_answer=True)",
861
- "doc_to_target": ""
862
- },
863
- "num_fewshot": 5,
864
- "metric_list": [
865
- {
866
- "metric": "exact_match",
867
- "aggregation": "mean",
868
- "higher_is_better": true,
869
- "ignore_case": true,
870
- "ignore_punctuation": true
871
- }
872
- ],
873
- "output_type": "generate_until",
874
- "generation_kwargs": {
875
- "until": [
876
- "</s>",
877
- "Q:",
878
- "<|im_end|>"
879
- ],
880
- "do_sample": false,
881
- "temperature": 0.0
882
- },
883
- "repeats": 1,
884
- "filter_list": [
885
- {
886
- "name": "custom-extract",
887
- "filter": [
888
- {
889
- "function": "regex",
890
- "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
891
- },
892
- {
893
- "function": "take_first"
894
- }
895
- ]
896
- }
897
- ],
898
- "should_decontaminate": false,
899
- "metadata": {
900
- "version": 1.0
901
- }
902
- }
903
- },
904
- "versions": {
905
- "mmlu_pro": 2.0,
906
- "mmlu_pro_biology": 1.0,
907
- "mmlu_pro_business": 1.0,
908
- "mmlu_pro_chemistry": 1.0,
909
- "mmlu_pro_computer_science": 1.0,
910
- "mmlu_pro_economics": 1.0,
911
- "mmlu_pro_engineering": 1.0,
912
- "mmlu_pro_health": 1.0,
913
- "mmlu_pro_history": 1.0,
914
- "mmlu_pro_law": 1.0,
915
- "mmlu_pro_math": 1.0,
916
- "mmlu_pro_other": 1.0,
917
- "mmlu_pro_philosophy": 1.0,
918
- "mmlu_pro_physics": 1.0,
919
- "mmlu_pro_psychology": 1.0
920
- },
921
- "n-shot": {
922
- "mmlu_pro_biology": 5,
923
- "mmlu_pro_business": 5,
924
- "mmlu_pro_chemistry": 5,
925
- "mmlu_pro_computer_science": 5,
926
- "mmlu_pro_economics": 5,
927
- "mmlu_pro_engineering": 5,
928
- "mmlu_pro_health": 5,
929
- "mmlu_pro_history": 5,
930
- "mmlu_pro_law": 5,
931
- "mmlu_pro_math": 5,
932
- "mmlu_pro_other": 5,
933
- "mmlu_pro_philosophy": 5,
934
- "mmlu_pro_physics": 5,
935
- "mmlu_pro_psychology": 5
936
- },
937
- "higher_is_better": {
938
- "mmlu_pro": {
939
- "exact_match": true
940
- },
941
- "mmlu_pro_biology": {
942
- "exact_match": true
943
- },
944
- "mmlu_pro_business": {
945
- "exact_match": true
946
- },
947
- "mmlu_pro_chemistry": {
948
- "exact_match": true
949
- },
950
- "mmlu_pro_computer_science": {
951
- "exact_match": true
952
- },
953
- "mmlu_pro_economics": {
954
- "exact_match": true
955
- },
956
- "mmlu_pro_engineering": {
957
- "exact_match": true
958
- },
959
- "mmlu_pro_health": {
960
- "exact_match": true
961
- },
962
- "mmlu_pro_history": {
963
- "exact_match": true
964
- },
965
- "mmlu_pro_law": {
966
- "exact_match": true
967
- },
968
- "mmlu_pro_math": {
969
- "exact_match": true
970
- },
971
- "mmlu_pro_other": {
972
- "exact_match": true
973
- },
974
- "mmlu_pro_philosophy": {
975
- "exact_match": true
976
- },
977
- "mmlu_pro_physics": {
978
- "exact_match": true
979
- },
980
- "mmlu_pro_psychology": {
981
- "exact_match": true
982
- }
983
- },
984
- "n-samples": {
985
- "mmlu_pro_biology": {
986
- "original": 717,
987
- "effective": 717
988
- },
989
- "mmlu_pro_business": {
990
- "original": 789,
991
- "effective": 789
992
- },
993
- "mmlu_pro_chemistry": {
994
- "original": 1132,
995
- "effective": 1132
996
- },
997
- "mmlu_pro_computer_science": {
998
- "original": 410,
999
- "effective": 410
1000
- },
1001
- "mmlu_pro_economics": {
1002
- "original": 844,
1003
- "effective": 844
1004
- },
1005
- "mmlu_pro_engineering": {
1006
- "original": 969,
1007
- "effective": 969
1008
- },
1009
- "mmlu_pro_health": {
1010
- "original": 818,
1011
- "effective": 818
1012
- },
1013
- "mmlu_pro_history": {
1014
- "original": 381,
1015
- "effective": 381
1016
- },
1017
- "mmlu_pro_law": {
1018
- "original": 1101,
1019
- "effective": 1101
1020
- },
1021
- "mmlu_pro_math": {
1022
- "original": 1351,
1023
- "effective": 1351
1024
- },
1025
- "mmlu_pro_other": {
1026
- "original": 924,
1027
- "effective": 924
1028
- },
1029
- "mmlu_pro_philosophy": {
1030
- "original": 499,
1031
- "effective": 499
1032
- },
1033
- "mmlu_pro_physics": {
1034
- "original": 1299,
1035
- "effective": 1299
1036
- },
1037
- "mmlu_pro_psychology": {
1038
- "original": 798,
1039
- "effective": 798
1040
- }
1041
- },
1042
- "config": {
1043
- "model": "vllm",
1044
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
1045
- "batch_size": 1,
1046
- "batch_sizes": [],
1047
- "device": null,
1048
- "use_cache": null,
1049
- "limit": null,
1050
- "bootstrap_iters": 100000,
1051
- "gen_kwargs": null,
1052
- "random_seed": 0,
1053
- "numpy_seed": 1234,
1054
- "torch_seed": 1234,
1055
- "fewshot_seed": 1234
1056
- },
1057
- "git_hash": "8e1bd48d",
1058
- "date": 1735955547.4293072,
1059
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
1060
- "transformers_version": "4.47.1",
1061
- "upper_git_hash": null,
1062
- "tokenizer_pad_token": [
1063
- "<unk>",
1064
- "0"
1065
- ],
1066
- "tokenizer_eos_token": [
1067
- "</s>",
1068
- "2"
1069
- ],
1070
- "tokenizer_bos_token": [
1071
- "<s>",
1072
- "1"
1073
- ],
1074
- "eot_token_id": 2,
1075
- "max_length": 4096,
1076
- "task_hashes": {},
1077
- "model_source": "vllm",
1078
- "model_name": "/ALLaM-7B-Instruct",
1079
- "model_name_sanitized": "/ALLaM-7B-Instruct",
1080
- "system_instruction": null,
1081
- "system_instruction_sha": null,
1082
- "fewshot_as_multiturn": false,
1083
- "chat_template": null,
1084
- "chat_template_sha": null,
1085
- "start_time": 22216.794737072,
1086
- "end_time": 22732.624102917,
1087
- "total_evaluation_time_seconds": "515.829365845002"
1088
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/triviaqa_5_shot.json DELETED
@@ -1,128 +0,0 @@
1
- {
2
- "results": {
3
- "triviaqa": {
4
- "alias": "triviaqa",
5
- "exact_match,remove_whitespace": 0.1595519393669193,
6
- "exact_match_stderr,remove_whitespace": 0.0027337509995856123
7
- }
8
- },
9
- "group_subtasks": {
10
- "triviaqa": []
11
- },
12
- "configs": {
13
- "triviaqa": {
14
- "task": "triviaqa",
15
- "dataset_path": "trivia_qa",
16
- "dataset_name": "rc.nocontext",
17
- "training_split": "train",
18
- "validation_split": "validation",
19
- "doc_to_text": "Question: {{question}}?\nAnswer:",
20
- "doc_to_target": "{{answer.aliases}}",
21
- "description": "",
22
- "target_delimiter": " ",
23
- "fewshot_delimiter": "\n\n",
24
- "num_fewshot": 5,
25
- "metric_list": [
26
- {
27
- "metric": "exact_match",
28
- "aggregation": "mean",
29
- "higher_is_better": true,
30
- "ignore_case": true,
31
- "ignore_punctuation": true
32
- }
33
- ],
34
- "output_type": "generate_until",
35
- "generation_kwargs": {
36
- "until": [
37
- "\n",
38
- ".",
39
- ","
40
- ],
41
- "do_sample": false,
42
- "temperature": 0.0
43
- },
44
- "repeats": 1,
45
- "filter_list": [
46
- {
47
- "name": "remove_whitespace",
48
- "filter": [
49
- {
50
- "function": "remove_whitespace"
51
- },
52
- {
53
- "function": "take_first"
54
- }
55
- ]
56
- }
57
- ],
58
- "should_decontaminate": true,
59
- "doc_to_decontamination_query": "question",
60
- "metadata": {
61
- "version": 3.0
62
- }
63
- }
64
- },
65
- "versions": {
66
- "triviaqa": 3.0
67
- },
68
- "n-shot": {
69
- "triviaqa": 5
70
- },
71
- "higher_is_better": {
72
- "triviaqa": {
73
- "exact_match": true
74
- }
75
- },
76
- "n-samples": {
77
- "triviaqa": {
78
- "original": 17944,
79
- "effective": 17944
80
- }
81
- },
82
- "config": {
83
- "model": "vllm",
84
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
85
- "batch_size": 1,
86
- "batch_sizes": [],
87
- "device": null,
88
- "use_cache": null,
89
- "limit": null,
90
- "bootstrap_iters": 100000,
91
- "gen_kwargs": null,
92
- "random_seed": 0,
93
- "numpy_seed": 1234,
94
- "torch_seed": 1234,
95
- "fewshot_seed": 1234
96
- },
97
- "git_hash": "8e1bd48d",
98
- "date": 1735955269.5168972,
99
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
100
- "transformers_version": "4.47.1",
101
- "upper_git_hash": null,
102
- "tokenizer_pad_token": [
103
- "<unk>",
104
- "0"
105
- ],
106
- "tokenizer_eos_token": [
107
- "</s>",
108
- "2"
109
- ],
110
- "tokenizer_bos_token": [
111
- "<s>",
112
- "1"
113
- ],
114
- "eot_token_id": 2,
115
- "max_length": 4096,
116
- "task_hashes": {},
117
- "model_source": "vllm",
118
- "model_name": "/ALLaM-7B-Instruct",
119
- "model_name_sanitized": "/ALLaM-7B-Instruct",
120
- "system_instruction": null,
121
- "system_instruction_sha": null,
122
- "fewshot_as_multiturn": false,
123
- "chat_template": null,
124
- "chat_template_sha": null,
125
- "start_time": 21938.879925579,
126
- "end_time": 22173.800151221,
127
- "total_evaluation_time_seconds": "234.92022564199942"
128
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/truthfulqa_mc2_0_shot.json DELETED
@@ -1,108 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_mc2": {
4
- "alias": "truthfulqa_mc2",
5
- "acc,none": 0.4667466051524712,
6
- "acc_stderr,none": 0.015605585169281691
7
- }
8
- },
9
- "group_subtasks": {
10
- "truthfulqa_mc2": []
11
- },
12
- "configs": {
13
- "truthfulqa_mc2": {
14
- "task": "truthfulqa_mc2",
15
- "tag": [
16
- "truthfulqa"
17
- ],
18
- "dataset_path": "truthful_qa",
19
- "dataset_name": "multiple_choice",
20
- "validation_split": "validation",
21
- "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
22
- "doc_to_target": 0,
23
- "doc_to_choice": "{{mc2_targets.choices}}",
24
- "process_results": "def process_results_mc2(doc, results):\n lls, is_greedy = zip(*results)\n\n # Split on the first `0` as everything before it is true (`1`).\n split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n # Compute the normalized probability mass for the correct answer.\n ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n p_true = p_true / (sum(p_true) + sum(p_false))\n\n return {\"acc\": sum(p_true)}\n",
25
- "description": "",
26
- "target_delimiter": " ",
27
- "fewshot_delimiter": "\n\n",
28
- "num_fewshot": 0,
29
- "metric_list": [
30
- {
31
- "metric": "acc",
32
- "aggregation": "mean",
33
- "higher_is_better": true
34
- }
35
- ],
36
- "output_type": "multiple_choice",
37
- "repeats": 1,
38
- "should_decontaminate": true,
39
- "doc_to_decontamination_query": "question",
40
- "metadata": {
41
- "version": 2.0
42
- }
43
- }
44
- },
45
- "versions": {
46
- "truthfulqa_mc2": 2.0
47
- },
48
- "n-shot": {
49
- "truthfulqa_mc2": 0
50
- },
51
- "higher_is_better": {
52
- "truthfulqa_mc2": {
53
- "acc": true
54
- }
55
- },
56
- "n-samples": {
57
- "truthfulqa_mc2": {
58
- "original": 817,
59
- "effective": 817
60
- }
61
- },
62
- "config": {
63
- "model": "vllm",
64
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
65
- "batch_size": 1,
66
- "batch_sizes": [],
67
- "device": null,
68
- "use_cache": null,
69
- "limit": null,
70
- "bootstrap_iters": 100000,
71
- "gen_kwargs": null,
72
- "random_seed": 0,
73
- "numpy_seed": 1234,
74
- "torch_seed": 1234,
75
- "fewshot_seed": 1234
76
- },
77
- "git_hash": "8e1bd48d",
78
- "date": 1735957764.7570622,
79
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
80
- "transformers_version": "4.47.1",
81
- "upper_git_hash": null,
82
- "tokenizer_pad_token": [
83
- "<unk>",
84
- "0"
85
- ],
86
- "tokenizer_eos_token": [
87
- "</s>",
88
- "2"
89
- ],
90
- "tokenizer_bos_token": [
91
- "<s>",
92
- "1"
93
- ],
94
- "eot_token_id": 2,
95
- "max_length": 4096,
96
- "task_hashes": {},
97
- "model_source": "vllm",
98
- "model_name": "/ALLaM-7B-Instruct",
99
- "model_name_sanitized": "/ALLaM-7B-Instruct",
100
- "system_instruction": null,
101
- "system_instruction_sha": null,
102
- "fewshot_as_multiturn": false,
103
- "chat_template": null,
104
- "chat_template_sha": null,
105
- "start_time": 24434.078025398,
106
- "end_time": 24545.624577618,
107
- "total_evaluation_time_seconds": "111.54655221999928"
108
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/en/winogrande_0_shot.json DELETED
@@ -1,108 +0,0 @@
1
- {
2
- "results": {
3
- "winogrande": {
4
- "alias": "winogrande",
5
- "acc,none": 0.7048145224940805,
6
- "acc_stderr,none": 0.012819410741754765
7
- }
8
- },
9
- "group_subtasks": {
10
- "winogrande": []
11
- },
12
- "configs": {
13
- "winogrande": {
14
- "task": "winogrande",
15
- "dataset_path": "winogrande",
16
- "dataset_name": "winogrande_xl",
17
- "dataset_kwargs": {
18
- "trust_remote_code": true
19
- },
20
- "training_split": "train",
21
- "validation_split": "validation",
22
- "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n",
23
- "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n",
24
- "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
25
- "description": "",
26
- "target_delimiter": " ",
27
- "fewshot_delimiter": "\n\n",
28
- "num_fewshot": 0,
29
- "metric_list": [
30
- {
31
- "metric": "acc",
32
- "aggregation": "mean",
33
- "higher_is_better": true
34
- }
35
- ],
36
- "output_type": "multiple_choice",
37
- "repeats": 1,
38
- "should_decontaminate": true,
39
- "doc_to_decontamination_query": "sentence",
40
- "metadata": {
41
- "version": 1.0
42
- }
43
- }
44
- },
45
- "versions": {
46
- "winogrande": 1.0
47
- },
48
- "n-shot": {
49
- "winogrande": 0
50
- },
51
- "higher_is_better": {
52
- "winogrande": {
53
- "acc": true
54
- }
55
- },
56
- "n-samples": {
57
- "winogrande": {
58
- "original": 1267,
59
- "effective": 1267
60
- }
61
- },
62
- "config": {
63
- "model": "vllm",
64
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=4,data_parallel_size=2,gpu_memory_utilization=0.5,download_dir=/tmp",
65
- "batch_size": 1,
66
- "batch_sizes": [],
67
- "device": null,
68
- "use_cache": null,
69
- "limit": null,
70
- "bootstrap_iters": 100000,
71
- "gen_kwargs": null,
72
- "random_seed": 0,
73
- "numpy_seed": 1234,
74
- "torch_seed": 1234,
75
- "fewshot_seed": 1234
76
- },
77
- "git_hash": "8e1bd48d",
78
- "date": 1735957928.9213855,
79
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V12 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 0\nBogoMIPS: 4890.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 3 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 48 MiB (96 instances)\nL3 cache: 384 MiB (24 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nNUMA node2 CPU(s): 48-71\nNUMA node3 CPU(s): 72-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
80
- "transformers_version": "4.47.1",
81
- "upper_git_hash": null,
82
- "tokenizer_pad_token": [
83
- "<unk>",
84
- "0"
85
- ],
86
- "tokenizer_eos_token": [
87
- "</s>",
88
- "2"
89
- ],
90
- "tokenizer_bos_token": [
91
- "<s>",
92
- "1"
93
- ],
94
- "eot_token_id": 2,
95
- "max_length": 4096,
96
- "task_hashes": {},
97
- "model_source": "vllm",
98
- "model_name": "/ALLaM-7B-Instruct",
99
- "model_name_sanitized": "/ALLaM-7B-Instruct",
100
- "system_instruction": null,
101
- "system_instruction_sha": null,
102
- "fewshot_as_multiturn": false,
103
- "chat_template": null,
104
- "chat_template_sha": null,
105
- "start_time": 24598.479043164,
106
- "end_time": 24674.97354231,
107
- "total_evaluation_time_seconds": "76.49449914599973"
108
- }