tangtang1995 committed on
Commit
3de499f
·
verified ·
1 Parent(s): 1bf4050
Files changed (46)
  1. .gitattributes +1 -0
  2. src/.DS_Store +0 -0
  3. src/__pycache__/envs.cpython-310.pyc +0 -0
  4. src/__pycache__/envs.cpython-38.pyc +0 -0
  5. src/__pycache__/envs.cpython-39.pyc +0 -0
  6. src/backend/__pycache__/evaluate_model.cpython-310.pyc +0 -0
  7. src/backend/__pycache__/evaluate_model.cpython-38.pyc +0 -0
  8. src/backend/__pycache__/evaluate_model.cpython-39.pyc +0 -0
  9. src/backend/__pycache__/manage_requests.cpython-310.pyc +0 -0
  10. src/backend/__pycache__/manage_requests.cpython-38.pyc +0 -0
  11. src/backend/__pycache__/manage_requests.cpython-39.pyc +0 -0
  12. src/backend/__pycache__/model_operations.cpython-310.pyc +0 -0
  13. src/backend/__pycache__/model_operations.cpython-38.pyc +0 -0
  14. src/backend/__pycache__/model_operations.cpython-39.pyc +0 -0
  15. src/backend/__pycache__/run_eval_suite.cpython-310.pyc +0 -0
  16. src/backend/__pycache__/run_eval_suite.cpython-38.pyc +0 -0
  17. src/backend/__pycache__/run_eval_suite.cpython-39.pyc +0 -0
  18. src/backend/__pycache__/sort_queue.cpython-310.pyc +0 -0
  19. src/backend/__pycache__/sort_queue.cpython-38.pyc +0 -0
  20. src/backend/__pycache__/util.cpython-310.pyc +0 -0
  21. src/backend/__pycache__/util.cpython-38.pyc +0 -0
  22. src/backend/evaluate_model.py +146 -0
  23. src/backend/manage_requests.py +118 -0
  24. src/backend/model_operations.py +615 -0
  25. src/backend/run_eval_suite.py +76 -0
  26. src/backend/sort_queue.py +27 -0
  27. src/backend/util.py +78 -0
  28. src/datasets/Items.xlsx +0 -0
  29. src/datasets/Material_Llama2_0603.xlsx +0 -0
  30. src/datasets/human_data.csv +0 -0
  31. src/datasets/human_data.xlsx +3 -0
  32. src/datasets/leaderboard_dataset.csv +0 -0
  33. src/datasets/prompt.csv +11 -0
  34. src/datasets/prompt.xlsx +0 -0
  35. src/datasets/sample_dataset.csv +11 -0
  36. src/datasets/~$Items.xlsx +0 -0
  37. src/datasets/~$Material_Llama2_0603.xlsx +0 -0
  38. src/display/about.py +162 -0
  39. src/display/css_html_js.py +7 -1
  40. src/display/formatting.py +9 -0
  41. src/display/utils.py +31 -5
  42. src/envs.py +20 -10
  43. src/leaderboard/read_evals.py +56 -64
  44. src/populate.py +11 -13
  45. src/submission/check_validity.py +5 -7
  46. src/submission/submit.py +22 -26
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
36
+ src/datasets/human_data.xlsx filter=lfs diff=lfs merge=lfs -text
src/.DS_Store ADDED
Binary file (8.2 kB). View file
 
src/__pycache__/envs.cpython-310.pyc ADDED
Binary file (1.15 kB). View file
 
src/__pycache__/envs.cpython-38.pyc ADDED
Binary file (1.24 kB). View file
 
src/__pycache__/envs.cpython-39.pyc ADDED
Binary file (1.24 kB). View file
 
src/backend/__pycache__/evaluate_model.cpython-310.pyc ADDED
Binary file (4.75 kB). View file
 
src/backend/__pycache__/evaluate_model.cpython-38.pyc ADDED
Binary file (4.79 kB). View file
 
src/backend/__pycache__/evaluate_model.cpython-39.pyc ADDED
Binary file (4.79 kB). View file
 
src/backend/__pycache__/manage_requests.cpython-310.pyc ADDED
Binary file (3.66 kB). View file
 
src/backend/__pycache__/manage_requests.cpython-38.pyc ADDED
Binary file (3.55 kB). View file
 
src/backend/__pycache__/manage_requests.cpython-39.pyc ADDED
Binary file (3.63 kB). View file
 
src/backend/__pycache__/model_operations.cpython-310.pyc ADDED
Binary file (14.7 kB). View file
 
src/backend/__pycache__/model_operations.cpython-38.pyc ADDED
Binary file (12.7 kB). View file
 
src/backend/__pycache__/model_operations.cpython-39.pyc ADDED
Binary file (12.7 kB). View file
 
src/backend/__pycache__/run_eval_suite.cpython-310.pyc ADDED
Binary file (2.51 kB). View file
 
src/backend/__pycache__/run_eval_suite.cpython-38.pyc ADDED
Binary file (2.5 kB). View file
 
src/backend/__pycache__/run_eval_suite.cpython-39.pyc ADDED
Binary file (2.51 kB). View file
 
src/backend/__pycache__/sort_queue.cpython-310.pyc ADDED
Binary file (1.86 kB). View file
 
src/backend/__pycache__/sort_queue.cpython-38.pyc ADDED
Binary file (1.91 kB). View file
 
src/backend/__pycache__/util.cpython-310.pyc ADDED
Binary file (2.2 kB). View file
 
src/backend/__pycache__/util.cpython-38.pyc ADDED
Binary file (2.19 kB). View file
 
src/backend/evaluate_model.py ADDED
@@ -0,0 +1,146 @@
1
+ import logging
2
+ import pandas as pd
3
+ import os
4
+ import csv
5
+
6
+ import src.envs as envs
7
+
8
+ from src.backend.model_operations import SummaryGenerator, EvaluationModel
9
+ import src.backend.util as util
10
+
11
+ logging.basicConfig(level=logging.INFO,
12
+ format='%(asctime)s - %(levelname)s - %(message)s')
13
+
14
+
15
+ class Evaluator:
16
+ """A class to evaluate summaries generated by a language model.
17
+
18
+ Attributes:
19
+ model (str): The name or path of the model.
20
+ revision (str): The model revision.
21
+ precision (str): The precision setting of the model.
22
+ num_fewshot (int): Number of few-shot examples to use.
23
+ batch_size (int): Batch size for processing.
24
+ device (str): The device to run the model on.
25
+ no_cache (bool): Flag to disable caching.
26
+ limit (int): Limit on the number of items to process.
27
+ write_out (bool): Whether to write results to a file.
28
+ output_base_path (str): Base path for output files.
29
+ summary_generator (SummaryGenerator): Instance for generating summaries.
30
+ eval_model (EvaluationModel): Instance for evaluating summaries.
31
+ """
32
+ def __init__(self, model, revision, precision, batch_size,
33
+ device, no_cache, limit, write_out=True,
34
+ output_base_path='logs'):
35
+ """Initializes the Evaluator with the given model and settings.
36
+
37
+ Args:
38
+ model (str): The name or path of the model.
39
+ revision (str): The model revision.
40
+ precision (str): The precision setting of the model.
41
+ num_fewshot (int): Number of few-shot examples to use.
42
+ batch_size (int): Batch size for processing.
43
+ device (str): The device to run the model on.
44
+ no_cache (bool): Flag to disable caching.
45
+ limit (int): Limit on the number of items to process.
46
+ write_out (bool): Whether to write results to a file.
47
+ output_base_path (str): Base path for output files.
48
+ """
49
+ self.model = model
50
+ self.revision = revision
51
+ self.precision = precision
52
+ self.batch_size = batch_size
53
+ self.device = device
54
+ self.no_cache = no_cache
55
+ self.limit = limit
56
+ self.write_out = write_out
57
+ self.output_base_path = output_base_path
58
+ try:
59
+ self.summary_generator = SummaryGenerator(model, revision)
60
+ self.eval_model = EvaluationModel(envs.HEM_PATH)
61
+ except Exception as e:
62
+ logging.error(f"Error initializing Evaluator: {e}")
63
+ raise
64
+
65
+ def evaluate(self):
66
+ """
67
+ Performs the evaluation process by generating summaries
68
+ and computing metrics.
69
+
70
+ Returns:
71
+ dict: A dictionary containing evaluation results.
72
+ """
73
+ try:
74
+ from openpyxl import load_workbook
75
+ # df = load_workbook(filename=envs.DATASET_PATH)
76
+ df_prompt = load_workbook(filename=envs.PROMPT_PATH)
77
+
78
+ # df = pd.read_excel(envs.DATASET_PATH, engine='xlrd')  # read the original/raw data; this is probably where the problem is in this project
79
+ # df_prompt = pd.read_excel(envs.PROMPT_PATH, engine='xlrd')
80
+ # df_prompt = pd.read_csv(envs.PROMPT_PATH)
81
+ # print(envs.DATASET_PATH)
82
+ # print(df.shape)
83
+ # print(df.iloc[-1])
84
+ self.generated_summaries_df = self.summary_generator.generate_summaries(envs.DATASET_PATH, df_prompt, save_path=f"generation_results/{self.model}.csv")
85
+ # exit()
86
+ # avg_summary_len = self.summary_generator.avg_length
87
+ # answer_rate = self.summary_generator.answer_rate
88
+ '''Start evaluating the model's results'''
89
+ self.humanlike = self.eval_model.evaluate_humanlike(self.generated_summaries_df, envs.HUMAN_DATA)
90
+ '''Original metrics'''
91
+ # self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
92
+ # self.generated_summaries_df)
93
+ # factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
94
+ # hallucination_rate = self.eval_model.hallucination_rate
95
+ factual_consistency_rate = 0
96
+ answer_rate = 0
97
+ avg_summary_len = 0
98
+
99
+ results = util.format_results(model_name=self.model, revision=self.revision,
100
+ precision=self.precision,
101
+ factual_consistency_rate=factual_consistency_rate,
102
+ hallucination_rate=self.humanlike,
103
+ answer_rate=answer_rate,
104
+ avg_summary_len=avg_summary_len)
105
+ return results
106
+ except FileNotFoundError:
107
+ logging.error(f"File not found: {envs.DATASET_PATH}")
108
+ raise
109
+ except Exception as e:
110
+ logging.error(f"Error during evaluation: {e}")
111
+ raise
112
+
113
+ def write_results(self):
114
+ print('Updating result files')
115
+ leaderboard_path = os.getcwd() # the path of leaderboard folder
116
+ print(leaderboard_path)
117
+ working_path = os.path.join(leaderboard_path, 'Humanlike Leaderboard Results')
118
+ if not os.path.exists(working_path):
119
+ logging.error(f"Need to first download the results from google drive to the learderboard folder")
120
+ raise FileNotFoundError(f"Results folder not found: {working_path}")
121
+
122
+ source_summary_df = self.generated_summaries_df[["User_prompt", "Response"]]
123
+
124
+ # #update leaderboard_summaries.csv
125
+ # #first remove previous results for the current model
126
+ # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8', sep="\t")
127
+ # mask = existing_df['model'] == self.model
128
+ # existing_df = existing_df[~mask]
129
+ # # get new result
130
+ leaderboard_summaries_df = source_summary_df
131
+ leaderboard_summaries_df.insert(2, "model", [self.model]*leaderboard_summaries_df.shape[0])
132
+ leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
133
+ print('leaderboard_summaries.csv has been updated')
134
+
135
+ # update leaderboard_summaries_with_scores.csv
136
+ # BUG: get error when opening the file
137
+ # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
138
+ # encoding='utf-8', sep=",", on_bad_lines='warn', quotechar='"', quoting=2)
139
+ # print(existing_df.shape)
140
+ # mask = existing_df['model'] == self.model
141
+ # existing_df = existing_df[~mask]
142
+ # get new result
143
+ leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
144
+ leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
145
+ leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
146
+ print('leaderboard_summaries_with_scores.csv has been updated')
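A minimal usage sketch of the Evaluator class above, assuming the repo's src package is importable and the envs paths point at the datasets added in this commit; the model id below is a hypothetical placeholder:

# Hedged sketch: run the full pipeline for one model.
from src.backend.evaluate_model import Evaluator

evaluator = Evaluator(model="meta-llama/Llama-2-7b-chat-hf",  # hypothetical model id
                      revision="main", precision="float16",
                      batch_size=1, device="cuda", no_cache=True, limit=None)
results = evaluator.evaluate()                    # dict produced by util.format_results
print(results["results"]["hallucination_rate"])   # currently carries the humanlike score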
src/backend/manage_requests.py ADDED
@@ -0,0 +1,118 @@
1
+ import os
2
+ import glob
3
+ import json
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ from huggingface_hub import HfApi, snapshot_download
8
+
9
+
10
+ @dataclass
11
+ class EvalRequest:
12
+ model: str
13
+ # private: bool
14
+ status: str
15
+ json_filepath: str = None
16
+ private: bool = False
17
+ weight_type: str = "Original"
18
+ model_type: str = "" # pretrained, finetuned, with RL
19
+ precision: str = "" # float16, bfloat16
20
+ base_model: Optional[str] = None # for adapter models
21
+ revision: str = "main" # commit
22
+ submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
23
+ model_type: Optional[str] = None
24
+ likes: Optional[int] = 0
25
+ params: Optional[int] = None
26
+ license: Optional[str] = ""
27
+
28
+ def get_model_args(self):
29
+ model_args = f"pretrained={self.model},revision={self.revision}"
30
+
31
+ if self.precision in ["float16", "bfloat16"]:
32
+ model_args += f",dtype={self.precision}"
33
+ else:
34
+ raise ValueError(f"Unknown precision {self.precision}.")
35
+
36
+ return model_args
37
+
38
+
39
+ def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str,
40
+ hf_repo: str, local_dir: str):
41
+ """Updates a given eval request with its new status on the hub (running, completed, failed,)"""
42
+ json_filepath = eval_request.json_filepath
43
+
44
+ with open(json_filepath) as fp:
45
+ data = json.load(fp)
46
+
47
+ data["status"] = new_status
48
+
49
+ with open(json_filepath, "w") as f:
50
+ f.write(json.dumps(data))
51
+
52
+ api.upload_file(
53
+ path_or_fileobj=json_filepath,
54
+ path_in_repo=os.path.relpath(json_filepath, start=local_dir),
55
+ repo_id=hf_repo,
56
+ repo_type="dataset",
57
+ )
58
+
59
+
60
+ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
61
+ """Get all pending evaluation requests and return a list in which private
62
+ models appear first, followed by public models sorted by the number of
63
+ likes.
64
+
65
+ Returns:
66
+ list[EvalRequest]: a list of model info dicts.
67
+ """
68
+ snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
69
+ repo_type="dataset", max_workers=60)
70
+ json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
71
+
72
+ eval_requests = []
73
+ for json_filepath in json_files:
74
+ with open(json_filepath) as fp:
75
+ data = json.load(fp)
76
+ if data["status"] in job_status:
77
+ data["json_filepath"] = json_filepath
78
+ eval_request = EvalRequest(**data)
79
+ eval_requests.append(eval_request)
80
+
81
+ return eval_requests
82
+
83
+
84
+ def check_completed_evals(
85
+ api: HfApi,
86
+ hf_repo: str,
87
+ local_dir: str,
88
+ checked_status: str,
89
+ completed_status: str,
90
+ failed_status: str,
91
+ hf_repo_results: str,
92
+ local_dir_results: str,
93
+ ):
94
+ """Checks if the currently running evals are completed, if yes, update their status on the hub."""
95
+ snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results,
96
+ repo_type="dataset", max_workers=60)
97
+
98
+ running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
99
+
100
+ for eval_request in running_evals:
101
+ model = eval_request.model
102
+ print("====================================")
103
+ print(f"Checking {model}")
104
+
105
+ output_path = model
106
+ output_files = f"{local_dir_results}/{output_path}/results*.json"
107
+ output_files_exists = len(glob.glob(output_files)) > 0
108
+
109
+ if output_files_exists:
110
+ print(
111
+ f"EXISTS output file exists for {model} setting it to {completed_status}"
112
+ )
113
+ set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
114
+ else:
115
+ print(
116
+ f"No result file found for {model} setting it to {failed_status}"
117
+ )
118
+ set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
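A minimal sketch of how EvalRequest and get_eval_requests fit together; the repo id and local directory below are placeholders:

# Hedged sketch: build a request by hand and list pending ones from a placeholder repo.
from src.backend.manage_requests import EvalRequest, get_eval_requests

req = EvalRequest(model="org/some-model", status="PENDING", precision="float16")
print(req.get_model_args())   # pretrained=org/some-model,revision=main,dtype=float16

pending = get_eval_requests(job_status=["PENDING"],
                            local_dir="./eval-requests",       # placeholder path
                            hf_repo="org/requests-dataset")    # placeholder repo id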
src/backend/model_operations.py ADDED
@@ -0,0 +1,615 @@
1
+ import os
2
+ import time
3
+ from datetime import datetime
4
+ import logging
5
+ from pathlib import Path
6
+ import requests
7
+ import json
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ import spacy
12
+ from sentence_transformers import CrossEncoder
13
+ import litellm
14
+ # from litellm import completion
15
+ from tqdm import tqdm
16
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
17
+ # from accelerate import PartialState
18
+ # from accelerate.inference import prepare_pippy
19
+ import torch
20
+ import cohere
21
+ from openai import OpenAI
22
+ # import google
23
+ import google.generativeai as genai
24
+
25
+ import src.backend.util as util
26
+ import src.envs as envs
27
+
28
+ # litellm.set_verbose=False
29
+ litellm.set_verbose=True
30
+ # Set up basic configuration for logging
31
+ logging.basicConfig(level=logging.INFO,
32
+ format='%(asctime)s - %(levelname)s - %(message)s')
33
+
34
+ # Load spacy model for word tokenization
35
+ nlp = spacy.load("en_core_web_sm")
36
+
37
+ os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
38
+ os.environ["OPENAI_API_KEY"] = "sk-None-tanhMyavhUtpX2G1kmPuT3BlbkFJGEhM5jmyGyhrTd3LdHDI"
39
+
40
+ def load_evaluation_model(model_path):
41
+ """Load the evaluation model from the given path
42
+
43
+ Args:
44
+ model_path (str): Path to the evaluation model
45
+
46
+ Returns:
47
+ CrossEncoder: The evaluation model
48
+ """
49
+ model = CrossEncoder(model_path)
50
+ return model
51
+
52
+
53
+ class ModelLoadingException(Exception):
54
+ """Exception raised for errors in loading a model.
55
+
56
+ Attributes:
57
+ model_id (str): The model identifier.
58
+ revision (str): The model revision.
59
+ """
60
+
61
+ def __init__(self, model_id, revision, messages="Error initializing model"):
62
+ self.model_id = model_id
63
+ self.revision = revision
64
+ super().__init__(f"{messages} id={model_id} revision={revision}")
65
+
66
+
67
+ class SummaryGenerator:
68
+ """A class to generate summaries using a causal language model.
69
+
70
+ Attributes:
71
+ model (str): huggingface/{model_id}
72
+ api_base (str): https://api-inference.huggingface.co/models/{model_id}
73
+ summaries_df (DataFrame): DataFrame to store generated summaries.
74
+ revision (str): Model revision.
75
+ avg_length (float): Average length of summaries.
76
+ answer_rate (float): Rate of non-empty summaries.
77
+ """
78
+
79
+ def __init__(self, model_id, revision):
80
+ """
81
+ Initializes the SummaryGenerator with a model.
82
+
83
+ Args:
84
+ model_id (str): Identifier for the model.
85
+ revision (str): Revision of the model.
86
+ """
87
+ self.model_id = model_id
88
+ self.model = f"huggingface/{model_id}"
89
+ self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
90
+ self.summaries_df = pd.DataFrame()
91
+ self.revision = revision
92
+ self.avg_length = None
93
+ self.answer_rate = None
94
+ self.exceptions = None
95
+ self.local_model = None
96
+
97
+ def generate_summaries(self, dataset, df_prompt, save_path=None):
98
+ """Generate summaries for a given DataFrame of source docs.
99
+ Modify this part to pull the model's generated results.
100
+ Args:
101
+ dataset (str): Path to the Excel workbook of source items; df_prompt (DataFrame): prompt table; save_path (str, optional): CSV cache path.
102
+
103
+ Returns:
104
+ summaries_df (DataFrame): Generated summaries by the model.
105
+ """
106
+ exceptions = []
107
+ if (save_path is not None) and os.path.exists(save_path):
108
+ '''The file already exists, so load the previously generated test outputs'''
109
+ self.summaries_df = pd.read_csv(save_path)
110
+ # print(self.summaries_df['Experiment'])
111
+
112
+ print(f'Loaded generated summaries from {save_path}')
113
+ else:
114
+ '''The test file does not exist, so the specified model must be called to generate it'''
115
+ # prompt = {}
116
+ # for index, row in tqdm(df_prompt.iterrows(), total=df_prompt.shape[0]):
117
+ # prompt['E' + row['Item']] = row['Prompt']
118
+ xls = pd.ExcelFile(dataset)
119
+ sheet_names = xls.sheet_names
120
+ # sheet_names = df.sheetnames
121
+ print(f"Total: {len(sheet_names)}")
122
+ print(sheet_names)
123
+
124
+ item_ID, questions_ID, user_prompt, response = [], [], [], []
125
+
126
+ for i, sheet_name in enumerate(sheet_names[0:2], start=1):
127
+ # Read each worksheet
128
+ df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
129
+
130
+ # Assume the first column is 'Prompt0', but use the column name here to avoid hard-coding
131
+ if 'Prompt0' in df_sheet.columns:
132
+ prompt_column = df_sheet['Prompt0']
133
+ else:
134
+ # If the 'Prompt0' column does not exist, skip this worksheet (or handle it some other way)
135
+ continue
136
+
137
+ # Iterate over the values in the Prompt0 column
138
+ for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=1):
139
+ ID = 'E' + str(i)
140
+ q_ID = ID + '_' + str(j)
141
+
142
+ # print(ID, q_ID, prompt_value)
143
+ for _ in range(2):  # two generations per prompt; '_' avoids clobbering the sheet index i
144
+ system_prompt = envs.SYSTEM_PROMPT
145
+ # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
146
+ _user_prompt = prompt_value
147
+ while True:
148
+ try:
149
+ '''Call the model'''
150
+ print('Calling the LLM API')
151
+
152
+ _response = self.generate_summary(system_prompt, _user_prompt)
153
+ # print(f"Finish index {index}")
154
+ break
155
+ except Exception as e:
156
+ if 'Rate limit reached' in str(e):
157
+ wait_time = 3660
158
+ current_time = datetime.now().strftime('%H:%M:%S')
159
+ print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
160
+ time.sleep(wait_time)
161
+ elif 'is currently loading' in str(e):
162
+ wait_time = 200
163
+ print(f"Model is loading, wait for {wait_time}")
164
+ time.sleep(wait_time)
165
+ elif '429 Resource has been exhausted' in str(e): # for gemini models
166
+ wait_time = 60
167
+ print(f"Quota has reached, wait for {wait_time}")
168
+ time.sleep(wait_time)
169
+ else:
170
+ print(f"Error at index {i}: {e}")
171
+ _response = ""
172
+ exceptions.append(i)
173
+ break
174
+
175
+ item_ID.append(ID)
176
+ questions_ID.append(q_ID)
177
+ user_prompt.append(_user_prompt)
178
+ response.append(_response)
179
+ print(_response)
180
+ # exit()
181
+
182
+ # Sleep to prevent hitting rate limits too frequently
183
+ time.sleep(1)
184
+
185
+ self.summaries_df = pd.DataFrame(list(zip(item_ID, questions_ID, user_prompt, response)),
186
+ columns=["Experiment", "Question_ID", "User_prompt", "Response"])
187
+
188
+ if save_path is not None:
189
+ print(f'Save summaries to {save_path}')
190
+ fpath = Path(save_path)
191
+ fpath.parent.mkdir(parents=True, exist_ok=True)
192
+ self.summaries_df.to_csv(fpath)
193
+
194
+ self.exceptions = exceptions
195
+ # self._compute_avg_length()
196
+ # self._compute_answer_rate()
197
+
198
+ return self.summaries_df
199
+
200
+ def generate_summary(self, system_prompt: str, user_prompt: str):
201
+ # Using Together AI API
202
+ using_together_api = False
203
+ together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm', 'llama-3']
204
+ for together_ai_api_model in together_ai_api_models:
205
+ if together_ai_api_model in self.model_id.lower():
206
+ using_together_api = True
207
+ break
208
+ # print('Which LLM applies', together_ai_api_model, using_together_api)
209
+ # print(self.model_id.lower()) #meta-llama/llama-2-7b-chat-hf
210
+ # print('local', self.local_model)  # None
211
+ # exit()
212
+ # if 'mixtral' in self.model_id.lower() or 'dbrx' in self.model_id.lower() or 'wizardlm' in self.model_id.lower(): # For mixtral and dbrx models, use Together AI API
213
+ if using_together_api:
214
+ # suffix = "completions" if ('mixtral' in self.model_id.lower() or 'base' in self.model_id.lower()) else "chat/completions"
215
+ suffix = "chat/completions"
216
+ url = f"https://api.together.xyz/v1/{suffix}"
217
+
218
+ payload = {
219
+ "model": self.model_id,
220
+ # "max_tokens": 4096,
221
+ 'max_new_tokens': 250,
222
+ "temperature": 0.0,
223
+ # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
224
+ }
225
+ # if 'mixtral' in self.model_id.lower():
226
+ # # payload['prompt'] = user_prompt
227
+ # # payload['prompt'] = "Write a summary of the following passage:\nPassage:\n" + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
228
+ # payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
229
+ # print(payload)
230
+ # else:
231
+ # payload['messages'] = [{"role": "system", "content": system_prompt},
232
+ # {"role": "user", "content": user_prompt}]
233
+ payload['messages'] = [{"role": "system", "content": system_prompt},
234
+ {"role": "user", "content": user_prompt}]
235
+ headers = {
236
+ "accept": "application/json",
237
+ "content-type": "application/json",
238
+ "Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"
239
+ }
240
+
241
+ response = requests.post(url, json=payload, headers=headers)
242
+ try:
243
+ result = json.loads(response.text)
244
+ # print(result)
245
+ result = result["choices"][0]
246
+ if 'message' in result:
247
+ result = result["message"]["content"].strip()
248
+ else:
249
+ result = result["text"]
250
+ result_candidates = [candidate for candidate in result.split('\n\n') if len(candidate) > 0]
251
+ result = result_candidates[0]
252
+ print(result)
253
+ except:
254
+ print(response)
255
+ result = ''
256
+ print(result)
257
+ return result
258
+
259
+ # Using OpenAI API
260
+ elif 'gpt' in self.model_id.lower():
261
+ response = litellm.completion(
262
+ model=self.model_id.replace('openai/',''),
263
+ messages=[{"role": "system", "content": system_prompt},
264
+ {"role": "user", "content": user_prompt}],
265
+ temperature=0.0,
266
+ max_tokens=250,
267
+ )
268
+ result = response['choices'][0]['message']['content']
269
+ print(result)
270
+ return result
271
+
272
+ # Using Google AI API for Gemini models
273
+ elif 'gemini' in self.model_id.lower():
274
+ genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
275
+ generation_config = {
276
+ "temperature": 0,
277
+ "top_p": 0.95, # cannot change
278
+ "top_k": 0,
279
+ "max_output_tokens": 250,
280
+ # "response_mime_type": "application/json",
281
+ }
282
+ safety_settings = [
283
+ {
284
+ "category": "HARM_CATEGORY_HARASSMENT",
285
+ "threshold": "BLOCK_NONE"
286
+ },
287
+ {
288
+ "category": "HARM_CATEGORY_HATE_SPEECH",
289
+ "threshold": "BLOCK_NONE"
290
+ },
291
+ {
292
+ "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
293
+ "threshold": "BLOCK_NONE"
294
+ },
295
+ {
296
+ "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
297
+ "threshold": "BLOCK_NONE"
298
+ },
299
+ ]
300
+ model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else self.model_id.lower().split('google/')[-1],
301
+ generation_config=generation_config,
302
+ system_instruction=system_prompt,
303
+ safety_settings=safety_settings)
304
+ convo = model.start_chat(history=[])
305
+ convo.send_message(user_prompt)
306
+ # print(convo.last)
307
+ result = convo.last.text
308
+ print(result)
309
+ return result
310
+
311
+ # Using HF API or download checkpoints
312
+ elif self.local_model is None:
313
+ # print(self.model_id)
314
+ # exit()
315
+ try: # try use HuggingFace API
316
+ response = litellm.completion(
317
+ model='command-r-plus' if 'command' in self.model_id else self.model_id,
318
+ messages=[{"role": "system", "content": system_prompt},
319
+ {"role": "user", "content": user_prompt}],
320
+ temperature=0.0,
321
+ max_tokens=1024,
322
+ api_base=self.api_base,
323
+ )
324
+ result = response['choices'][0]['message']['content']
325
+ print(result)
326
+ return result
327
+ # exit()
328
+ except: # fail to call api. run it locally.
329
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
330
+ print("Tokenizer loaded")
331
+ self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
332
+ print("Local model loaded")
333
+ # exit()
334
+ # Using local model
335
+ if self.local_model: # cannot call API. using local model
336
+ messages=[
337
+ {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
338
+ {"role": "user", "content": user_prompt}
339
+ ]
340
+ try: # some models support pipeline
341
+ pipe = pipeline(
342
+ "text-generation",
343
+ model=self.local_model,
344
+ tokenizer=self.tokenizer,
345
+ )
346
+
347
+ generation_args = {
348
+ "max_new_tokens": 250,
349
+ "return_full_text": False,
350
+ "temperature": 0.0,
351
+ "do_sample": False,
352
+ }
353
+
354
+ output = pipe(messages, **generation_args)
355
+ result = output[0]['generated_text']
356
+ print(result)
357
+ except:
358
+ prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
359
+ print(prompt)
360
+ input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
361
+ with torch.no_grad():
362
+ outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
363
+ result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
364
+ result = result.replace(prompt, '')
365
+ print(result)
366
+ return result
367
+
368
+ def _compute_avg_length(self):
369
+ """
370
+ Compute the average length of non-empty summaries using SpaCy.
371
+ """
372
+ total_word_count = 0
373
+ total_count = 0
374
+
375
+ for summary in self.summaries_df['summary']:
376
+ if util.is_summary_valid(summary):
377
+ doc = nlp(summary)
378
+ words = [token.text for token in doc if token.is_alpha]
379
+ total_word_count += len(words)
380
+ total_count += 1
381
+
382
+ self.avg_length = 0 if total_count == 0 else total_word_count / total_count
383
+
384
+ def _compute_answer_rate(self):
385
+ """
386
+ Compute the rate of non-empty summaries.
387
+ """
388
+ valid_count = sum(1 for summary in self.summaries_df['summary']
389
+ if util.is_summary_valid(summary))
390
+
391
+ total_count = len(self.summaries_df)
392
+
393
+ self.answer_rate = 0 if total_count == 0 else valid_count / total_count
394
+
395
+
396
+ class EvaluationModel:
397
+ """A class to evaluate generated summaries.
398
+
399
+ Attributes:
400
+ model (CrossEncoder): The evaluation model.
401
+ scores (list): List of evaluation scores.
402
+ accuracy (float): Accuracy of the summaries.
403
+ hallucination_rate (float): Rate of hallucination in summaries.
404
+ """
405
+
406
+ def __init__(self, model_path):
407
+ """
408
+ Initializes the EvaluationModel with a CrossEncoder model.
409
+
410
+ Args:
411
+ model_path (str): Path to the CrossEncoder model.
412
+ """
413
+ self.model = load_evaluation_model(model_path)
414
+ self.scores = []
415
+ self.factual_consistency_rate = None
416
+ self.hallucination_rate = None
417
+ self.humanlike_score = None
418
+
419
+ def code_results(self, summaries_df):
420
+ '''Code the results from the LLM's responses'''
421
+ output = []
422
+ '''item1'''
423
+ # print(len(summaries_df['Experiment']),len(summaries_df['Response']))
424
+ # exit()
425
+ for i in range(len(summaries_df['Experiment'])):
426
+ # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
427
+ if summaries_df["Experiment"][i] == "E1":
428
+ if summaries_df["Response"][i].strip() == "Round":
429
+ # vote_1_1 += 1
430
+ output.append("Round")
431
+ elif summaries_df["Response"][i].strip() == "Spiky":
432
+ output.append("Round")
433
+ else:
434
+ output.append("NA")
435
+ # print()
436
+
437
+ '''item2'''
438
+ # vote_2_1, vote_2_2, vote_2_3 = 0, 0, 0
439
+ male_keyword = ["he", "his", "himself"]
440
+ female_keyword = ["she", "her", "herself"]
441
+ if summaries_df["Experiment"][i] == "E2":
442
+ rs = summaries_df["Response"][i].strip()
443
+ rs = rs.split(' ')
444
+ male, female = 0, 0
445
+ for word in rs:
446
+ if word in female_keyword and male != 1:
447
+ female = 1
448
+ output.append("Female")
449
+ break
450
+ if word in male_keyword and female != 1:
451
+ male = 1
452
+ output.append("Male")
453
+ break
454
+ if male == 0 and female == 0 :
455
+ output.append("NA")
456
+ '''item3'''
457
+ '''item4'''
458
+ '''item5'''
459
+ '''item6'''
460
+
461
+ '''item7'''
462
+ if summaries_df["Experiment"][i] == "E7":
463
+ rs = summaries_df["Response"][i].strip()
464
+ if rs == "No":
465
+ output.append("0")
466
+ elif rs == "Yes":
467
+ output.append("1")
468
+ else:
469
+ output.append("NA")
470
+ '''item8'''
471
+ if summaries_df["Experiment"][i] == "E8":
472
+ rs = summaries_df["Response"][i].strip()
473
+ if rs == "Something is wrong with the question":
474
+ output.append("1")
475
+ else:
476
+ output.append("0")
477
+ '''item9'''
478
+ if summaries_df["Experiment"][i] == "E9":
479
+ rs = summaries_df["Response"][i].strip()
480
+
481
+
482
+ '''item10'''
483
+ if summaries_df["Experiment"][i] == "E10":
484
+ rs = summaries_df["Response"][i].strip()
485
+ if rs == "Yes":
486
+ output.append("1")
487
+ else:
488
+ output.append("0")
489
+
490
+
491
+
492
+
493
+
494
+ '''Are there different question types, and how should they be scored?'''
495
+ def evaluate_humanlike(self, summaries_df, human_data_path):
496
+ '''
497
+ evaluate humanlike score
498
+ 1. code the result
499
+ 2. compute the similarities between human and model
500
+ process model responses'''
501
+ human_df = pd.read_csv(human_data_path)
502
+ self.code_results(summaries_df)
503
+ return 9.00
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+ def evaluate_hallucination(self, summaries_df):
524
+ """
525
+ Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
526
+ of the instance with the computed scores.
527
+
528
+ Args:
529
+ summaries_df (DataFrame): DataFrame containing source docs and summaries.
530
+
531
+ Returns:
532
+ list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
533
+ """
534
+ hem_scores = []
535
+ sources = []
536
+ summaries = []
537
+ source_summary_pairs = util.create_pairs(summaries_df)
538
+ '''Evaluate the model's results'''
539
+ for doc, summary in tqdm(source_summary_pairs, desc="Evaluating Humanlikeness"):
540
+ if util.is_summary_valid(summary):
541
+ try:
542
+ summary = summary.replace('<bos>','').replace('<eos>','')
543
+ score = self.model.predict([doc, summary])# [0]
544
+ if not isinstance(score, float):
545
+ try:
546
+ score = score.item()
547
+ except:
548
+ logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
549
+ continue
550
+ hem_scores.append(score)
551
+ sources.append(doc)
552
+ summaries.append(summary)
553
+ except Exception as e:
554
+ logging.error(f"Error while running HEM: {e}")
555
+ raise
556
+
557
+ self.scores = hem_scores
558
+ eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
559
+ return hem_scores, eval_results
560
+ # for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
561
+ # if util.is_summary_valid(summary):
562
+ # try:
563
+ # # summary_pieces = summary.split('\n')
564
+ # # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
565
+ # summary = summary.replace('<bos>','').replace('<eos>','')
566
+ # # print([doc, summary])
567
+ # # print(self.model.predict([doc, summary]))
568
+ # score = self.model.predict([doc, summary])# [0]
569
+ # if not isinstance(score, float):
570
+ # try:
571
+ # score = score.item()
572
+ # except:
573
+ # logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
574
+ # continue
575
+ # hem_scores.append(score)
576
+ # sources.append(doc)
577
+ # summaries.append(summary)
578
+ # except Exception as e:
579
+ # logging.error(f"Error while running HEM: {e}")
580
+ # raise
581
+
582
+ # self.scores = hem_scores
583
+ # eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
584
+ # return hem_scores, eval_results
585
+
586
+
587
+ def compute_factual_consistency_rate(self, threshold=0.5):
588
+ """
589
+ Compute the factual consistency rate of the evaluated summaries based on
590
+ the previously calculated scores. This method relies on the 'scores'
591
+ attribute being populated, typically via the 'evaluate_hallucination' method.
592
+
593
+ Returns:
594
+ float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
595
+ and 'hallucination_rate' attributes of the instance.
596
+
597
+ Raises:
598
+ ValueError: If scores have not been calculated prior to calling this method.
599
+ """
600
+ if not self.scores:
601
+ error_msg = "Scores not calculated. Call evaluate_hallucination() first."
602
+ logging.error(error_msg)
603
+ raise ValueError(error_msg)
604
+
605
+ # Use threshold of 0.5 to compute factual_consistency_rate
606
+ num_above_threshold = sum(score >= threshold for score in self.scores)
607
+ num_total = len(self.scores)
608
+
609
+ if not num_total:
610
+ raise ValueError("No scores available to compute factual consistency rate.")
611
+
612
+ self.factual_consistency_rate = (num_above_threshold / num_total) * 100
613
+ self.hallucination_rate = 100 - self.factual_consistency_rate
614
+
615
+ return self.factual_consistency_rate
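A minimal sketch of using SummaryGenerator and EvaluationModel directly, outside the Evaluator wrapper; the model id is a placeholder and the envs.* constants are assumed to be configured as in src/envs.py:

# Hedged sketch: generate responses for the Excel dataset, then compute the humanlike score.
import pandas as pd
import src.envs as envs
from src.backend.model_operations import SummaryGenerator, EvaluationModel

generator = SummaryGenerator("meta-llama/Llama-2-7b-chat-hf", revision="main")  # placeholder id
prompts = pd.read_csv("src/datasets/prompt.csv")  # prompt table; unused by the current code path
responses_df = generator.generate_summaries(envs.DATASET_PATH, prompts,
                                            save_path="generation_results/example.csv")

scorer = EvaluationModel(envs.HEM_PATH)
humanlike = scorer.evaluate_humanlike(responses_df, envs.HUMAN_DATA)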
src/backend/run_eval_suite.py ADDED
@@ -0,0 +1,76 @@
1
+ import json
2
+ import os
3
+ import logging
4
+ from datetime import datetime
5
+
6
+ import src.envs as envs
7
+ from src.backend.manage_requests import EvalRequest
8
+ from src.backend.evaluate_model import Evaluator
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO,
12
+ format='%(asctime)s - %(levelname)s - %(message)s')
13
+ logging.getLogger("openai").setLevel(logging.WARNING)
14
+
15
+
16
+ def run_evaluation(eval_request: EvalRequest, batch_size, device,
17
+ local_dir: str, results_repo: str, no_cache=True, limit=None,
18
+ need_check=True, write_results=False):
19
+ """
20
+ Run the evaluation for a given model and upload the results.
21
+
22
+ Args:
23
+ eval_request (EvalRequest): The evaluation request object containing model details.
24
+ num_fewshot (int): Number of few-shot examples.
25
+ batch_size (int): Batch size for processing.
26
+ device (str): The device to run the evaluation on.
27
+ local_dir (str): Local directory path for saving results.
28
+ results_repo (str): Repository ID where results will be uploaded.
29
+ no_cache (bool): Whether to disable caching.
30
+ limit (int, optional): Limit on the number of items to process. Use with caution.
31
+
32
+ Returns:
33
+ dict: A dictionary containing evaluation results.
34
+ """
35
+ if limit:
36
+ logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
37
+
38
+ output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
39
+ # if os.path.exists(output_folder):
40
+ # f_name = os.listdir(output_folder)[-1]
41
+ # print(f"Loading results from {os.path.join(output_folder, f_name)}")
42
+ # results = json.loads(os.path.join(output_folder, f_name))
43
+ # dumped = json.dumps(results, indent=2)
44
+ # logging.info(dumped)
45
+ # else:
46
+ try:
47
+ evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
48
+ batch_size, device, no_cache, limit, write_out=True,
49
+ output_base_path='logs')
50
+ results = evaluator.evaluate()
51
+ if write_results:
52
+ evaluator.write_results()
53
+ except Exception as e:
54
+ logging.error(f"Error during evaluation: {e}")
55
+ raise
56
+
57
+ dumped = json.dumps(results, indent=2)
58
+ logging.info(dumped)
59
+
60
+ output_path = os.path.join(output_folder,
61
+ f"results_{datetime.now()}.json") #
62
+ os.makedirs(output_folder, exist_ok=True)
63
+ with open(output_path, "w") as f:
64
+ f.write(dumped)
65
+ print(f"Results have been saved to{output_path}")
66
+
67
+ if not need_check:
68
+ print("Path in the repo:", f"{eval_request.model}/results_{datetime.now()}.json")
69
+ envs.API.upload_file(
70
+ path_or_fileobj=output_path,
71
+ path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
72
+ repo_id=results_repo,
73
+ repo_type="dataset",
74
+ )
75
+
76
+ return results
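A minimal sketch of calling run_evaluation for a single request; repo names and paths are placeholders, and need_check=True keeps the results local instead of uploading them:

# Hedged sketch: evaluate one model and write its results JSON under ./eval-results.
from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite import run_evaluation

request = EvalRequest(model="org/some-model", status="RUNNING", precision="float16")
results = run_evaluation(request, batch_size=1, device="cuda",
                         local_dir="./eval-results",
                         results_repo="org/results-dataset",   # placeholder repo id
                         no_cache=True, limit=None,
                         need_check=True, write_results=False)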
src/backend/sort_queue.py ADDED
@@ -0,0 +1,27 @@
1
+ from dataclasses import dataclass
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ from src.backend.manage_requests import EvalRequest
6
+
7
+
8
+ @dataclass
9
+ class ModelMetadata:
10
+ likes: int = 0
11
+ size: int = 15
12
+
13
+
14
+ def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
15
+ private_models = [model for model in models if model.private]
16
+ public_models = [model for model in models if not model.private]
17
+
18
+ return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
19
+
20
+ def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
21
+ return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
22
+
23
+ def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
24
+ return sorted(eval_requests, key=lambda x: x.params, reverse=False)
25
+
26
+ def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
27
+ return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
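A small sketch of the queue ordering produced by sort_models_by_priority: private requests come first, and each group is ordered by submission date (oldest first). The model ids and dates are made up:

# Hedged sketch: one private and two public requests.
from huggingface_hub import HfApi
from src.backend.manage_requests import EvalRequest
from src.backend.sort_queue import sort_models_by_priority

requests = [
    EvalRequest(model="org/public-old", status="PENDING", submitted_time="2024-01-01T00:00:00"),
    EvalRequest(model="org/private-new", status="PENDING", private=True,
                submitted_time="2024-03-01T00:00:00"),
    EvalRequest(model="org/public-new", status="PENDING", submitted_time="2024-02-01T00:00:00"),
]
ordered = sort_models_by_priority(HfApi(), requests)
print([r.model for r in ordered])
# ['org/private-new', 'org/public-old', 'org/public-new']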
src/backend/util.py ADDED
@@ -0,0 +1,78 @@
1
+ def is_summary_valid(summary: str) -> bool:
2
+ """
3
+ Checks if the summary is valid.
4
+
5
+ A summary is valid if it is not empty and contains at least five words.
6
+
7
+ Args:
8
+ summary (str): The summary to check.
9
+
10
+ Returns:
11
+ bool: True if the summary is valid, False otherwise.
12
+ """
13
+ if isinstance(summary, str):
14
+ words = summary.split()
15
+ if len(words) >= 5:
16
+ return True
17
+ # print(summary)
18
+ return False
19
+
20
+
21
+ def create_pairs(df):
22
+ """
23
+ Creates pairs of source and summary from the dataframe.
24
+
25
+ Args:
26
+ df (DataFrame): The dataframe containing source and summary columns.
27
+
28
+ Returns:
29
+ list: A list of pairs [source, summary].
30
+ """
31
+ pairs = []
32
+ for _, row in df.iterrows():
33
+ pairs.append([row['source'], row['summary']])
34
+
35
+ return pairs
36
+
37
+
38
+ def format_results(model_name: str, revision: str, precision: str,
39
+ factual_consistency_rate: float, hallucination_rate: float,
40
+ answer_rate: float, avg_summary_len: float) -> dict:
41
+ """
42
+ Formats the evaluation results into a structured dictionary.
43
+
44
+ Args:
45
+ model_name (str): The name of the evaluated model.
46
+ revision (str): The revision hash of the model.
47
+ precision (str): The precision with which the evaluation was run.
48
+ factual_consistency_rate (float): The factual consistency rate.
49
+ hallucination_rate (float): The hallucination rate.
50
+ answer_rate (float): The answer rate.
51
+ avg_summary_len (float): The average summary length.
52
+
53
+ Returns:
54
+ dict: A dictionary containing the structured evaluation results.
55
+ """
56
+ results = {
57
+ "config": {
58
+ "model_dtype": precision, # Precision with which you ran the evaluation
59
+ "model_name": model_name, # Name of the model
60
+ "model_sha": revision # Hash of the model
61
+ },
62
+ "results": {
63
+ "hallucination_rate": {
64
+ "hallucination_rate": round(hallucination_rate,1)
65
+ },
66
+ "factual_consistency_rate": {
67
+ "factual_consistency_rate": round(factual_consistency_rate,1)
68
+ },
69
+ "answer_rate": {
70
+ "answer_rate": round(answer_rate*100,1)
71
+ },
72
+ "average_summary_length": {
73
+ "average_summary_length": round(avg_summary_len,1)
74
+ },
75
+ }
76
+ }
77
+
78
+ return results
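A quick sketch of format_results and the shape of the dictionary it returns; the numbers are illustrative only:

# Hedged sketch: format made-up metrics for one model.
from src.backend.util import format_results

results = format_results(model_name="org/some-model", revision="abc123",
                         precision="float16",
                         factual_consistency_rate=91.2, hallucination_rate=8.8,
                         answer_rate=0.98, avg_summary_len=64.3)
print(results["results"]["answer_rate"]["answer_rate"])   # 98.0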
src/datasets/Items.xlsx ADDED
Binary file (92.9 kB). View file
 
src/datasets/Material_Llama2_0603.xlsx ADDED
Binary file (147 kB). View file
 
src/datasets/human_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/datasets/human_data.xlsx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4995d145fea0d7fdbf9e25cb1fa0d05f2d30eadc0da79e9bb1964ccce3672d7
3
+ size 1597107
src/datasets/leaderboard_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/datasets/prompt.csv ADDED
@@ -0,0 +1,11 @@
1
+ Item,Condition,Stimuli,V1,Unnamed: 4,Unnamed: 5,Unnamed: 6,Prompt,SystemPrompt,Unnamed: 9,Unnamed: 10,Unnamed: 11,Instruction
2
+ 1,Round,baamoo,Round,"In this task, you will see a novel word. Assuming that the word refers to a shape, we'd like you to guess whether the novel word refers to a round or spiky shape by saying ""Round"" or ""Spiky"".","Please respond only with ""Round"" or ""Spicky""; don’t ask any questions or give any other information.",Please guess whether the following novel word refers to a round or spiky shape:,"In this task, you will see a novel word. Assuming that the word refers to a shape, we'd like you to guess whether the novel word refers to a round or spiky shape by saying ""Round"" or ""Spiky"".\n\nPlease respond only with ""Round"" or ""Spicky""; don’t ask any questions or give any other information.\n\nPlease guess whether the following novel word refers to a round or spiky shape:\nbaamoo",You are a participant of a psycholinguistic experiment. You will do a task on English language use.,<s>[INST] <<SYS>>\n,\n<</SYS>>\n\n,[/INST],"<s>[INST] <<SYS>>\nYou are a participant of a psycholinguistic experiment. You will do a task on English language use.\n<</SYS>>\n\nIn this task, you will see a novel word. Assuming that the word refers to a shape, we'd like you to guess whether the novel word refers to a round or spiky shape by saying ""Round"" or ""Spiky"".\n\nPlease respond only with ""Round"" or ""Spicky""; don’t ask any questions or give any other information.\n\nPlease guess whether the following novel word refers to a round or spiky shape:\nbaamoo[/INST]"
3
+ 2,,,,,,,"In this task, you will see a sentence fragment; please repeat the fragment and continue it into a full sentence.\n\nFor instance, if you see ""The boy went to the park ..."", you can say ""The boy went to the park to fly a kite"".\n\nPlease respond only with your completed sentence; don’t ask any questions or give any other information.\n\nPlease repeat the following fragment and complete it into a full sentence:\nAlthough Pelcra was sick …",,,,,
4
+ 3,,,,,,,"In this task, you will read a sentence fragment, with two words for completing the fragment. Please choose the word that you think best completes the fragment.\n\nPlease respond only with your chosen word; don’t ask any questions or give any other information.\n\nHere’s the sentence fragment:\nSusan was very bad at algebra, so she hated...\n\nPlease choose the word from the following options that you prefer to complete the fragment:\nmathematics\nmath",,,,,
5
+ 4,,,,,,,"In this task, I would like to present you with five sentences. Please just carefully read the sentences; you don't have to do anything with them.\n\nHere are the sentences:\nSENTENCE1: The curious cat silently watched the busy people from atop the old wooden fence.\nSENTENCE2: The man accepted the post in the accountancy firm.\nSENTENCE3: She decided to take a different path through the park, enjoying the unexpected quiet.\nSENTENCE4: He found an old book in his attic that contained stories of ancient heroes and legends.\nSENTENCE5: They often spent their evenings by the lake, listening to the soothing sounds of nature.\n\nNext, I am going to present some words to you; upon reading each word, please provide ONLY ONE word/phrase as an associate.\n\nFor instance, if the word you see is ""milk"", you can provide ""breakfast"" or ""cow"" as an associate.\n\nPlease respond only with the associate words in order; separate them with semicolons; don’t ask any questions or give any other information.\n\nHere are the words:\nWORD1: bottle\nWORD2: cloud\nWORD3: blanket\nWORD4: paper\nWORD5: post",,,,,
6
+ 5,,,,,,,"In this task, you will see a sentence fragment; please repeat the fragment and continue it into a full sentence.\n\nFor instance, if you see ""The boy went to the park ..."", you can type ""The boy went to the park to fly a kite"".\n\nPlease respond only with the two completed sentences in order, separated by two line breaks; don’t ask any questions or give any other information.\n\nHere are two fragments for you to complete:\nFRAGMENT1: The captain lent the spare lifejacket...\nFRAGMENT2: The bus driver gave...",,,,,
7
+ 6,,,,,,,"In this task, you will read a short passage containing several sentences, followed by a question about the passage. Please answer the question according to the passage.\n\nFor instance, if you read ""There was a tiger and a fox. The tiger ate the fox because it was hungry. Who was hungry?"", you can say ""the tiger"" as an answer.\n\nPlease respond only with your answer; don’t ask any questions or give any other information.\n\nPlease answer the question according to preceding passage:\nPASSAGE: There was a farmer and a thief. The farmer injured the thief with a staff only a few days ago.\nQUESTION: Who had a staff, the farmer or the injured thief?",,,,,
8
+ 7,,,,,,,"In this task, you will read a sentence, followed by a comprehension question. Please choose ""Yes"" or ""No"" to answer the question.\n\nPlease answer the question in this format without any other words:[ANSWER].\n\nRead the sentence and answer the question:\nSENTENCE: The sister mailed the letter the niece.\nQUESTION: Did the niece receive something/someone?",,,,,
9
+ 8,,,,,,,"In this task, I want you to answer a question.\n\nYou may encounter a question which has something wrong with it. For example, you might see the question: ""When was President Gerald Ford forced to resign his office? "" The thing that is wrong in the question is that Ford wasn't forced to resign. When you see a question like this, just say ""Something is wrong with the question"".\n\nPlease respond only with your answer; don’t ask any questions or give any other information.\n\nPlease answer the question:\nRegina is the capital of what Canadian prairie city?",,,,,
10
+ 9,,,,,,,"In this task, you will see a sentence fragment; please repeat the fragment and continue it into a full sentence.\n\nFor instance, if you see ""The boy went to the park ..."", you can type ""The boy went to the park to fly a kite"".\n\nPlease respond only with your completed sentence; don’t ask any questions or give any other information.\n\nRead the sentence fragment and continue it into a full sentence:\nGary scared Anna because…",,,,,
11
+ 10,,,,,,,"In this task, you will read a short passage and answer a yes/no question regarding the passage. Please say ""Yes"", ""No"", or ""Don't know"" to answer.\n\nPlease respond only with ""Yes"", ""No"", or ""Don't know""; don’t ask any questions or give any other information.\n\nPlease respond to the question according to the preceding passage:\nWhile swimming in the shallow water near the rocks, Sharon stepped on a piece of glass. She called desperately for help, but there was no one around to hear her. Did she cut her foot?",,,,,
src/datasets/prompt.xlsx ADDED
Binary file (12.2 kB). View file
 
src/datasets/sample_dataset.csv ADDED
@@ -0,0 +1,11 @@
1
+ text,dataset
2
+ "Paul Merson has restarted his row with Andros Townsend after the Tottenham midfielder was brought on with only seven minutes remaining in his team's 0-0 draw with Burnley on Sunday. 'Just been watching the game, did you miss the coach? #RubberDub #7minutes,' Merson put on Twitter. Merson initially angered Townsend for writing in his Sky Sports column that 'if Andros Townsend can get in (the England team) then it opens it up to anybody.' Paul Merson had another dig at Andros Townsend after his appearance for Tottenham against Burnley . Townsend was brought on in the 83rd minute for Tottenham as they drew 0-0 against Burnley . Andros Townsend scores England's equaliser in their 1-1 friendly draw with Italy in Turin on Tuesday night . The former Arsenal man was proven wrong when Townsend hit a stunning equaliser for England against Italy and he duly admitted his mistake. 'It's not as though I was watching hoping he wouldn't score for England, I'm genuinely pleased for him and fair play to him – it was a great goal,' Merson said. 'It's just a matter of opinion, and my opinion was that he got pulled off after half an hour at Manchester United in front of Roy Hodgson, so he shouldn't have been in the squad. 'When I'm wrong, I hold my hands up. I don't have a problem with doing that - I'll always be the first to admit when I'm wrong.' Townsend hit back at Merson on Twitter after scoring for England against Italy . Sky Sports pundit Merson (centre) criticised Townsend's call-up to the England squad last week . Townsend hit back at Merson after netting for England in Turin on Wednesday, saying 'Not bad for a player that should be 'nowhere near the squad' ay @PaulMerse?' Any bad feeling between the pair seemed to have passed but Merson was unable to resist having another dig at Townsend after Tottenham drew at Turf Moor.",summeval_valid
3
+ "Chelsea have made an offer for FC Tokyo's 22-year-old forward Yoshinori Muto, according to club president Naoki Ogane. The Japan international, who has played for the J-League side since 2013, will join Chelsea's Dutch partner club Vitesse Arnhem on loan next season if he completes a move to Stamford Bridge this summer. Ogane claims that Chelsea's interest in Muto is not connected to the £200million sponsorship deal they signed with Japanese company Yokohama Rubber in February. FC Tokyo forward Yoshinori Muto (centre) brings the ball forward against Albirex Niigata in March . FC Tokyo president Naoki Ogane claims that Chelsea have made a bid for Japan international Muto . Muto tussles with Yuji Nakazawa of Yokohama F.Marinos during a J-League clash last month . Age: 22 . Club: FC Tokyo . Appearances: 37 . Goals: 16 . International caps (Japan): 11 . International goals: 1 . Did you know? Muto graduated from Keio University in Tokyo with an economics degree two weeks ago . Speaking to Sports Nippon, Ogane said: 'It is true that Chelsea sent us an offer for Muto. 'It is a formal offer with conditions. They want to acquire him in the summer.' Muto, who only graduated from Keio University a fortnight ago after completing an economics degree, would be the first Japanese player to represent Chelsea if he moves to west London. He has earned 11 caps for his country after signing his first professional contract in 2014, scoring once for the Samurai Blue. A £4million deal for the youngster has been mooted, but Muto admits that he isn't sure if he will join the Premier League title chasers despite being pleased with their bid. He said: 'I have not decided yet at all. It is an honour for me to receive this offer from a great club.' Muto scored 13 times in his debut season with FC Tokyo and was named in the J-League's best XI. Muto admits it is an 'honour' to receive an offer from Chelsea although he has not yet decided to join the club . Muto, pictured in action against Ventforet Kofu has scored three goals in four games so far this season . The 22-year-old has a shot at goal during Japan's Asian Cup match against Palestine in January this year . He has continued his fine form during the current campaign, helping his club to third place in the division with three goals in four games. Yokohama Rubber - one of the world's largest tyre manufacturers - will become Chelsea's official shirt sponsors from the start of the 2015-16 season. The initial five-year deal is the biggest in the club's history, with the Blues now considering a two-week pre-season tour of Japan this summer.",summeval_valid
4
+ "Babies given antibiotics in the first six months of life are more likely to be fat as toddlers, a large-scale study has found. The researchers said say that just as antibiotics are used to make farm animals put on weight, the may also be fattening our children. Writing in the respected medical journal Pediatrics, they said that the widely-prescribed drugs could be contributing to the obesity epidemic. A third of 10-11 year olds and more than a fifth of 4-5 year olds in England are overweight or obese, leading to fears that today's generation will be the first to die at an earlier age than their parents. Obesity: Babies given antibiotics in the first six months of life are more likely to be fat as toddlers, a large-scale study has found (file photo) The Finnish researchers compared the weight and height of more than 12,000 healthy two year old with records on antibiotic prescription. By two years-old, one in five boys and one in ten girls was overweight or obese. And children who had taken antibiotics as young babies were particularly likely to be overweight. Repeated prescriptions before the age of two also raised the odds of being a fat toddler. Boys seemed particularly prone weight gain after being given antibiotics. They were also slightly taller than boys who hadn't been given the drugs. The study didn't prove that antibiotics were causing weight gain. But if they do, it may be because they kill of bugs in the gut that would normally feed on some of the food eaten. This frees up more food for the body. Killing certain gut bugs may also increase appetite. Lead author Dr Antti Saari, of Kuopio University Hospital, warned: 'Antibiotic exposure before six months of age, or repeatedly during infancy, was associated with increased body mass in healthy children. 'Such effects may play a role in the worldwide childhood obesity epidemic and highlight the importance of judicious use of antibiotics in infancy. The worldwide obesity epidemic is real, and is more pronounced for boys. 'Epidemic': By two years-old, one in five boys and one in ten girls was overweight or obese and children who had taken antibiotics as young babies were particularly likely to be overweight (file photo) 'An increase in the use of antibiotics could be an additional contributing factor to the development of excess weight problems. 'The crucial role of antibiotics in the improvement of human health is unquestionable but their extended use today has undesirable and unexpected consequences.' Previous research has found that babies given antibiotics are at higher risk of eczema and digestive problems. The studies come amid growing concern that the over-prescription of antibiotics is leading to the pills losing their power and making common infections harder to treat. The Government has warned that a new superbug could infect up to 200,000 Britons and kill 80,000 in a single outbreak.",summeval_valid
5
+ "This is the embarrassing moment a Buckingham Palace guard slipped and fell on a manhole cover in front of hundreds of shocked tourists as he took up position in his sentry box. The Queen's Guard was left red-faced after he slipped on a manhole cover during the popular Changing of the Guard - and unfortunately for him the entire incident was caught on camera. He lost his footing and slid sideways, knocking his bearskin on the side of the box and dropping his rifle. The Queen's Guard (pictured) slipped on a manhole cover during the popular Changing of the Guard at Buckingham Palace last week. Unfortunately for him, the entire incident was caught on a tourist's camera . The embarrassed soldier quickly scrambled to his feet as his colleagues marched past as if nothing had happened. But the young guard started to blush as he peered at the crowd from under his bearskin and realised how many people had seen his slapstick moment. Holidaymaker David Meadwell recorded the unscheduled manouevre outside Buckingham Palace on Thursday afternoon. Mr Meadwell, 50, from Newcastle-upon-Tyne, said: 'I was with my family for a trip to London and I thought I'd take some pictures of the changing of the guards. Tourist David Meadwell shot this footage of the Changing of the Guard last week when the incident unfolded . The moment it all started to go wrong: The guard leans heavily to the side as he appears to slip . The unidentified young guard's legs appear to get a bit tangled as he tries to turn to march away . The guard, wearing full regalia, falls heavily to the floor still clutching his rifle following the slip up . 'The first group changed successfully, without any problems, and so I decided to video the next group doing it. 'I didn't expect anything like this to happen - he went flying. There were quite a few people around and there were a lot of gasps as he went down. 'I think he just slipped on a manhole cover, he looked so embarrassed.' The unnamed solider is thought to have slipped because of the metal protectors nailed to the soles of his boots. Tourists gather in their thousands to watch the changing of the guard outside Buckingham Palace at 11.30am every day in the summer and every other day in the winter. The Guard comprises two detachments, one each for Buckingham Palace and St James's Palace, under the command of the Captain of The Queen's Guard. Contrary to popular belief they are not purely ceremonial and are fully operational soldiers. The Ministry of Defence said they would not comment on 'a young man falling over while doing his job'. The embarrassed guard hastily scrambled to his feet following the unfortunate tumble at the palace . The incident took place in front of hundreds of tourists who were watching the Changing of the Guard .",summeval_valid
6
+ "(CNN)One of the biggest TV events of all time is being reimagined for new audiences. ""Roots,"" the epic miniseries about an African-American slave and his descendants, had a staggering audience of over 100 million viewers back in 1977. Now A&E networks are remaking the miniseries, to air in 2016. A&E, Lifetime and History (formerly the History Channel) announced Thursday that the three networks would simulcast a remake of the saga of Kunta Kinte, an African who was captured, shipped to America and sold into slavery to work on a Virginia plantation. LeVar Burton, who portrayed Kinte in the original, will co-executive produce the new miniseries. A press release describes the new version as ""original"" and ""contemporary"" and will draw more from Alex Haley's classic novel, ""Roots: The Saga of an American Family."" Producers will consult scholars in African and African-American history for added authenticity. ""We are proud to bring this saga to fans of the original, as well as to a new generation that will experience this powerful and poignant tale for the first time,"" said Dirk Hoogstra, History's executive vice president and general manager. ""Audiences will once again feel the impact of Kunta Kinte's indomitable spirit."" Executive producer Mark Wolper, son of the original's producer David L. Wolper, added, ""Kunta Kinte began telling his story over 200 years ago and that story went through his family lineage, to Alex Haley, to my father, and now the mantle rests with me. Like Kunta Kinte fought to tell his story over and over again, so must we."" The remade ""Roots"" will encounter a new generation of viewers who have witnessed Barack Obama make history as the nation's first African-American president and ""12 Years a Slave"" win the Oscar for Best Picture, but also widespread racial unrest over police treatment of black suspects in many U.S. cities. ""My career began with 'Roots' and I am proud to be a part of this new adaptation,"" said Burton. ""There is a huge audience of contemporary young Americans who do not know the story of 'Roots' or its importance.""",summeval_valid
7
+ "Police are investigating claims by a former royal footman that palace aides tried to force him into an orgy, it was revealed yesterday. Christopher Lawler said he was pinned to a chair and groped by a male member of staff on his first day working at Clarence House. The ordeal left him in tears and he left the job the same day. He finally decided to contact palace officials again last year after inquiries began into alleged cover-ups of child sex abuse by a VIP paedophile ring in the 1970s and 80s. Police are investigating claims made by a former royal footman that Clarence House aides tried to force him into an orgy in the 1970s . But his complaints were ignored for months before the police were finally notified, he claims. Mr Lawler, now 64, said the attempted abuse at the Queen Mother’s London residence happened in January 1978 when he was 27. He said he was accosted after he walked into a bedroom to look for a pen on his first shift. Two men offered him a drink before two other men joined them – one a senior member of staff, he said. He was asked if he was gay before a younger man took his trousers off and began performing a sex act on himself. ‘That prompted another guy to put his hand on my leg and then he grabbed me,’ he told the Sunday People. ‘I was staggered. The younger man then came up behind me and gripped me, holding me in the chair. Mr Lawler worked at Clarence House when the Queen Mother used it as her London residence . ‘They were trying to undo my trousers but I managed to jump up and burst out of the room.’ Afterwards, Mr Lawler said he was followed by two men and threatened to keep quiet. A complaint he made that day allegedly drew an angry response from Clarence House – so he packed his bags immediately. Mr Lawler, a former Port of Liverpool police officer, said he was ‘haunted’ by the incident for years. After hearing about the probe into historical cases of child abuse last year he wrote to the Palace, but was twice rebuffed. Months later the complaint was finally passed to Scotland Yard. A retired royal aide, who is now in his 80s, was reportedly interviewed but denied he was involved because he was working for the Queen in Balmoral at the time. Mr Lawler has now been told by the Palace that the Royal Household would work ‘exhaustively and co-operatively’ with any police probe. A police spokesman said it would be inappropriate to comment ‘as investigations continue’.",summeval_valid
8
+ "An Oregon couple announced they are expecting a child in a rap video they made set to the theme from '90s television sitcom 'The Fresh Prince of Bel-Air.' The clip, which features Jesse and Melissa Meek rapping as they drive in a car, has been viewed over 1.7 million times on YouTube. 'In Happy Valley, Oregon, livin' it wed, bought our first house and started to build our homestead,' Melissa says in the clip. Parents: Jesse and Melissa Meek announced they are expecting a child in a video set to the theme song from '90s television sitcom 'The Fresh Prince of Bel-Air' The original song for the popular NBC sitcom, sung by star Will Smith, details how his character grew up in West Philadelphia, where he got into a neighborhood fight - and at his mother's insistence, moved to his aunt and uncle's home in their wealthy Los Angeles neighborhood. In the Meeks' parody video, Melissa raps, 'When our family and friends, who were up to no good, started asking questions about parenthood. 'We told one little lie, said ""It's not time yet."" Parents said ""We're ready for grandkids, get a move on it!""' Jesse raps that the couple thought it would take longer than the two months the pair spent trying to conceive. Melissa says in the video 'I woke up in the morning about 7 or 8 and I thought to myself, ""Oh man, I'm late!"" Looked at the test and it was finally there. The little plus sign. We're now three. Not a pair.' At the end of the video, the Meeks smile and share a sonogram of their unborn child. It took five takes to film the clip, the couple told KPTV. After finding out the gender of the child, another video is a possibility for the Meeks, the couple told the Fox affiliate. Original: Will Smith is seen here rapping the theme for 'The Fresh Prince of Bel-Air' during the show's title sequence . Big reveal: At the end of the clip, the Meeks share a sonogram of their unborn child . According to KPTV, the video was made so loved ones living far away could know about the baby. Melissa told the Fox affliate of the video's success 'It was completely unexpected. Like, that was the last thing we thought would happen, that it would blow up to what it had, or what it has.' Jesse told the Oregonian 'It has been a lot of fun, but definitely way more than we ever expected.' He is the great-great-grandson of Oregon pioneer Joseph Lafayette Meek, the newspaper reported. The Oregonian reported that Melissa earlier made a video which captured Jesse's reaction when he found out about the pregnancy. Jesse learned the news after reading a tag Melissa placed on their dog, which indicated a baby was on the way. A Phoenixville, Pennsylvania, couple made a pregnancy announcement video using 'The Fresh Prince of Bel-Air' theme song last year and recreated the show's title sequence, People reported.",summeval_valid
9
+ "A dress worn by Vivien Leigh when she played Scarlett O'Hara in the classic 1939 film Gone With the Wind has fetched $137,000 at auction. Heritage Auctions offered the gray jacket and skirt, featuring a black zigzag applique, plus more than 150 other items from the Academy Award-winning film at auction on Saturday in Beverly Hills, California. The dress - a jacket and full skirt ensemble - was worn in several key scenes in the 1939 movie, including when Scarlett O'Hara encounters Rhett Butler, played by Clark Gable, and when she gets attacked in the shanty town. Scroll down for video . An outfit worn in several scenes of the 1939 film Gone With The Wind by Vivien Leigh as she played Scarlett O'Hara sold for $137,000 at auction on Saturday . The dress - a jacket and full skirt ensemble - was worn in several key scenes in the 1939 movie but has suffered a little with age and has faded to light gray from original slate blue-gray color . The outfit has suffered a little with age, however. When Leigh wore it in the movie, it was slate blue-gray but over the years it has faded to light gray. It was one of more than 150 items that were part of the private collection of James Tumblin, formerly in charge of the hair and makeup department at Universal Studios. Tumblin began collecting onscreen costumes, props and behind-the-scenes artifacts from the film in the 1960s, amassing a collection of more than 300,000 pieces of memorabilia. During a visit to the Western Costume Company he spotted the Scarlett O'Hara dress on the floor. He learned that the dress was about to be thrown away and negotiated a deal to buy it for $20. Tumblin has 'devoted his life and efforts to promoting Hollywood and this film, touring his items throughout the United States,' said Kathleen Guzman, managing director of Heritage Auctions. Gone With The Wind, which celebrated its 75th anniversary last year, was based on Margaret Mitchell's 1936 best-selling book about a spoiled Old South socialite, played by Vivien Leigh, and co-starred Clark gable as Rhett Butler . Hattie McDaniel (left), Olivia DeHavilland (middle), and Vivien Leigh: McDaniel famously became the first African-American actor to be nominated for and win an Academy Award . Other top selling items from the auction were a straw hat worn by Leigh that sold for $52,500; the trousers and jacket from a suit worn by Clark Gable as Rhett Butler, selling for $55,000; and a black bonnet worn by both Leigh and Olivia De Havilland as Melanie Wilkes, which fetched $30,000. Gone With The Wind, which celebrated its 75th anniversary last year, was based on Margaret Mitchell's 1936 best-selling book about a spoiled Old South socialite. Actress Hattie McDaniel, who played Scarlett's devoted nanny Mammy, a slave, famously became the first African-American actor to be nominated for and win an Academy Award.",summeval_valid
10
+ "A two-year-old boy is recovering after falling into a cheetah exhibit at the Cleveland Metroparks Zoo after his parents dangled him over the edge, officials said. The toddler's mother was holding him and another child when he slipped and fell between 10 to 12ft and into the pit on Saturday around 3pm. The child was rescued by his parents before firefighters and paramedics arrived on the scene. Scroll down for video . A mother was holding the two-year-old boy and another child when the toddler slipped and fell into the cheetah exhibit at the Cleveland Metroparks Zoo (file photo of cheetahs at the Cleveland zoo) The boy was rescued by his parents from the pit (pictured) before firefighters and paramedics arrived on the scene. He suffered from minor bumps and bruises and was listed in stable condition at the hospital . He is listed in stable condition after being taken to MetroHealth Medical Center and suffered from minor bruises and bumps. The boy's leg was injured in the fall, but he was not attacked by the animals, Dr. Christopher Kuhar, the zoo's executive director told Fox 8. Michael Lurie and his family were at the Cheetah exhibit when they heard the child scream. 'You saw how far the drop was and you just couldn't believe the kid didn't hurt himself from falling down on the ground,' Lurie told WKYC. 'I was just shocked,' he said. 'I didn't understand how the parents let the kid go over the thing.' The cheetahs did not approach the boy or his parents while in the pit, according to zoo officials. Zoo visitor Terra Lurie believes the boy was not approached by the fast feline because they were frightened. 'I think they were just curious as to what was going on and why somebody was in the pen with them,' she said. 'It's not every day that somebody is just in the pen with them.' 'And everyone else is screaming and they probably got scared.' Kuhar said the zoo had received 'a number of eyewitness accounts' that indicate the 'strong likelihood that the child was dangled over the railing,' he told NewsNet5. Cleveland Metroparks Zoo has plans to press child endangerment charges against the family on Monday. The exhibit was closed following the child's fall. Zoo visitor Michael Lurie was at the cheetah exhibit when he heard the child scream. He said he was 'shocked' and 'didn't understand how the parents let the kid' go over the railing and into the pit . Cleveland Metroparks Zoo plans to press child endangering charges against the child's mother (above file photo of visitors at the Cleveland zoo)",summeval_valid
11
+ "The owners of this house better not leave too quickly, after a speed camera was angled directly on their front door. The bright yellow gatso had previously enforced the 30mph speed limit for motorists along the residential road in Handsworth, Birmingham. However, it has not been working for two years after every single fixed device was switched off in the West Midlands. Big Brother is watching: A speed camera has been turned round and is pointing at this house in Birmingham, West Midlands . The speed camera has not been working for more than two years . Around 300 speed and traffic camera, using old technology, were turned off across the region in March 2013 . In there place, speed enforcement operations have been carried out by a small number of mobile camera units, fixed cameras on motorways and traffic officers on patrol. Mystery surrounds who had re-pointed the camera, but a spokesman for Birmingham City Council said they were aware of it. One of their engineers will now be visiting the site and the camera could be removed completely. 'Fixed location safety cameras have been decommissioned across the West Midlands since 2013 as the technology inside them had become obsolete,' the spokesman said. 'Plans for a pilot at a limited number of sites, using digital technology, is currently in development. 'Now the issue with this camera in Wellington Road has been brought to our attention, we will take any appropriate action at the site.' The spokesman confirmed that there were no plans to include the camera in Wellington Road in the new pilot. The owners of the house were not available for comment.",summeval_valid
src/datasets/~$Items.xlsx ADDED
Binary file (165 Bytes). View file
 
src/datasets/~$Material_Llama2_0603.xlsx ADDED
Binary file (165 Bytes). View file
 
src/display/about.py ADDED
@@ -0,0 +1,162 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ metric: str
8
+ col_name: str
9
+
10
+
11
+ class Tasks(Enum):
12
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
13
+ hallucination_rate = Task("hallucination_rate",
14
+ "hallucination_rate", "Hallucination Rate (%)")
15
+ factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
16
+ answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
17
+ average_summary_length = Task("average_summary_length",
18
+ "average_summary_length", "Average Summary Length")
19
+
20
+
21
+ # Your leaderboard name
22
+ TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model (HHEM) leaderboard</h1>"""
23
+
24
+ # What does your leaderboard evaluate?
25
+ INTRODUCTION_TEXT = """
26
+ This leaderboard (by [Vectara](https://vectara.com)) evaluates how often an LLM introduces hallucinations when summarizing a document. <br>
27
+ The leaderboard utilizes [HHEM](https://huggingface.co/vectara/hallucination_evaluation_model), an open source hallucination detection model.<br>
28
+ An improved version (HHEM v2) is integrated into the [Vectara platform](https://console.vectara.com/signup/?utm_source=huggingface&utm_medium=space&utm_term=integration&utm_content=console&utm_campaign=huggingface-space-integration-console).
29
+
30
+ """
31
+
32
+ # Which evaluations are you running? how can people reproduce what you have?
33
+ LLM_BENCHMARKS_TEXT = """
34
+ ## Introduction
35
+
36
+ The Hughes Hallucination Evaluation Model (HHEM) Leaderboard is dedicated to assessing the frequency of hallucinations in document summaries generated by Large Language Models (LLMs).
37
+
38
+ Hallucinations refer to instances where a model introduces factually incorrect or unrelated content in its summaries.
39
+
40
+ ## How it works
41
+
42
+ Using [Vectara](https://vectara.com)'s HHEM, we measure the occurrence of hallucinations in generated summaries.
43
+ Given a source document and a summary generated by an LLM, HHEM outputs a hallucination score between 0 and 1, with 0 indicating complete hallucination and 1 representing perfect factual consistency.
44
+ The model card for HHEM can be found [here](https://huggingface.co/vectara/hallucination_evaluation_model).
45
+
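To make the scoring step concrete, here is a minimal sketch of scoring one (document, summary) pair. It assumes the model loads as a sentence-transformers cross-encoder, as its model card describes; the example texts are made up.

```python
# Minimal sketch, assuming HHEM loads as a sentence-transformers CrossEncoder.
from sentence_transformers import CrossEncoder

model = CrossEncoder("vectara/hallucination_evaluation_model")

document = "The plane landed safely in Denver after a two-hour delay."  # made-up example
summary = "The plane landed safely after a delay."                      # made-up example

# predict() returns one score per pair: ~1.0 = factually consistent, ~0.0 = hallucinated.
score = model.predict([[document, summary]])[0]
print(score)
```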
46
+ ## Evaluation Dataset
47
+
48
+ Our evaluation dataset consists of 1006 documents from multiple public datasets, primarily [CNN/Daily Mail Corpus](https://huggingface.co/datasets/cnn_dailymail/viewer/1.0.0/test).
49
+ We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
50
+
51
+ ## Metrics Explained
52
+ - Hallucination Rate: Percentage of summaries with a hallucination score below 0.5.
53
+ - Factual Consistency Rate: The complement of the hallucination rate, expressed as a percentage (see the sketch after this list).
54
+ - Answer Rate: Percentage of summaries that are non-empty. A summary can be empty either because the model refuses to generate a response or because it throws an error (e.g. it judges the document to contain inappropriate content).
55
+ - Average Summary Length: The average word count of generated summaries.
56
+
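As the sketch below illustrates, the first three metrics can be derived from per-summary HHEM scores plus a count of attempted documents (all numbers and variable names here are made up, not taken from the backend code):

```python
# Minimal sketch with made-up numbers: derive the rate metrics from per-summary HHEM scores.
scores = [0.92, 0.31, 0.88, 0.47, 0.95]  # one consistency score per non-empty summary
num_documents = 6                        # total documents sent to the model (one refusal)

hallucination_rate = 100.0 * sum(s < 0.5 for s in scores) / len(scores)
factual_consistency_rate = 100.0 - hallucination_rate
answer_rate = 100.0 * len(scores) / num_documents

print(hallucination_rate, factual_consistency_rate, answer_rate)  # 40.0 60.0 83.33...
```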
57
+ ## Note on non-Hugging Face models
58
+ The HHEM leaderboard currently includes models, such as GPT variants, that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
59
+ If you would like to submit your model that is not available on the Hugging Face model hub, please contact us at [email protected].
60
+
61
+ ## Model Submissions and Reproducibility
62
+ You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub (though hosting it on the Hugging Face Hub is recommended).
63
+
64
+ ### For models not available on the Hugging Face model hub:
65
+ 1) Access generated summaries used for evaluation [here](https://github.com/vectara/hallucination-leaderboard) in "leaderboard_summaries.csv".
66
+ 2) The text generation prompt is available under "Prompt Used" section in the repository's README.
67
+ 3) Details on API Integration for evaluations are under "API Integration Details".
68
+
69
+ ### For models available on the Hugging Face model hub:
70
+ To replicate the evaluation result for a Hugging Face model:
71
+
72
+ 1) Clone the Repository
73
+ ```bash
74
+ git lfs install
75
+ git clone https://huggingface.co/spaces/vectara/leaderboard
76
+ ```
77
+ 2) Install the Requirements
78
+ ```bash
79
+ pip install -r requirements.txt
80
+ ```
81
+ 3) Set Up Your Hugging Face Token
82
+ ```bash
83
+ export HF_TOKEN=your_token
84
+ ```
85
+ 4) Run the Evaluation Script
86
+ ```bash
87
+ python main_backend.py --model your_model_id --precision float16
88
+ ```
89
+ 5) Check Results
90
+ After the evaluation, results are saved in "eval-results-bk/your_model_id/results.json".
91
+
92
+ ## Results Format
93
+ The results are structured in JSON as follows:
94
+ ```json
95
+ {
96
+ "config": {
97
+ "model_dtype": "float16",
98
+ "model_name": "your_model_id",
99
+ "model_sha": "main"
100
+ },
101
+ "results": {
102
+ "hallucination_rate": {
103
+ "hallucination_rate": ...
104
+ },
105
+ "factual_consistency_rate": {
106
+ "factual_consistency_rate": ...
107
+ },
108
+ "answer_rate": {
109
+ "answer_rate": ...
110
+ },
111
+ "average_summary_length": {
112
+ "average_summary_length": ...
113
+ }
114
+ }
115
+ }
116
+ ```
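A short illustrative snippet for reading such a file (the path follows the step-5 convention above; substitute your own model id):

```python
import json

# Illustrative path from step 5 above; substitute your own model id.
with open("eval-results-bk/your_model_id/results.json") as f:
    results = json.load(f)["results"]

print("Hallucination rate:", results["hallucination_rate"]["hallucination_rate"])
print("Answer rate:", results["answer_rate"]["answer_rate"])
```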
117
+ For additional queries or model submissions, please contact [email protected].
118
+ """
119
+
120
+ EVALUATION_QUEUE_TEXT = """
121
+ ## Some good practices before submitting a model
122
+
123
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
124
+ ```python
125
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
126
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
127
+ model = AutoModel.from_pretrained("your model name", revision=revision)
128
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
129
+ ```
130
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
131
+
132
+ Note: make sure your model is public!
133
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it. Stay posted!
134
+
135
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
136
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
137
+
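If it helps, here is a minimal sketch of converting an existing checkpoint (the repo ids are placeholders): loading with transformers and re-saving with `safe_serialization=True` writes safetensors weights.

```python
# Minimal sketch: re-save an existing checkpoint in the safetensors format.
from transformers import AutoModel

model = AutoModel.from_pretrained("your model name")  # placeholder id
model.save_pretrained("your-model-safetensors", safe_serialization=True)
# The resulting folder can then be pushed to the Hub, e.g. with model.push_to_hub(...).
```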
138
+ ### 3) Make sure your model has an open license!
139
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
140
+
141
+ ### 4) Fill up your model card
142
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
143
+
144
+ ## In case of model failure
145
+ If your model is displayed in the `FAILED` category, its execution stopped.
146
+ Make sure you have followed the above steps first.
147
+ """
148
+
149
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
150
+ CITATION_BUTTON_TEXT = r"""
151
+ @dataset{HughesBae2023,
152
+ author = {Simon Hughes and Minseok Bae},
153
+ title = {Vectara Hallucination Leaderboard},
154
+ year = {2023},
155
+ month = {11},
156
+ publisher = {Vectara, Inc},
157
+ doi = {},
158
+ url = {https://github.com/vectara/hallucination-leaderboard},
159
+ abstract = {A leaderboard comparing LLM performance at maintaining factual consistency when summarizing a set of facts.},
160
+ keywords = {nlp, llm, hallucination, nli, machine learning},
161
+ license = {Apache-2.0},
162
+ }"""
src/display/css_html_js.py CHANGED
@@ -33,11 +33,17 @@ custom_css = """
33
  background: none;
34
  border: none;
35
  }
36
-
37
  #search-bar {
38
  padding: 0px;
39
  }
40
 
 
 
 
 
 
 
41
  /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
  table td:first-child,
43
  table th:first-child {
 
33
  background: none;
34
  border: none;
35
  }
36
+
37
  #search-bar {
38
  padding: 0px;
39
  }
40
 
41
+ /* Hides the final AutoEvalColumn */
42
+ #llm-benchmark-tab-table table td:last-child,
43
+ #llm-benchmark-tab-table table th:last-child {
44
+ display: none;
45
+ }
46
+
47
  /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
48
  table td:first-child,
49
  table th:first-child {
src/display/formatting.py CHANGED
@@ -1,3 +1,12 @@
 
 
 
 
 
 
 
 
 
1
  def model_hyperlink(link, model_name):
2
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
 
 
1
+ import os
2
+ from datetime import datetime, timezone
3
+
4
+ from huggingface_hub import HfApi
5
+ from huggingface_hub.hf_api import ModelInfo
6
+
7
+
8
+ API = HfApi()
9
+
10
  def model_hyperlink(link, model_name):
11
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
 
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from src.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -19,16 +19,18 @@ class ColumnContent:
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
 
22
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
32
  # Model information
33
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -39,6 +41,8 @@ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B
39
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -91,6 +95,9 @@ class WeightType(Enum):
91
  class Precision(Enum):
92
  float16 = ModelDetails("float16")
93
  bfloat16 = ModelDetails("bfloat16")
 
 
 
94
  Unknown = ModelDetails("?")
95
 
96
  def from_str(precision):
@@ -98,13 +105,32 @@ class Precision(Enum):
98
  return Precision.float16
99
  if precision in ["torch.bfloat16", "bfloat16"]:
100
  return Precision.bfloat16
 
 
 
 
 
 
101
  return Precision.Unknown
102
 
103
  # Column selection
104
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 
 
105
 
106
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  import pandas as pd
5
 
6
+ from src.display.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
22
+ dummy: bool = False
23
 
24
  ## Leaderboard columns
25
  auto_eval_column_dict = []
26
  # Init
27
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
28
+ ColumnContent("T", "str", True, never_hidden=True)])
29
+ auto_eval_column_dict.append(["model", ColumnContent,
30
+ ColumnContent("Model", "markdown", True, never_hidden=True)])
31
  for task in Tasks:
32
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
33
+
34
  # Model information
35
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
36
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
41
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
42
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
43
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
44
+ # Dummy column for the search bar (hidden by the custom CSS)
45
+ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
46
 
47
  # We use make dataclass to dynamically fill the scores from Tasks
48
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
95
  class Precision(Enum):
96
  float16 = ModelDetails("float16")
97
  bfloat16 = ModelDetails("bfloat16")
98
+ qt_8bit = ModelDetails("8bit")
99
+ qt_4bit = ModelDetails("4bit")
100
+ qt_GPTQ = ModelDetails("GPTQ")
101
  Unknown = ModelDetails("?")
102
 
103
  def from_str(precision):
 
105
  return Precision.float16
106
  if precision in ["torch.bfloat16", "bfloat16"]:
107
  return Precision.bfloat16
108
+ if precision in ["8bit"]:
109
+ return Precision.qt_8bit
110
+ if precision in ["4bit"]:
111
+ return Precision.qt_4bit
112
+ if precision in ["GPTQ", "None"]:
113
+ return Precision.qt_GPTQ
114
  return Precision.Unknown
115
 
116
  # Column selection
117
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
118
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
119
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
120
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
121
 
122
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
123
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
124
 
125
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
126
 
127
+ NUMERIC_INTERVALS = {
128
+ "?": pd.Interval(-1, 0, closed="right"),
129
+ "~1.5": pd.Interval(0, 2, closed="right"),
130
+ "~3": pd.Interval(2, 4, closed="right"),
131
+ "~7": pd.Interval(4, 9, closed="right"),
132
+ "~13": pd.Interval(9, 20, closed="right"),
133
+ "~35": pd.Interval(20, 45, closed="right"),
134
+ "~60": pd.Interval(45, 70, closed="right"),
135
+ "70+": pd.Interval(70, 10000, closed="right"),
136
+ }
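A hedged usage sketch of the interval table added above (the `size_bucket` helper is hypothetical, not part of the diff): a parameter count in billions maps to the display bucket whose right-closed interval contains it.

```python
# Hypothetical helper: map a parameter count (in billions) to a size bucket label.
from src.display.utils import NUMERIC_INTERVALS  # the mapping added above

def size_bucket(num_params_b: float) -> str:
    for label, interval in NUMERIC_INTERVALS.items():
        if num_params_b in interval:  # pd.Interval supports `in`; intervals are right-closed
            return label
    return "?"

print(size_bucket(6.7))   # "~7"
print(size_bucket(70.0))  # "~60" (70 falls in (45, 70], not in (70, 10000])
```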
src/envs.py CHANGED
@@ -1,19 +1,16 @@
1
  import os
2
-
3
  from huggingface_hub import HfApi
4
 
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
-
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
 
 
 
 
 
12
  REPO_ID = f"{OWNER}/leaderboard"
13
  QUEUE_REPO = f"{OWNER}/requests"
14
  RESULTS_REPO = f"{OWNER}/results"
15
-
16
- # If you setup a cache later, just change HF_HOME
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
@@ -21,5 +18,18 @@ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
 
 
25
  API = HfApi(token=TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import torch
3
  from huggingface_hub import HfApi
4
 
 
 
 
 
 
 
5
 
6
+ # replace this with our token
7
+ TOKEN = os.environ.get("HF_TOKEN", None)
8
+ # print(TOKEN)
9
+ OWNER = "vectara"
10
  REPO_ID = f"{OWNER}/leaderboard"
11
  QUEUE_REPO = f"{OWNER}/requests"
12
  RESULTS_REPO = f"{OWNER}/results"
13
+ print(RESULTS_REPO)
 
14
  CACHE_PATH=os.getenv("HF_HOME", ".")
15
 
16
  # Local caches
 
18
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
19
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
20
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
21
+ print(EVAL_RESULTS_PATH_BACKEND)
22
+ # exit()
23
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
24
  API = HfApi(token=TOKEN)
25
+
26
+ DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
27
+ PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
28
+ HEM_PATH = 'vectara/hallucination_evaluation_model'
29
+ HUMAN_DATA = "./src/datasets/human_data.csv" #experiment data
30
+
31
+ # SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
32
+ SYSTEM_PROMPT = "You are a participant of a psycholinguistic experiment. You will do a task on English language use."
33
+ '''prompt'''
34
+ # USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
35
+ USER_PROMPT = ""
src/leaderboard/read_evals.py CHANGED
@@ -1,35 +1,32 @@
1
  import glob
2
  import json
3
- import math
4
  import os
5
  from dataclasses import dataclass
6
 
7
- import dateutil
8
  import numpy as np
 
9
 
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
16
  class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
  model: str
23
- revision: str # commit hash, "" if main
24
  results: dict
25
- precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
  license: str = "?"
30
  likes: int = 0
31
  num_params: int = 0
32
- date: str = "" # submission date of request file
33
  still_on_hub: bool = False
34
 
35
  @classmethod
@@ -41,43 +38,35 @@ class EvalResult:
41
  config = data.get("config")
42
 
43
  # Precision
44
- precision = Precision.from_str(config.get("model_dtype"))
45
 
46
  # Get model and org
47
- org_and_model = config.get("model_name", config.get("model_args", None))
48
- org_and_model = org_and_model.split("/", 1)
49
 
50
- if len(org_and_model) == 1:
51
- org = None
52
- model = org_and_model[0]
53
- result_key = f"{model}_{precision.value.name}"
54
- else:
55
- org = org_and_model[0]
56
- model = org_and_model[1]
57
  result_key = f"{org}_{model}_{precision.value.name}"
58
- full_model = "/".join(org_and_model)
 
59
 
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
63
- architecture = "?"
64
- if model_config is not None:
65
- architectures = getattr(model_config, "architectures", None)
66
- if architectures:
67
- architecture = ";".join(architectures)
68
 
69
  # Extract results available in this file (some results are split in several files)
70
  results = {}
71
- for task in Tasks:
72
  task = task.value
73
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
- if accs.size == 0 or any([acc is None for acc in accs]):
77
- continue
78
 
79
- mean_acc = np.mean(accs) * 100.0
80
- results[task.benchmark] = mean_acc
81
 
82
  return self(
83
  eval_name=result_key,
@@ -85,7 +74,7 @@ class EvalResult:
85
  org=org,
86
  model=model,
87
  results=results,
88
- precision=precision,
89
  revision= config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
91
  architecture=architecture
@@ -93,40 +82,43 @@ class EvalResult:
93
 
94
  def update_with_request_file(self, requests_path):
95
  """Finds the relevant request file for the current model and updates info with it"""
96
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
97
 
98
  try:
99
  with open(request_file, "r") as f:
100
  request = json.load(f)
101
- self.model_type = ModelType.from_str(request.get("model_type", ""))
102
- self.weight_type = WeightType[request.get("weight_type", "Original")]
103
  self.license = request.get("license", "?")
104
  self.likes = request.get("likes", 0)
105
  self.num_params = request.get("params", 0)
106
  self.date = request.get("submitted_time", "")
107
- except Exception:
108
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
 
109
 
110
  def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
- AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
- AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
  }
128
 
129
- for task in Tasks:
130
  data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
 
132
  return data_dict
@@ -160,7 +152,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
160
 
161
  for root, _, files in os.walk(results_path):
162
  # We should only have json files in model results
163
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
  continue
165
 
166
  # Sort the files by date
@@ -169,8 +161,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
169
  except dateutil.parser._parser.ParserError:
170
  files = [files[-1]]
171
 
172
- for file in files:
173
- model_result_filepaths.append(os.path.join(root, file))
174
 
175
  eval_results = {}
176
  for model_result_filepath in model_result_filepaths:
@@ -181,7 +172,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
181
  # Store results of same eval together
182
  eval_name = eval_result.eval_name
183
  if eval_name in eval_results.keys():
184
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
 
185
  else:
186
  eval_results[eval_name] = eval_result
187
 
 
1
  import glob
2
  import json
 
3
  import os
4
  from dataclasses import dataclass
5
 
 
6
  import numpy as np
7
+ import dateutil
8
 
9
+ import src.display.formatting as formatting
10
+ import src.display.utils as utils
11
+ import src.submission.check_validity as check_validity
12
 
13
 
14
  @dataclass
15
  class EvalResult:
16
+ eval_name: str # org_model_precision (uid)
17
+ full_model: str # org/model (path on hub)
18
+ org: str
 
 
19
  model: str
20
+ revision: str # commit hash, "" if main
21
  results: dict
22
+ precision: utils.Precision = utils.Precision.Unknown
23
+ model_type: utils.ModelType = utils.ModelType.Unknown # Pretrained, fine tuned, ...
24
+ weight_type: utils.WeightType = utils.WeightType.Original # Original or Adapter
25
+ architecture: str = "Unknown"
26
  license: str = "?"
27
  likes: int = 0
28
  num_params: int = 0
29
+ date: str = "" # submission date of request file
30
  still_on_hub: bool = False
31
 
32
  @classmethod
 
38
  config = data.get("config")
39
 
40
  # Precision
41
+ precision = utils.Precision.from_str(config.get("model_dtype"))
42
 
43
  # Get model and org
44
+ full_model = config.get("model_name", config.get("model_args", None))
45
+ org, model = full_model.split("/", 1) if "/" in full_model else (None, full_model)
46
 
47
+ if org:
 
 
 
 
 
 
48
  result_key = f"{org}_{model}_{precision.value.name}"
49
+ else:
50
+ result_key = f"{model}_{precision.value.name}"
51
 
52
+ still_on_hub, _, model_config = check_validity.is_model_on_hub(
53
+ full_model, config.get("model_sha", "main"), trust_remote_code=True,
54
+ test_tokenizer=False)
55
+
56
+ if model_config:
57
+ architecture = ";".join(getattr(model_config, "architectures", ["?"]))
58
+ else:
59
+ architecture = "?"
60
 
61
  # Extract results available in this file (some results are split in several files)
62
  results = {}
63
+ for task in utils.Tasks:
64
  task = task.value
65
 
66
  # We average all scores of a given metric (not all metrics are present in all files)
67
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
 
 
68
 
69
+ results[task.benchmark] = accs
 
70
 
71
  return self(
72
  eval_name=result_key,
 
74
  org=org,
75
  model=model,
76
  results=results,
77
+ precision=precision,
78
  revision= config.get("model_sha", ""),
79
  still_on_hub=still_on_hub,
80
  architecture=architecture
 
82
 
83
  def update_with_request_file(self, requests_path):
84
  """Finds the relevant request file for the current model and updates info with it"""
85
+ request_file = get_request_file_for_model(requests_path, self.full_model,
86
+ self.precision.value.name)
87
 
88
  try:
89
  with open(request_file, "r") as f:
90
  request = json.load(f)
91
+ self.model_type = utils.ModelType.from_str(request.get("model_type", ""))
92
+ self.weight_type = utils.WeightType[request.get("weight_type", "Original")]
93
  self.license = request.get("license", "?")
94
  self.likes = request.get("likes", 0)
95
  self.num_params = request.get("params", 0)
96
  self.date = request.get("submitted_time", "")
97
+ except FileNotFoundError:
98
+ print(f"Could not find request file for {self.org}/{self.model}")
99
+ except json.JSONDecodeError:
100
+ print(f"Error decoding JSON in request file for {self.org}/{self.model}")
101
 
102
  def to_dict(self):
103
  """Converts the Eval Result to a dict compatible with our dataframe display"""
104
+
105
  data_dict = {
106
  "eval_name": self.eval_name, # not a column, just a save name,
107
+ utils.AutoEvalColumn.precision.name: self.precision.value.name,
108
+ utils.AutoEvalColumn.model_type.name: self.model_type.value.name,
109
+ utils.AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
110
+ utils.AutoEvalColumn.weight_type.name: self.weight_type.value.name,
111
+ utils.AutoEvalColumn.architecture.name: self.architecture,
112
+ utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
113
+ utils.AutoEvalColumn.dummy.name: self.full_model,
114
+ utils.AutoEvalColumn.revision.name: self.revision,
115
+ utils.AutoEvalColumn.license.name: self.license,
116
+ utils.AutoEvalColumn.likes.name: self.likes,
117
+ utils.AutoEvalColumn.params.name: self.num_params,
118
+ utils.AutoEvalColumn.still_on_hub.name: self.still_on_hub,
119
  }
120
 
121
+ for task in utils.Tasks:
122
  data_dict[task.value.col_name] = self.results[task.value.benchmark]
123
 
124
  return data_dict
 
152
 
153
  for root, _, files in os.walk(results_path):
154
  # We should only have json files in model results
155
+ if not files or any([not f.endswith(".json") for f in files]):
156
  continue
157
 
158
  # Sort the files by date
 
161
  except dateutil.parser._parser.ParserError:
162
  files = [files[-1]]
163
 
164
+ model_result_filepaths.extend([os.path.join(root, file) for file in files])
 
165
 
166
  eval_results = {}
167
  for model_result_filepath in model_result_filepaths:
 
172
  # Store results of same eval together
173
  eval_name = eval_result.eval_name
174
  if eval_name in eval_results.keys():
175
+ eval_results[eval_name].results.update({k: v for k, v in
176
+ eval_result.results.items() if v is not None})
177
  else:
178
  eval_results[eval_name] = eval_result
179
 
src/populate.py CHANGED
@@ -3,27 +3,25 @@ import os
3
 
4
  import pandas as pd
5
 
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
  df = df[cols].round(decimals=2)
19
 
20
  # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
 
24
 
25
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
- """Creates the different dataframes for the evaluation queues requestes"""
27
  entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
  all_evals = []
29
 
@@ -33,8 +31,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
33
  with open(file_path) as fp:
34
  data = json.load(fp)
35
 
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
 
39
  all_evals.append(data)
40
  elif ".md" not in entry:
@@ -45,8 +43,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
45
  with open(file_path) as fp:
46
  data = json.load(fp)
47
 
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
  all_evals.append(data)
51
 
52
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
 
3
 
4
  import pandas as pd
5
 
6
+ import src.display.formatting as formatting
7
+ import src.display.utils as utils
8
+ import src.leaderboard.read_evals as read_evals
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
+ raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
 
13
  all_data_json = [v.to_dict() for v in raw_data]
14
 
15
  df = pd.DataFrame.from_records(all_data_json)
16
+ df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
17
  df = df[cols].round(decimals=2)
18
 
19
  # filter out if any of the benchmarks have not been produced
20
+ df = df[formatting.has_no_nan_values(df, benchmark_cols)]
21
+ return raw_data, df
22
 
23
 
24
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
25
  entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
26
  all_evals = []
27
 
 
31
  with open(file_path) as fp:
32
  data = json.load(fp)
33
 
34
+ data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
35
+ data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
36
 
37
  all_evals.append(data)
38
  elif ".md" not in entry:
 
43
  with open(file_path) as fp:
44
  data = json.load(fp)
45
 
46
+ data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
47
+ data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
48
  all_evals.append(data)
49
 
50
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
src/submission/check_validity.py CHANGED
@@ -1,14 +1,12 @@
1
  import json
2
  import os
3
- import re
4
  from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
 
7
  import huggingface_hub
8
  from huggingface_hub import ModelCard
9
  from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
 
13
  def check_model_card(repo_id: str) -> tuple[bool, str]:
14
  """Checks if the model card and license exist and have been filled"""
@@ -31,8 +29,8 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
31
 
32
  return True, ""
33
 
 
34
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
  try:
37
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
  if test_tokenizer:
@@ -56,7 +54,8 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
56
  )
57
 
58
  except Exception as e:
59
- return False, "was not found on hub!", None
 
60
 
61
 
62
  def get_model_size(model_info: ModelInfo, precision: str):
@@ -75,7 +74,6 @@ def get_model_arch(model_info: ModelInfo):
75
  return model_info.config.get("architectures", "Unknown")
76
 
77
  def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
  depth = 1
80
  file_names = []
81
  users_to_submission_dates = defaultdict(list)
 
1
  import json
2
  import os
 
3
  from collections import defaultdict
 
4
 
5
  import huggingface_hub
6
  from huggingface_hub import ModelCard
7
  from huggingface_hub.hf_api import ModelInfo
8
+ from transformers import AutoConfig, AutoTokenizer
9
+ from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
10
 
11
  def check_model_card(repo_id: str) -> tuple[bool, str]:
12
  """Checks if the model card and license exist and have been filled"""
 
29
 
30
  return True, ""
31
 
32
+
33
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
 
34
  try:
35
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
36
  if test_tokenizer:
 
54
  )
55
 
56
  except Exception as e:
57
+ return False, f"was not found on hub!: {e}", None
58
+
59
 
60
 
61
  def get_model_size(model_info: ModelInfo, precision: str):
 
74
  return model_info.config.get("architectures", "Unknown")
75
 
76
  def already_submitted_models(requested_models_dir: str) -> set[str]:
 
77
  depth = 1
78
  file_names = []
79
  users_to_submission_dates = defaultdict(list)
src/submission/submit.py CHANGED
@@ -2,14 +2,10 @@ import json
2
  import os
3
  from datetime import datetime, timezone
4
 
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
 
14
  REQUESTED_MODELS = None
15
  USERS_TO_SUBMISSION_DATES = None
@@ -25,7 +21,7 @@ def add_new_eval(
25
  global REQUESTED_MODELS
26
  global USERS_TO_SUBMISSION_DATES
27
  if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
 
30
  user_name = ""
31
  model_path = model
@@ -37,7 +33,7 @@ def add_new_eval(
37
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
 
39
  if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
 
42
  # Does the model actually exist?
43
  if revision == "":
@@ -45,32 +41,32 @@ def add_new_eval(
45
 
46
  # Is the model on the hub?
47
  if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
  if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
 
52
  if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
  if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
 
57
  # Is the model info correctly filled?
58
  try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
  except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
 
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
 
65
  # Were the model card and license filled?
66
  try:
67
  license = model_info.cardData["license"]
68
  except Exception:
69
- return styled_error("Please select a license for your model")
70
 
71
- modelcard_OK, error_msg = check_model_card(model)
72
  if not modelcard_OK:
73
- return styled_error(error_msg)
74
 
75
  # Seems good, creating the eval
76
  print("Adding new eval")
@@ -87,15 +83,15 @@ def add_new_eval(
87
  "likes": model_info.likes,
88
  "params": model_size,
89
  "license": license,
90
- "private": False,
91
  }
92
 
93
  # Check for duplicate submission
94
  if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
 
97
  print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
 
99
  os.makedirs(OUT_DIR, exist_ok=True)
100
  out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
 
@@ -103,10 +99,10 @@ def add_new_eval(
103
  f.write(json.dumps(eval_entry))
104
 
105
  print("Uploading eval file")
106
- API.upload_file(
107
  path_or_fileobj=out_path,
108
  path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
  repo_type="dataset",
111
  commit_message=f"Add {model} to eval queue",
112
  )
@@ -114,6 +110,6 @@ def add_new_eval(
114
  # Remove the local file
115
  os.remove(out_path)
116
 
117
- return styled_message(
118
  "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
  )
 
2
  import os
3
  from datetime import datetime, timezone
4
 
5
+ import src.display.formatting as formatting
6
+ import src.envs as envs
7
+ import src.submission.check_validity as check_validity
8
+
 
 
 
 
9
 
10
  REQUESTED_MODELS = None
11
  USERS_TO_SUBMISSION_DATES = None
 
21
  global REQUESTED_MODELS
22
  global USERS_TO_SUBMISSION_DATES
23
  if not REQUESTED_MODELS:
24
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = check_validity.already_submitted_models(envs.EVAL_REQUESTS_PATH)
25
 
26
  user_name = ""
27
  model_path = model
 
33
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
34
 
35
  if model_type is None or model_type == "":
36
+ return formatting.styled_error("Please select a model type.")
37
 
38
  # Does the model actually exist?
39
  if revision == "":
 
41
 
42
  # Is the model on the hub?
43
  if weight_type in ["Delta", "Adapter"]:
44
+ base_model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=base_model, revision=revision, token=envs.TOKEN, test_tokenizer=True)
45
  if not base_model_on_hub:
46
+ return formatting.styled_error(f'Base model "{base_model}" {error}')
47
 
48
  if not weight_type == "Adapter":
49
+ model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=model, revision=revision, token=envs.TOKEN, test_tokenizer=True)
50
  if not model_on_hub:
51
+ return formatting.styled_error(f'Model "{model}" {error}')
52
 
53
  # Is the model info correctly filled?
54
  try:
55
+ model_info = envs.API.model_info(repo_id=model, revision=revision)
56
  except Exception:
57
+ return formatting.styled_error("Could not get your model information. Please fill it up properly.")
58
 
59
+ model_size = check_validity.get_model_size(model_info=model_info, precision=precision)
60
 
61
  # Were the model card and license filled?
62
  try:
63
  license = model_info.cardData["license"]
64
  except Exception:
65
+ return formatting.styled_error("Please select a license for your model")
66
 
67
+ modelcard_OK, error_msg = check_validity.check_model_card(model)
68
  if not modelcard_OK:
69
+ return formatting.styled_error(error_msg)
70
 
71
  # Seems good, creating the eval
72
  print("Adding new eval")
 
83
  "likes": model_info.likes,
84
  "params": model_size,
85
  "license": license,
 
86
  }
87
 
88
  # Check for duplicate submission
89
  if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
90
+ return formatting.styled_warning("This model has already been submitted.")
91
 
92
  print("Creating eval file")
93
+
94
+ OUT_DIR = f"{envs.EVAL_REQUESTS_PATH}/{user_name}"
95
  os.makedirs(OUT_DIR, exist_ok=True)
96
  out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
97
 
 
99
  f.write(json.dumps(eval_entry))
100
 
101
  print("Uploading eval file")
102
+ envs.API.upload_file(
103
  path_or_fileobj=out_path,
104
  path_in_repo=out_path.split("eval-queue/")[1],
105
+ repo_id=envs.QUEUE_REPO,
106
  repo_type="dataset",
107
  commit_message=f"Add {model} to eval queue",
108
  )
 
110
  # Remove the local file
111
  os.remove(out_path)
112
 
113
+ return formatting.styled_message(
114
  "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
115
  )
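
A practical consequence of replacing the from-imports with module-level imports (import src.envs as envs, import src.submission.check_validity as check_validity) is that submit.py now resolves envs.API, envs.EVAL_REQUESTS_PATH and the check_validity helpers at call time, so tests can swap them out without reloading the module. A minimal pytest-style sketch under stated assumptions: add_new_eval keeps the usual leaderboard signature (model, base_model, revision, precision, weight_type, model_type), styled_error embeds its message in the returned string, and the stubbed repo id and values are placeholders:

import src.envs as envs
import src.submission.check_validity as check_validity
import src.submission.submit as submit


def test_add_new_eval_reports_missing_model(monkeypatch, tmp_path):
    # submit.py looks these up through the module objects at call time, so
    # monkeypatching the attributes is enough; no module reload is needed.
    monkeypatch.setattr(envs, "EVAL_REQUESTS_PATH", str(tmp_path))
    monkeypatch.setattr(
        check_validity,
        "is_model_on_hub",
        lambda **kwargs: (False, "was not found on the hub: stubbed", None),
    )

    result = submit.add_new_eval(
        model="someorg/missing-model",  # placeholder repo id
        base_model="",
        revision="main",
        precision="float16",
        weight_type="Original",
        model_type="pretrained",
    )

    # The stubbed error message should be surfaced in the styled response.
    assert "was not found" in result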