tangtang1995 committed on
Commit
3de499f
·
verified ·
1 Parent(s): 1bf4050
Files changed (46)
  1. .gitattributes +1 -0
  2. src/.DS_Store +0 -0
  3. src/__pycache__/envs.cpython-310.pyc +0 -0
  4. src/__pycache__/envs.cpython-38.pyc +0 -0
  5. src/__pycache__/envs.cpython-39.pyc +0 -0
  6. src/backend/__pycache__/evaluate_model.cpython-310.pyc +0 -0
  7. src/backend/__pycache__/evaluate_model.cpython-38.pyc +0 -0
  8. src/backend/__pycache__/evaluate_model.cpython-39.pyc +0 -0
  9. src/backend/__pycache__/manage_requests.cpython-310.pyc +0 -0
  10. src/backend/__pycache__/manage_requests.cpython-38.pyc +0 -0
  11. src/backend/__pycache__/manage_requests.cpython-39.pyc +0 -0
  12. src/backend/__pycache__/model_operations.cpython-310.pyc +0 -0
  13. src/backend/__pycache__/model_operations.cpython-38.pyc +0 -0
  14. src/backend/__pycache__/model_operations.cpython-39.pyc +0 -0
  15. src/backend/__pycache__/run_eval_suite.cpython-310.pyc +0 -0
  16. src/backend/__pycache__/run_eval_suite.cpython-38.pyc +0 -0
  17. src/backend/__pycache__/run_eval_suite.cpython-39.pyc +0 -0
  18. src/backend/__pycache__/sort_queue.cpython-310.pyc +0 -0
  19. src/backend/__pycache__/sort_queue.cpython-38.pyc +0 -0
  20. src/backend/__pycache__/util.cpython-310.pyc +0 -0
  21. src/backend/__pycache__/util.cpython-38.pyc +0 -0
  22. src/backend/evaluate_model.py +146 -0
  23. src/backend/manage_requests.py +118 -0
  24. src/backend/model_operations.py +615 -0
  25. src/backend/run_eval_suite.py +76 -0
  26. src/backend/sort_queue.py +27 -0
  27. src/backend/util.py +78 -0
  28. src/datasets/Items.xlsx +0 -0
  29. src/datasets/Material_Llama2_0603.xlsx +0 -0
  30. src/datasets/human_data.csv +0 -0
  31. src/datasets/human_data.xlsx +3 -0
  32. src/datasets/leaderboard_dataset.csv +0 -0
  33. src/datasets/prompt.csv +11 -0
  34. src/datasets/prompt.xlsx +0 -0
  35. src/datasets/sample_dataset.csv +11 -0
  36. src/datasets/~$Items.xlsx +0 -0
  37. src/datasets/~$Material_Llama2_0603.xlsx +0 -0
  38. src/display/about.py +162 -0
  39. src/display/css_html_js.py +7 -1
  40. src/display/formatting.py +9 -0
  41. src/display/utils.py +31 -5
  42. src/envs.py +20 -10
  43. src/leaderboard/read_evals.py +56 -64
  44. src/populate.py +11 -13
  45. src/submission/check_validity.py +5 -7
  46. src/submission/submit.py +22 -26
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
36
+ src/datasets/human_data.xlsx filter=lfs diff=lfs merge=lfs -text
src/.DS_Store ADDED
Binary file (8.2 kB). View file
 
src/__pycache__/envs.cpython-310.pyc ADDED
Binary file (1.15 kB). View file
 
src/__pycache__/envs.cpython-38.pyc ADDED
Binary file (1.24 kB). View file
 
src/__pycache__/envs.cpython-39.pyc ADDED
Binary file (1.24 kB). View file
 
src/backend/__pycache__/evaluate_model.cpython-310.pyc ADDED
Binary file (4.75 kB). View file
 
src/backend/__pycache__/evaluate_model.cpython-38.pyc ADDED
Binary file (4.79 kB). View file
 
src/backend/__pycache__/evaluate_model.cpython-39.pyc ADDED
Binary file (4.79 kB). View file
 
src/backend/__pycache__/manage_requests.cpython-310.pyc ADDED
Binary file (3.66 kB). View file
 
src/backend/__pycache__/manage_requests.cpython-38.pyc ADDED
Binary file (3.55 kB). View file
 
src/backend/__pycache__/manage_requests.cpython-39.pyc ADDED
Binary file (3.63 kB). View file
 
src/backend/__pycache__/model_operations.cpython-310.pyc ADDED
Binary file (14.7 kB). View file
 
src/backend/__pycache__/model_operations.cpython-38.pyc ADDED
Binary file (12.7 kB). View file
 
src/backend/__pycache__/model_operations.cpython-39.pyc ADDED
Binary file (12.7 kB). View file
 
src/backend/__pycache__/run_eval_suite.cpython-310.pyc ADDED
Binary file (2.51 kB). View file
 
src/backend/__pycache__/run_eval_suite.cpython-38.pyc ADDED
Binary file (2.5 kB). View file
 
src/backend/__pycache__/run_eval_suite.cpython-39.pyc ADDED
Binary file (2.51 kB). View file
 
src/backend/__pycache__/sort_queue.cpython-310.pyc ADDED
Binary file (1.86 kB). View file
 
src/backend/__pycache__/sort_queue.cpython-38.pyc ADDED
Binary file (1.91 kB). View file
 
src/backend/__pycache__/util.cpython-310.pyc ADDED
Binary file (2.2 kB). View file
 
src/backend/__pycache__/util.cpython-38.pyc ADDED
Binary file (2.19 kB). View file
 
src/backend/evaluate_model.py ADDED
@@ -0,0 +1,146 @@
1
+ import logging
2
+ import pandas as pd
3
+ import os
4
+ import csv
5
+
6
+ import src.envs as envs
7
+
8
+ from src.backend.model_operations import SummaryGenerator, EvaluationModel
9
+ import src.backend.util as util
10
+
11
+ logging.basicConfig(level=logging.INFO,
12
+ format='%(asctime)s - %(levelname)s - %(message)s')
13
+
14
+
15
+ class Evaluator:
16
+ """A class to evaluate summaries generated by a language model.
17
+
18
+ Attributes:
19
+ model (str): The name or path of the model.
20
+ revision (str): The model revision.
21
+ precision (str): The precision setting of the model.
22
+ num_fewshot (int): Number of few-shot examples to use.
23
+ batch_size (int): Batch size for processing.
24
+ device (str): The device to run the model on.
25
+ no_cache (bool): Flag to disable caching.
26
+ limit (int): Limit on the number of items to process.
27
+ write_out (bool): Whether to write results to a file.
28
+ output_base_path (str): Base path for output files.
29
+ summary_generator (SummaryGenerator): Instance for generating summaries.
30
+ eval_model (EvaluationModel): Instance for evaluating summaries.
31
+ """
32
+ def __init__(self, model, revision, precision, batch_size,
33
+ device, no_cache, limit, write_out=True,
34
+ output_base_path='logs'):
35
+ """Initializes the Evaluator with the given model and settings.
36
+
37
+ Args:
38
+ model (str): The name or path of the model.
39
+ revision (str): The model revision.
40
+ precision (str): The precision setting of the model.
41
+ num_fewshot (int): Number of few-shot examples to use.
42
+ batch_size (int): Batch size for processing.
43
+ device (str): The device to run the model on.
44
+ no_cache (bool): Flag to disable caching.
45
+ limit (int): Limit on the number of items to process.
46
+ write_out (bool): Whether to write results to a file.
47
+ output_base_path (str): Base path for output files.
48
+ """
49
+ self.model = model
50
+ self.revision = revision
51
+ self.precision = precision
52
+ self.batch_size = batch_size
53
+ self.device = device
54
+ self.no_cache = no_cache
55
+ self.limit = limit
56
+ self.write_out = write_out
57
+ self.output_base_path = output_base_path
58
+ try:
59
+ self.summary_generator = SummaryGenerator(model, revision)
60
+ self.eval_model = EvaluationModel(envs.HEM_PATH)
61
+ except Exception as e:
62
+ logging.error(f"Error initializing Evaluator: {e}")
63
+ raise
64
+
65
+ def evaluate(self):
66
+ """
67
+ Performs the evaluation process by generating summaries
68
+ and computing metrics.
69
+
70
+ Returns:
71
+ dict: A dictionary containing evaluation results.
72
+ """
73
+ try:
74
+ from openpyxl import load_workbook
75
+ # df = load_workbook(filename=envs.DATASET_PATH)
76
+ df_prompt = load_workbook(filename=envs.PROMPT_PATH)
77
+
78
+ # df = pd.read_excel(envs.DATASET_PATH, engine='xlrd')  # read the original/raw data; this is probably where the problem is in this project
79
+ # df_prompt = pd.read_excel(envs.PROMPT_PATH, engine='xlrd')
80
+ # df_prompt = pd.read_csv(envs.PROMPT_PATH)
81
+ # print(envs.DATASET_PATH)
82
+ # print(df.shape)
83
+ # print(df.iloc[-1])
84
+ self.generated_summaries_df = self.summary_generator.generate_summaries(envs.DATASET_PATH, df_prompt, save_path=f"generation_results/{self.model}.csv")
85
+ # exit()
86
+ # avg_summary_len = self.summary_generator.avg_length
87
+ # answer_rate = self.summary_generator.answer_rate
88
+ '''Start evaluating the model's results'''
89
+ self.humanlike = self.eval_model.evaluate_humanlike(self.generated_summaries_df, envs.HUMAN_DATA)
90
+ '''Original metrics'''
91
+ # self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
92
+ # self.generated_summaries_df)
93
+ # factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
94
+ # hallucination_rate = self.eval_model.hallucination_rate
95
+ factual_consistency_rate = 0
96
+ answer_rate = 0
97
+ avg_summary_len = 0
98
+
99
+ results = util.format_results(model_name=self.model, revision=self.revision,
100
+ precision=self.precision,
101
+ factual_consistency_rate=factual_consistency_rate,
102
+ hallucination_rate=self.humanlike,
103
+ answer_rate=answer_rate,
104
+ avg_summary_len=avg_summary_len)
105
+ return results
106
+ except FileNotFoundError:
107
+ logging.error(f"File not found: {envs.DATASET_PATH}")
108
+ raise
109
+ except Exception as e:
110
+ logging.error(f"Error during evaluation: {e}")
111
+ raise
112
+
113
+ def write_results(self):
114
+ print('Updating result files')
115
+ leaderboard_path = os.getcwd() # the path of leaderboard folder
116
+ print(leaderboard_path)
117
+ working_path = os.path.join(leaderboard_path, 'Humanlike Leaderboard Results')
118
+ if not os.path.exists(working_path):
119
+ logging.error(f"Need to first download the results from google drive to the learderboard folder")
120
+ raise FileNotFoundError(f"Results folder not found: {working_path}")
121
+
122
+ source_summary_df = self.generated_summaries_df[["User_prompt", "Response"]]
123
+
124
+ # #update leaderboard_summaries.csv
125
+ # #first remove previous results for the current model
126
+ # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8', sep="\t")
127
+ # mask = existing_df['model'] == self.model
128
+ # existing_df = existing_df[~mask]
129
+ # # get new result
130
+ leaderboard_summaries_df = source_summary_df
131
+ leaderboard_summaries_df.insert(2, "model", [self.model]*leaderboard_summaries_df.shape[0])
132
+ leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
133
+ print('leaderboard_summaries.csv has been updated')
134
+
135
+ # update leaderboard_summaries_with_scores.csv
136
+ # BUG: get error when opening the file
137
+ # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
138
+ # encoding='utf-8', sep=",", on_bad_lines='warn', quotechar='"', quoting=2)
139
+ # print(existing_df.shape)
140
+ # mask = existing_df['model'] == self.model
141
+ # existing_df = existing_df[~mask]
142
+ # get new result
143
+ leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
144
+ leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
145
+ leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
146
+ print('leaderboard_summaries_with_scores.csv has been updated')
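A minimal usage sketch of the Evaluator class above, assuming the repo's src package is importable and the envs paths point at the datasets added in this commit; the model id below is a hypothetical placeholder:

# Hedged sketch: run the full pipeline for one model.
from src.backend.evaluate_model import Evaluator

evaluator = Evaluator(model="meta-llama/Llama-2-7b-chat-hf",  # hypothetical model id
                      revision="main", precision="float16",
                      batch_size=1, device="cuda", no_cache=True, limit=None)
results = evaluator.evaluate()                    # dict produced by util.format_results
print(results["results"]["hallucination_rate"])   # currently carries the humanlike score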
src/backend/manage_requests.py ADDED
@@ -0,0 +1,118 @@
1
+ import os
2
+ import glob
3
+ import json
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ from huggingface_hub import HfApi, snapshot_download
8
+
9
+
10
+ @dataclass
11
+ class EvalRequest:
12
+ model: str
13
+ # private: bool
14
+ status: str
15
+ json_filepath: str = None
16
+ private: bool = False
17
+ weight_type: str = "Original"
18
+ model_type: str = "" # pretrained, finetuned, with RL
19
+ precision: str = "" # float16, bfloat16
20
+ base_model: Optional[str] = None # for adapter models
21
+ revision: str = "main" # commit
22
+ submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
23
+ model_type: Optional[str] = None
24
+ likes: Optional[int] = 0
25
+ params: Optional[int] = None
26
+ license: Optional[str] = ""
27
+
28
+ def get_model_args(self):
29
+ model_args = f"pretrained={self.model},revision={self.revision}"
30
+
31
+ if self.precision in ["float16", "bfloat16"]:
32
+ model_args += f",dtype={self.precision}"
33
+ else:
34
+ raise ValueError(f"Unknown precision {self.precision}.")
35
+
36
+ return model_args
37
+
38
+
39
+ def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str,
40
+ hf_repo: str, local_dir: str):
41
+ """Updates a given eval request with its new status on the hub (running, completed, failed,)"""
42
+ json_filepath = eval_request.json_filepath
43
+
44
+ with open(json_filepath) as fp:
45
+ data = json.load(fp)
46
+
47
+ data["status"] = new_status
48
+
49
+ with open(json_filepath, "w") as f:
50
+ f.write(json.dumps(data))
51
+
52
+ api.upload_file(
53
+ path_or_fileobj=json_filepath,
54
+ path_in_repo=os.path.relpath(json_filepath, start=local_dir),
55
+ repo_id=hf_repo,
56
+ repo_type="dataset",
57
+ )
58
+
59
+
60
+ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
61
+ """Get all pending evaluation requests and return a list in which private
62
+ models appear first, followed by public models sorted by the number of
63
+ likes.
64
+
65
+ Returns:
66
+ list[EvalRequest]: a list of model info dicts.
67
+ """
68
+ snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
69
+ repo_type="dataset", max_workers=60)
70
+ json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
71
+
72
+ eval_requests = []
73
+ for json_filepath in json_files:
74
+ with open(json_filepath) as fp:
75
+ data = json.load(fp)
76
+ if data["status"] in job_status:
77
+ data["json_filepath"] = json_filepath
78
+ eval_request = EvalRequest(**data)
79
+ eval_requests.append(eval_request)
80
+
81
+ return eval_requests
82
+
83
+
84
+ def check_completed_evals(
85
+ api: HfApi,
86
+ hf_repo: str,
87
+ local_dir: str,
88
+ checked_status: str,
89
+ completed_status: str,
90
+ failed_status: str,
91
+ hf_repo_results: str,
92
+ local_dir_results: str,
93
+ ):
94
+ """Checks if the currently running evals are completed, if yes, update their status on the hub."""
95
+ snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results,
96
+ repo_type="dataset", max_workers=60)
97
+
98
+ running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
99
+
100
+ for eval_request in running_evals:
101
+ model = eval_request.model
102
+ print("====================================")
103
+ print(f"Checking {model}")
104
+
105
+ output_path = model
106
+ output_files = f"{local_dir_results}/{output_path}/results*.json"
107
+ output_files_exists = len(glob.glob(output_files)) > 0
108
+
109
+ if output_files_exists:
110
+ print(
111
+ f"EXISTS output file exists for {model} setting it to {completed_status}"
112
+ )
113
+ set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
114
+ else:
115
+ print(
116
+ f"No result file found for {model} setting it to {failed_status}"
117
+ )
118
+ set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
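A minimal sketch of how EvalRequest and get_eval_requests fit together; the repo id and local directory below are placeholders:

# Hedged sketch: build a request by hand and list pending ones from a placeholder repo.
from src.backend.manage_requests import EvalRequest, get_eval_requests

req = EvalRequest(model="org/some-model", status="PENDING", precision="float16")
print(req.get_model_args())   # pretrained=org/some-model,revision=main,dtype=float16

pending = get_eval_requests(job_status=["PENDING"],
                            local_dir="./eval-requests",       # placeholder path
                            hf_repo="org/requests-dataset")    # placeholder repo id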
src/backend/model_operations.py ADDED
@@ -0,0 +1,615 @@
1
+ import os
2
+ import time
3
+ from datetime import datetime
4
+ import logging
5
+ from pathlib import Path
6
+ import requests
7
+ import json
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ import spacy
12
+ from sentence_transformers import CrossEncoder
13
+ import litellm
14
+ # from litellm import completion
15
+ from tqdm import tqdm
16
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
17
+ # from accelerate import PartialState
18
+ # from accelerate.inference import prepare_pippy
19
+ import torch
20
+ import cohere
21
+ from openai import OpenAI
22
+ # import google
23
+ import google.generativeai as genai
24
+
25
+ import src.backend.util as util
26
+ import src.envs as envs
27
+
28
+ # litellm.set_verbose=False
29
+ litellm.set_verbose=True
30
+ # Set up basic configuration for logging
31
+ logging.basicConfig(level=logging.INFO,
32
+ format='%(asctime)s - %(levelname)s - %(message)s')
33
+
34
+ # Load spacy model for word tokenization
35
+ nlp = spacy.load("en_core_web_sm")
36
+
37
+ os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
38
+ os.environ["OPENAI_API_KEY"] = "sk-None-tanhMyavhUtpX2G1kmPuT3BlbkFJGEhM5jmyGyhrTd3LdHDI"
39
+
40
+ def load_evaluation_model(model_path):
41
+ """Load the evaluation model from the given path
42
+
43
+ Args:
44
+ model_path (str): Path to the evaluation model
45
+
46
+ Returns:
47
+ CrossEncoder: The evaluation model
48
+ """
49
+ model = CrossEncoder(model_path)
50
+ return model
51
+
52
+
53
+ class ModelLoadingException(Exception):
54
+ """Exception raised for errors in loading a model.
55
+
56
+ Attributes:
57
+ model_id (str): The model identifier.
58
+ revision (str): The model revision.
59
+ """
60
+
61
+ def __init__(self, model_id, revision, messages="Error initializing model"):
62
+ self.model_id = model_id
63
+ self.revision = revision
64
+ super().__init__(f"{messages} id={model_id} revision={revision}")
65
+
66
+
67
+ class SummaryGenerator:
68
+ """A class to generate summaries using a causal language model.
69
+
70
+ Attributes:
71
+ model (str): huggingface/{model_id}
72
+ api_base (str): https://api-inference.huggingface.co/models/{model_id}
73
+ summaries_df (DataFrame): DataFrame to store generated summaries.
74
+ revision (str): Model revision.
75
+ avg_length (float): Average length of summaries.
76
+ answer_rate (float): Rate of non-empty summaries.
77
+ """
78
+
79
+ def __init__(self, model_id, revision):
80
+ """
81
+ Initializes the SummaryGenerator with a model.
82
+
83
+ Args:
84
+ model_id (str): Identifier for the model.
85
+ revision (str): Revision of the model.
86
+ """
87
+ self.model_id = model_id
88
+ self.model = f"huggingface/{model_id}"
89
+ self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
90
+ self.summaries_df = pd.DataFrame()
91
+ self.revision = revision
92
+ self.avg_length = None
93
+ self.answer_rate = None
94
+ self.exceptions = None
95
+ self.local_model = None
96
+
97
+ def generate_summaries(self, dataset, df_prompt, save_path=None):
98
+ """Generate summaries for a given DataFrame of source docs.
99
+ Modify this part to pull the model's generated results.
100
+ Args:
101
+ dataset (str): Path to the Excel workbook of source items; df_prompt (DataFrame): prompt table; save_path (str, optional): CSV cache path.
102
+
103
+ Returns:
104
+ summaries_df (DataFrame): Generated summaries by the model.
105
+ """
106
+ exceptions = []
107
+ if (save_path is not None) and os.path.exists(save_path):
108
+ '''The file already exists, so load the previously generated test outputs'''
109
+ self.summaries_df = pd.read_csv(save_path)
110
+ # print(self.summaries_df['Experiment'])
111
+
112
+ print(f'Loaded generated summaries from {save_path}')
113
+ else:
114
+ '''The test file does not exist, so the specified model must be called to generate it'''
115
+ # prompt = {}
116
+ # for index, row in tqdm(df_prompt.iterrows(), total=df_prompt.shape[0]):
117
+ # prompt['E' + row['Item']] = row['Prompt']
118
+ xls = pd.ExcelFile(dataset)
119
+ sheet_names = xls.sheet_names
120
+ # sheet_names = df.sheetnames
121
+ print(f"Total: {len(sheet_names)}")
122
+ print(sheet_names)
123
+
124
+ item_ID, questions_ID, user_prompt, response = [], [], [], []
125
+
126
+ for i, sheet_name in enumerate(sheet_names[0:2], start=1):
127
+ # Read each worksheet
128
+ df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
129
+
130
+ # Assume the first column is 'Prompt0', but use the column name here to avoid hard-coding
131
+ if 'Prompt0' in df_sheet.columns:
132
+ prompt_column = df_sheet['Prompt0']
133
+ else:
134
+ # If the 'Prompt0' column does not exist, skip this worksheet (or handle it some other way)
135
+ continue
136
+
137
+ # Iterate over the values in the Prompt0 column
138
+ for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=1):
139
+ ID = 'E' + str(i)
140
+ q_ID = ID + '_' + str(j)
141
+
142
+ # print(ID, q_ID, prompt_value)
143
+ for _ in range(2):  # two generations per prompt; '_' avoids clobbering the sheet index i
144
+ system_prompt = envs.SYSTEM_PROMPT
145
+ # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
146
+ _user_prompt = prompt_value
147
+ while True:
148
+ try:
149
+ '''Call the model'''
150
+ print('Calling the LLM API')
151
+
152
+ _response = self.generate_summary(system_prompt, _user_prompt)
153
+ # print(f"Finish index {index}")
154
+ break
155
+ except Exception as e:
156
+ if 'Rate limit reached' in str(e):
157
+ wait_time = 3660
158
+ current_time = datetime.now().strftime('%H:%M:%S')
159
+ print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
160
+ time.sleep(wait_time)
161
+ elif 'is currently loading' in str(e):
162
+ wait_time = 200
163
+ print(f"Model is loading, wait for {wait_time}")
164
+ time.sleep(wait_time)
165
+ elif '429 Resource has been exhausted' in str(e): # for gemini models
166
+ wait_time = 60
167
+ print(f"Quota has reached, wait for {wait_time}")
168
+ time.sleep(wait_time)
169
+ else:
170
+ print(f"Error at index {i}: {e}")
171
+ _response = ""
172
+ exceptions.append(i)
173
+ break
174
+
175
+ item_ID.append(ID)
176
+ questions_ID.append(q_ID)
177
+ user_prompt.append(_user_prompt)
178
+ response.append(_response)
179
+ print(_response)
180
+ # exit()
181
+
182
+ # Sleep to prevent hitting rate limits too frequently
183
+ time.sleep(1)
184
+
185
+ self.summaries_df = pd.DataFrame(list(zip(item_ID, questions_ID, user_prompt, response)),
186
+ columns=["Experiment", "Question_ID", "User_prompt", "Response"])
187
+
188
+ if save_path is not None:
189
+ print(f'Save summaries to {save_path}')
190
+ fpath = Path(save_path)
191
+ fpath.parent.mkdir(parents=True, exist_ok=True)
192
+ self.summaries_df.to_csv(fpath)
193
+
194
+ self.exceptions = exceptions
195
+ # self._compute_avg_length()
196
+ # self._compute_answer_rate()
197
+
198
+ return self.summaries_df
199
+
200
+ def generate_summary(self, system_prompt: str, user_prompt: str):
201
+ # Using Together AI API
202
+ using_together_api = False
203
+ together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm', 'llama-3']
204
+ for together_ai_api_model in together_ai_api_models:
205
+ if together_ai_api_model in self.model_id.lower():
206
+ using_together_api = True
207
+ break
208
+ # print('Which LLM applies', together_ai_api_model, using_together_api)
209
+ # print(self.model_id.lower()) #meta-llama/llama-2-7b-chat-hf
210
+ # print('local', self.local_model)  # None
211
+ # exit()
212
+ # if 'mixtral' in self.model_id.lower() or 'dbrx' in self.model_id.lower() or 'wizardlm' in self.model_id.lower(): # For mixtral and dbrx models, use Together AI API
213
+ if using_together_api:
214
+ # suffix = "completions" if ('mixtral' in self.model_id.lower() or 'base' in self.model_id.lower()) else "chat/completions"
215
+ suffix = "chat/completions"
216
+ url = f"https://api.together.xyz/v1/{suffix}"
217
+
218
+ payload = {
219
+ "model": self.model_id,
220
+ # "max_tokens": 4096,
221
+ 'max_new_tokens': 250,
222
+ "temperature": 0.0,
223
+ # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
224
+ }
225
+ # if 'mixtral' in self.model_id.lower():
226
+ # # payload['prompt'] = user_prompt
227
+ # # payload['prompt'] = "Write a summary of the following passage:\nPassage:\n" + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
228
+ # payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
229
+ # print(payload)
230
+ # else:
231
+ # payload['messages'] = [{"role": "system", "content": system_prompt},
232
+ # {"role": "user", "content": user_prompt}]
233
+ payload['messages'] = [{"role": "system", "content": system_prompt},
234
+ {"role": "user", "content": user_prompt}]
235
+ headers = {
236
+ "accept": "application/json",
237
+ "content-type": "application/json",
238
+ "Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"
239
+ }
240
+
241
+ response = requests.post(url, json=payload, headers=headers)
242
+ try:
243
+ result = json.loads(response.text)
244
+ # print(result)
245
+ result = result["choices"][0]
246
+ if 'message' in result:
247
+ result = result["message"]["content"].strip()
248
+ else:
249
+ result = result["text"]
250
+ result_candidates = [candidate for candidate in result.split('\n\n') if len(candidate) > 0]
251
+ result = result_candidates[0]
252
+ print(result)
253
+ except:
254
+ print(response)
255
+ result = ''
256
+ print(result)
257
+ return result
258
+
259
+ # Using OpenAI API
260
+ elif 'gpt' in self.model_id.lower():
261
+ response = litellm.completion(
262
+ model=self.model_id.replace('openai/',''),
263
+ messages=[{"role": "system", "content": system_prompt},
264
+ {"role": "user", "content": user_prompt}],
265
+ temperature=0.0,
266
+ max_tokens=250,
267
+ )
268
+ result = response['choices'][0]['message']['content']
269
+ print(result)
270
+ return result
271
+
272
+ # Using Google AI API for Gemini models
273
+ elif 'gemini' in self.model_id.lower():
274
+ genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
275
+ generation_config = {
276
+ "temperature": 0,
277
+ "top_p": 0.95, # cannot change
278
+ "top_k": 0,
279
+ "max_output_tokens": 250,
280
+ # "response_mime_type": "application/json",
281
+ }
282
+ safety_settings = [
283
+ {
284
+ "category": "HARM_CATEGORY_HARASSMENT",
285
+ "threshold": "BLOCK_NONE"
286
+ },
287
+ {
288
+ "category": "HARM_CATEGORY_HATE_SPEECH",
289
+ "threshold": "BLOCK_NONE"
290
+ },
291
+ {
292
+ "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
293
+ "threshold": "BLOCK_NONE"
294
+ },
295
+ {
296
+ "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
297
+ "threshold": "BLOCK_NONE"
298
+ },
299
+ ]
300
+ model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else self.model_id.lower().split('google/')[-1],
301
+ generation_config=generation_config,
302
+ system_instruction=system_prompt,
303
+ safety_settings=safety_settings)
304
+ convo = model.start_chat(history=[])
305
+ convo.send_message(user_prompt)
306
+ # print(convo.last)
307
+ result = convo.last.text
308
+ print(result)
309
+ return result
310
+
311
+ # Using HF API or download checkpoints
312
+ elif self.local_model is None:
313
+ # print(self.model_id)
314
+ # exit()
315
+ try: # try use HuggingFace API
316
+ response = litellm.completion(
317
+ model='command-r-plus' if 'command' in self.model_id else self.model_id,
318
+ messages=[{"role": "system", "content": system_prompt},
319
+ {"role": "user", "content": user_prompt}],
320
+ temperature=0.0,
321
+ max_tokens=1024,
322
+ api_base=self.api_base,
323
+ )
324
+ result = response['choices'][0]['message']['content']
325
+ print(result)
326
+ return result
327
+ # exit()
328
+ except: # fail to call api. run it locally.
329
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
330
+ print("Tokenizer loaded")
331
+ self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
332
+ print("Local model loaded")
333
+ # exit()
334
+ # Using local model
335
+ if self.local_model: # cannot call API. using local model
336
+ messages=[
337
+ {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
338
+ {"role": "user", "content": user_prompt}
339
+ ]
340
+ try: # some models support pipeline
341
+ pipe = pipeline(
342
+ "text-generation",
343
+ model=self.local_model,
344
+ tokenizer=self.tokenizer,
345
+ )
346
+
347
+ generation_args = {
348
+ "max_new_tokens": 250,
349
+ "return_full_text": False,
350
+ "temperature": 0.0,
351
+ "do_sample": False,
352
+ }
353
+
354
+ output = pipe(messages, **generation_args)
355
+ result = output[0]['generated_text']
356
+ print(result)
357
+ except:
358
+ prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
359
+ print(prompt)
360
+ input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
361
+ with torch.no_grad():
362
+ outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
363
+ result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
364
+ result = result.replace(prompt, '')
365
+ print(result)
366
+ return result
367
+
368
+ def _compute_avg_length(self):
369
+ """
370
+ Compute the average length of non-empty summaries using SpaCy.
371
+ """
372
+ total_word_count = 0
373
+ total_count = 0
374
+
375
+ for summary in self.summaries_df['summary']:
376
+ if util.is_summary_valid(summary):
377
+ doc = nlp(summary)
378
+ words = [token.text for token in doc if token.is_alpha]
379
+ total_word_count += len(words)
380
+ total_count += 1
381
+
382
+ self.avg_length = 0 if total_count == 0 else total_word_count / total_count
383
+
384
+ def _compute_answer_rate(self):
385
+ """
386
+ Compute the rate of non-empty summaries.
387
+ """
388
+ valid_count = sum(1 for summary in self.summaries_df['summary']
389
+ if util.is_summary_valid(summary))
390
+
391
+ total_count = len(self.summaries_df)
392
+
393
+ self.answer_rate = 0 if total_count == 0 else valid_count / total_count
394
+
395
+
396
+ class EvaluationModel:
397
+ """A class to evaluate generated summaries.
398
+
399
+ Attributes:
400
+ model (CrossEncoder): The evaluation model.
401
+ scores (list): List of evaluation scores.
402
+ accuracy (float): Accuracy of the summaries.
403
+ hallucination_rate (float): Rate of hallucination in summaries.
404
+ """
405
+
406
+ def __init__(self, model_path):
407
+ """
408
+ Initializes the EvaluationModel with a CrossEncoder model.
409
+
410
+ Args:
411
+ model_path (str): Path to the CrossEncoder model.
412
+ """
413
+ self.model = load_evaluation_model(model_path)
414
+ self.scores = []
415
+ self.factual_consistency_rate = None
416
+ self.hallucination_rate = None
417
+ self.humanlike_score = None
418
+
419
+ def code_results(self, summaries_df):
420
+ '''Code the results from the LLM's responses'''
421
+ output = []
422
+ '''item1'''
423
+ # print(len(summaries_df['Experiment']),len(summaries_df['Response']))
424
+ # exit()
425
+ for i in range(len(summaries_df['Experiment'])):
426
+ # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
427
+ if summaries_df["Experiment"][i] == "E1":
428
+ if summaries_df["Response"][i].strip() == "Round":
429
+ # vote_1_1 += 1
430
+ output.append("Round")
431
+ elif summaries_df["Response"][i].strip() == "Spiky":
432
+ output.append("Round")
433
+ else:
434
+ output.append("NA")
435
+ # print()
436
+
437
+ '''item2'''
438
+ # vote_2_1, vote_2_2, vote_2_3 = 0, 0, 0
439
+ male_keyword = ["he", "his", "himself"]
440
+ female_keyword = ["she", "her", "herself"]
441
+ if summaries_df["Experiment"][i] == "E2":
442
+ rs = summaries_df["Response"][i].strip()
443
+ rs = rs.split(' ')
444
+ male, female = 0, 0
445
+ for word in rs:
446
+ if word in female_keyword and male != 1:
447
+ female = 1
448
+ output.append("Female")
449
+ break
450
+ if word in male_keyword and female != 1:
451
+ male = 1
452
+ output.append("Male")
453
+ break
454
+ if male == 0 and female == 0 :
455
+ output.append("NA")
456
+ '''item3'''
457
+ '''item4'''
458
+ '''item5'''
459
+ '''item6'''
460
+
461
+ '''item7'''
462
+ if summaries_df["Experiment"][i] == "E7":
463
+ rs = summaries_df["Response"][i].strip()
464
+ if rs == "No":
465
+ output.append("0")
466
+ elif rs == "Yes":
467
+ output.append("1")
468
+ else:
469
+ output.append("NA")
470
+ '''item8'''
471
+ if summaries_df["Experiment"][i] == "E8":
472
+ rs = summaries_df["Response"][i].strip()
473
+ if rs == "Something is wrong with the question":
474
+ output.append("1")
475
+ else:
476
+ output.append("0")
477
+ '''item9'''
478
+ if summaries_df["Experiment"][i] == "E9":
479
+ rs = summaries_df["Response"][i].strip()
480
+
481
+
482
+ '''item10'''
483
+ if summaries_df["Experiment"][i] == "E10":
484
+ rs = summaries_df["Response"][i].strip()
485
+ if rs == "Yes":
486
+ output.append("1")
487
+ else:
488
+ output.append("0")
489
+
490
+
491
+
492
+
493
+
494
+ '''Are there different question types, and how should they be scored?'''
495
+ def evaluate_humanlike(self, summaries_df, human_data_path):
496
+ '''
497
+ evaluate humanlike score
498
+ 1. code the result
499
+ 2. compute the similarities between human and model
500
+ process model responses'''
501
+ human_df = pd.read_csv(human_data_path)
502
+ self.code_results(summaries_df)
503
+ return 9.00
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+ def evaluate_hallucination(self, summaries_df):
524
+ """
525
+ Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
526
+ of the instance with the computed scores.
527
+
528
+ Args:
529
+ summaries_df (DataFrame): DataFrame containing source docs and summaries.
530
+
531
+ Returns:
532
+ list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
533
+ """
534
+ hem_scores = []
535
+ sources = []
536
+ summaries = []
537
+ source_summary_pairs = util.create_pairs(summaries_df)
538
+ '''Evaluate the model's results'''
539
+ for doc, summary in tqdm(source_summary_pairs, desc="Evaluating Humanlikeness"):
540
+ if util.is_summary_valid(summary):
541
+ try:
542
+ summary = summary.replace('<bos>','').replace('<eos>','')
543
+ score = self.model.predict([doc, summary])# [0]
544
+ if not isinstance(score, float):
545
+ try:
546
+ score = score.item()
547
+ except:
548
+ logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
549
+ continue
550
+ hem_scores.append(score)
551
+ sources.append(doc)
552
+ summaries.append(summary)
553
+ except Exception as e:
554
+ logging.error(f"Error while running HEM: {e}")
555
+ raise
556
+
557
+ self.scores = hem_scores
558
+ eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
559
+ return hem_scores, eval_results
560
+ # for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
561
+ # if util.is_summary_valid(summary):
562
+ # try:
563
+ # # summary_pieces = summary.split('\n')
564
+ # # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
565
+ # summary = summary.replace('<bos>','').replace('<eos>','')
566
+ # # print([doc, summary])
567
+ # # print(self.model.predict([doc, summary]))
568
+ # score = self.model.predict([doc, summary])# [0]
569
+ # if not isinstance(score, float):
570
+ # try:
571
+ # score = score.item()
572
+ # except:
573
+ # logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
574
+ # continue
575
+ # hem_scores.append(score)
576
+ # sources.append(doc)
577
+ # summaries.append(summary)
578
+ # except Exception as e:
579
+ # logging.error(f"Error while running HEM: {e}")
580
+ # raise
581
+
582
+ # self.scores = hem_scores
583
+ # eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
584
+ # return hem_scores, eval_results
585
+
586
+
587
+ def compute_factual_consistency_rate(self, threshold=0.5):
588
+ """
589
+ Compute the factual consistency rate of the evaluated summaries based on
590
+ the previously calculated scores. This method relies on the 'scores'
591
+ attribute being populated, typically via the 'evaluate_hallucination' method.
592
+
593
+ Returns:
594
+ float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
595
+ and 'hallucination_rate' attributes of the instance.
596
+
597
+ Raises:
598
+ ValueError: If scores have not been calculated prior to calling this method.
599
+ """
600
+ if not self.scores:
601
+ error_msg = "Scores not calculated. Call evaluate_hallucination() first."
602
+ logging.error(error_msg)
603
+ raise ValueError(error_msg)
604
+
605
+ # Use threshold of 0.5 to compute factual_consistency_rate
606
+ num_above_threshold = sum(score >= threshold for score in self.scores)
607
+ num_total = len(self.scores)
608
+
609
+ if not num_total:
610
+ raise ValueError("No scores available to compute factual consistency rate.")
611
+
612
+ self.factual_consistency_rate = (num_above_threshold / num_total) * 100
613
+ self.hallucination_rate = 100 - self.factual_consistency_rate
614
+
615
+ return self.factual_consistency_rate
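A minimal sketch of using SummaryGenerator and EvaluationModel directly, outside the Evaluator wrapper; the model id is a placeholder and the envs.* constants are assumed to be configured as in src/envs.py:

# Hedged sketch: generate responses for the Excel dataset, then compute the humanlike score.
import pandas as pd
import src.envs as envs
from src.backend.model_operations import SummaryGenerator, EvaluationModel

generator = SummaryGenerator("meta-llama/Llama-2-7b-chat-hf", revision="main")  # placeholder id
prompts = pd.read_csv("src/datasets/prompt.csv")  # prompt table; unused by the current code path
responses_df = generator.generate_summaries(envs.DATASET_PATH, prompts,
                                            save_path="generation_results/example.csv")

scorer = EvaluationModel(envs.HEM_PATH)
humanlike = scorer.evaluate_humanlike(responses_df, envs.HUMAN_DATA)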
src/backend/run_eval_suite.py ADDED
@@ -0,0 +1,76 @@
1
+ import json
2
+ import os
3
+ import logging
4
+ from datetime import datetime
5
+
6
+ import src.envs as envs
7
+ from src.backend.manage_requests import EvalRequest
8
+ from src.backend.evaluate_model import Evaluator
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO,
12
+ format='%(asctime)s - %(levelname)s - %(message)s')
13
+ logging.getLogger("openai").setLevel(logging.WARNING)
14
+
15
+
16
+ def run_evaluation(eval_request: EvalRequest, batch_size, device,
17
+ local_dir: str, results_repo: str, no_cache=True, limit=None,
18
+ need_check=True, write_results=False):
19
+ """
20
+ Run the evaluation for a given model and upload the results.
21
+
22
+ Args:
23
+ eval_request (EvalRequest): The evaluation request object containing model details.
24
+ num_fewshot (int): Number of few-shot examples.
25
+ batch_size (int): Batch size for processing.
26
+ device (str): The device to run the evaluation on.
27
+ local_dir (str): Local directory path for saving results.
28
+ results_repo (str): Repository ID where results will be uploaded.
29
+ no_cache (bool): Whether to disable caching.
30
+ limit (int, optional): Limit on the number of items to process. Use with caution.
31
+
32
+ Returns:
33
+ dict: A dictionary containing evaluation results.
34
+ """
35
+ if limit:
36
+ logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
37
+
38
+ output_folder = os.path.join(local_dir, *eval_request.model.split("/"))
39
+ # if os.path.exists(output_folder):
40
+ # f_name = os.listdir(output_folder)[-1]
41
+ # print(f"Loading results from {os.path.join(output_folder, f_name)}")
42
+ # results = json.loads(os.path.join(output_folder, f_name))
43
+ # dumped = json.dumps(results, indent=2)
44
+ # logging.info(dumped)
45
+ # else:
46
+ try:
47
+ evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
48
+ batch_size, device, no_cache, limit, write_out=True,
49
+ output_base_path='logs')
50
+ results = evaluator.evaluate()
51
+ if write_results:
52
+ evaluator.write_results()
53
+ except Exception as e:
54
+ logging.error(f"Error during evaluation: {e}")
55
+ raise
56
+
57
+ dumped = json.dumps(results, indent=2)
58
+ logging.info(dumped)
59
+
60
+ output_path = os.path.join(output_folder,
61
+ f"results_{datetime.now()}.json") #
62
+ os.makedirs(output_folder, exist_ok=True)
63
+ with open(output_path, "w") as f:
64
+ f.write(dumped)
65
+ print(f"Results have been saved to{output_path}")
66
+
67
+ if not need_check:
68
+ print("Path in the repo:", f"{eval_request.model}/results_{datetime.now()}.json")
69
+ envs.API.upload_file(
70
+ path_or_fileobj=output_path,
71
+ path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
72
+ repo_id=results_repo,
73
+ repo_type="dataset",
74
+ )
75
+
76
+ return results
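A minimal sketch of calling run_evaluation for a single request; repo names and paths are placeholders, and need_check=True keeps the results local instead of uploading them:

# Hedged sketch: evaluate one model and write its results JSON under ./eval-results.
from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite import run_evaluation

request = EvalRequest(model="org/some-model", status="RUNNING", precision="float16")
results = run_evaluation(request, batch_size=1, device="cuda",
                         local_dir="./eval-results",
                         results_repo="org/results-dataset",   # placeholder repo id
                         no_cache=True, limit=None,
                         need_check=True, write_results=False)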
src/backend/sort_queue.py ADDED
@@ -0,0 +1,27 @@
1
+ from dataclasses import dataclass
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ from src.backend.manage_requests import EvalRequest
6
+
7
+
8
+ @dataclass
9
+ class ModelMetadata:
10
+ likes: int = 0
11
+ size: int = 15
12
+
13
+
14
+ def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
15
+ private_models = [model for model in models if model.private]
16
+ public_models = [model for model in models if not model.private]
17
+
18
+ return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
19
+
20
+ def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
21
+ return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
22
+
23
+ def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
24
+ return sorted(eval_requests, key=lambda x: x.params, reverse=False)
25
+
26
+ def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
27
+ return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
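A small sketch of the queue ordering produced by sort_models_by_priority: private requests come first, and each group is ordered by submission date (oldest first). The model ids and dates are made up:

# Hedged sketch: one private and two public requests.
from huggingface_hub import HfApi
from src.backend.manage_requests import EvalRequest
from src.backend.sort_queue import sort_models_by_priority

requests = [
    EvalRequest(model="org/public-old", status="PENDING", submitted_time="2024-01-01T00:00:00"),
    EvalRequest(model="org/private-new", status="PENDING", private=True,
                submitted_time="2024-03-01T00:00:00"),
    EvalRequest(model="org/public-new", status="PENDING", submitted_time="2024-02-01T00:00:00"),
]
ordered = sort_models_by_priority(HfApi(), requests)
print([r.model for r in ordered])
# ['org/private-new', 'org/public-old', 'org/public-new']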
src/backend/util.py ADDED
@@ -0,0 +1,78 @@
1
+ def is_summary_valid(summary: str) -> bool:
2
+ """
3
+ Checks if the summary is valid.
4
+
5
+ A summary is valid if it is not empty and contains at least five words.
6
+
7
+ Args:
8
+ summary (str): The summary to check.
9
+
10
+ Returns:
11
+ bool: True if the summary is valid, False otherwise.
12
+ """
13
+ if isinstance(summary, str):
14
+ words = summary.split()
15
+ if len(words) >= 5:
16
+ return True
17
+ # print(summary)
18
+ return False
19
+
20
+
21
+ def create_pairs(df):
22
+ """
23
+ Creates pairs of source and summary from the dataframe.
24
+
25
+ Args:
26
+ df (DataFrame): The dataframe containing source and summary columns.
27
+
28
+ Returns:
29
+ list: A list of pairs [source, summary].
30
+ """
31
+ pairs = []
32
+ for _, row in df.iterrows():
33
+ pairs.append([row['source'], row['summary']])
34
+
35
+ return pairs
36
+
37
+
38
+ def format_results(model_name: str, revision: str, precision: str,
39
+ factual_consistency_rate: float, hallucination_rate: float,
40
+ answer_rate: float, avg_summary_len: float) -> dict:
41
+ """
42
+ Formats the evaluation results into a structured dictionary.
43
+
44
+ Args:
45
+ model_name (str): The name of the evaluated model.
46
+ revision (str): The revision hash of the model.
47
+ precision (str): The precision with which the evaluation was run.
48
+ factual_consistency_rate (float): The factual consistency rate.
49
+ hallucination_rate (float): The hallucination rate.
50
+ answer_rate (float): The answer rate.
51
+ avg_summary_len (float): The average summary length.
52
+
53
+ Returns:
54
+ dict: A dictionary containing the structured evaluation results.
55
+ """
56
+ results = {
57
+ "config": {
58
+ "model_dtype": precision, # Precision with which you ran the evaluation
59
+ "model_name": model_name, # Name of the model
60
+ "model_sha": revision # Hash of the model
61
+ },
62
+ "results": {
63
+ "hallucination_rate": {
64
+ "hallucination_rate": round(hallucination_rate,1)
65
+ },
66
+ "factual_consistency_rate": {
67
+ "factual_consistency_rate": round(factual_consistency_rate,1)
68
+ },
69
+ "answer_rate": {
70
+ "answer_rate": round(answer_rate*100,1)
71
+ },
72
+ "average_summary_length": {
73
+ "average_summary_length": round(avg_summary_len,1)
74
+ },
75
+ }
76
+ }
77
+
78
+ return results
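A quick sketch of format_results and the shape of the dictionary it returns; the numbers are illustrative only:

# Hedged sketch: format made-up metrics for one model.
from src.backend.util import format_results

results = format_results(model_name="org/some-model", revision="abc123",
                         precision="float16",
                         factual_consistency_rate=91.2, hallucination_rate=8.8,
                         answer_rate=0.98, avg_summary_len=64.3)
print(results["results"]["answer_rate"]["answer_rate"])   # 98.0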
src/datasets/Items.xlsx ADDED
Binary file (92.9 kB). View file
 
src/datasets/Material_Llama2_0603.xlsx ADDED
Binary file (147 kB). View file
 
src/datasets/human_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/datasets/human_data.xlsx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4995d145fea0d7fdbf9e25cb1fa0d05f2d30eadc0da79e9bb1964ccce3672d7
3
+ size 1597107
src/datasets/leaderboard_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/datasets/prompt.csv ADDED
@@ -0,0 +1,11 @@
1
+ Item,Condition,Stimuli,V1,Unnamed: 4,Unnamed: 5,Unnamed: 6,Prompt,SystemPrompt,Unnamed: 9,Unnamed: 10,Unnamed: 11,Instruction
2
+ 1,Round,baamoo,Round,"In this task, you will see a novel word. Assuming that the word refers to a shape, we'd like you to guess whether the novel word refers to a round or spiky shape by saying ""Round"" or ""Spiky"".","Please respond only with ""Round"" or ""Spicky""; don’t ask any questions or give any other information.",Please guess whether the following novel word refers to a round or spiky shape:,"In this task, you will see a novel word. Assuming that the word refers to a shape, we'd like you to guess whether the novel word refers to a round or spiky shape by saying ""Round"" or ""Spiky"".\n\nPlease respond only with ""Round"" or ""Spicky""; don’t ask any questions or give any other information.\n\nPlease guess whether the following novel word refers to a round or spiky shape:\nbaamoo",You are a participant of a psycholinguistic experiment. You will do a task on English language use.,<s>[INST] <<SYS>>\n,\n<</SYS>>\n\n,[/INST],"<s>[INST] <<SYS>>\nYou are a participant of a psycholinguistic experiment. You will do a task on English language use.\n<</SYS>>\n\nIn this task, you will see a novel word. Assuming that the word refers to a shape, we'd like you to guess whether the novel word refers to a round or spiky shape by saying ""Round"" or ""Spiky"".\n\nPlease respond only with ""Round"" or ""Spicky""; don’t ask any questions or give any other information.\n\nPlease guess whether the following novel word refers to a round or spiky shape:\nbaamoo[/INST]"
3
+ 2,,,,,,,"In this task, you will see a sentence fragment; please repeat the fragment and continue it into a full sentence.\n\nFor instance, if you see ""The boy went to the park ..."", you can say ""The boy went to the park to fly a kite"".\n\nPlease respond only with your completed sentence; don’t ask any questions or give any other information.\n\nPlease repeat the following fragment and complete it into a full sentence:\nAlthough Pelcra was sick …",,,,,
4
+ 3,,,,,,,"In this task, you will read a sentence fragment, with two words for completing the fragment. Please choose the word that you think best completes the fragment.\n\nPlease respond only with your chosen word; don’t ask any questions or give any other information.\n\nHere’s the sentence fragment:\nSusan was very bad at algebra, so she hated...\n\nPlease choose the word from the following options that you prefer to complete the fragment:\nmathematics\nmath",,,,,
5
+ 4,,,,,,,"In this task, I would like to present you with five sentences. Please just carefully read the sentences; you don't have to do anything with them.\n\nHere are the sentences:\nSENTENCE1: The curious cat silently watched the busy people from atop the old wooden fence.\nSENTENCE2: The man accepted the post in the accountancy firm.\nSENTENCE3: She decided to take a different path through the park, enjoying the unexpected quiet.\nSENTENCE4: He found an old book in his attic that contained stories of ancient heroes and legends.\nSENTENCE5: They often spent their evenings by the lake, listening to the soothing sounds of nature.\n\nNext, I am going to present some words to you; upon reading each word, please provide ONLY ONE word/phrase as an associate.\n\nFor instance, if the word you see is ""milk"", you can provide ""breakfast"" or ""cow"" as an associate.\n\nPlease respond only with the associate words in order; separate them with semicolons; don’t ask any questions or give any other information.\n\nHere are the words:\nWORD1: bottle\nWORD2: cloud\nWORD3: blanket\nWORD4: paper\nWORD5: post",,,,,
6
+ 5,,,,,,,"In this task, you will see a sentence fragment; please repeat the fragment and continue it into a full sentence.\n\nFor instance, if you see ""The boy went to the park ..."", you can type ""The boy went to the park to fly a kite"".\n\nPlease respond only with the two completed sentences in order, separated by two line breaks; don’t ask any questions or give any other information.\n\nHere are two fragments for you to complete:\nFRAGMENT1: The captain lent the spare lifejacket...\nFRAGMENT2: The bus driver gave...",,,,,
7
+ 6,,,,,,,"In this task, you will read a short passage containing several sentences, followed by a question about the passage. Please answer the question according to the passage.\n\nFor instance, if you read ""There was a tiger and a fox. The tiger ate the fox because it was hungry. Who was hungry?"", you can say ""the tiger"" as an answer.\n\nPlease respond only with your answer; don’t ask any questions or give any other information.\n\nPlease answer the question according to preceding passage:\nPASSAGE: There was a farmer and a thief. The farmer injured the thief with a staff only a few days ago.\nQUESTION: Who had a staff, the farmer or the injured thief?",,,,,
8
+ 7,,,,,,,"In this task, you will read a sentence, followed by a comprehension question. Please choose ""Yes"" or ""No"" to answer the question.\n\nPlease answer the question in this format without any other words:[ANSWER].\n\nRead the sentence and answer the question:\nSENTENCE: The sister mailed the letter the niece.\nQUESTION: Did the niece receive something/someone?",,,,,
9
+ 8,,,,,,,"In this task, I want you to answer a question.\n\nYou may encounter a question which has something wrong with it. For example, you might see the question: ""When was President Gerald Ford forced to resign his office? "" The thing that is wrong in the question is that Ford wasn't forced to resign. When you see a question like this, just say ""Something is wrong with the question"".\n\nPlease respond only with your answer; don’t ask any questions or give any other information.\n\nPlease answer the question:\nRegina is the capital of what Canadian prairie city?",,,,,
10
+ 9,,,,,,,"In this task, you will see a sentence fragment; please repeat the fragment and continue it into a full sentence.\n\nFor instance, if you see ""The boy went to the park ..."", you can type ""The boy went to the park to fly a kite"".\n\nPlease respond only with your completed sentence; don’t ask any questions or give any other information.\n\nRead the sentence fragment and continue it into a full sentence:\nGary scared Anna because…",,,,,
11
+ 10,,,,,,,"In this task, you will read a short passage and answer a yes/no question regarding the passage. Please say ""Yes"", ""No"", or ""Don't know"" to answer.\n\nPlease respond only with ""Yes"", ""No"", or ""Don't know""; don’t ask any questions or give any other information.\n\nPlease respond to the question according to the preceding passage:\nWhile swimming in the shallow water near the rocks, Sharon stepped on a piece of glass. She called desperately for help, but there was no one around to hear her. Did she cut her foot?",,,,,
src/datasets/prompt.xlsx ADDED
Binary file (12.2 kB). View file
 
src/datasets/sample_dataset.csv ADDED
@@ -0,0 +1,11 @@
1
+ text,dataset
2
+ "Paul Merson has restarted his row with Andros Townsend after the Tottenham midfielder was brought on with only seven minutes remaining in his team's 0-0 draw with Burnley on Sunday. 'Just been watching the game, did you miss the coach? #RubberDub #7minutes,' Merson put on Twitter. Merson initially angered Townsend for writing in his Sky Sports column that 'if Andros Townsend can get in (the England team) then it opens it up to anybody.' Paul Merson had another dig at Andros Townsend after his appearance for Tottenham against Burnley . Townsend was brought on in the 83rd minute for Tottenham as they drew 0-0 against Burnley . Andros Townsend scores England's equaliser in their 1-1 friendly draw with Italy in Turin on Tuesday night . The former Arsenal man was proven wrong when Townsend hit a stunning equaliser for England against Italy and he duly admitted his mistake. 'It's not as though I was watching hoping he wouldn't score for England, I'm genuinely pleased for him and fair play to him – it was a great goal,' Merson said. 'It's just a matter of opinion, and my opinion was that he got pulled off after half an hour at Manchester United in front of Roy Hodgson, so he shouldn't have been in the squad. 'When I'm wrong, I hold my hands up. I don't have a problem with doing that - I'll always be the first to admit when I'm wrong.' Townsend hit back at Merson on Twitter after scoring for England against Italy . Sky Sports pundit Merson (centre) criticised Townsend's call-up to the England squad last week . Townsend hit back at Merson after netting for England in Turin on Wednesday, saying 'Not bad for a player that should be 'nowhere near the squad' ay @PaulMerse?' Any bad feeling between the pair seemed to have passed but Merson was unable to resist having another dig at Townsend after Tottenham drew at Turf Moor.",summeval_valid
3
+ "Chelsea have made an offer for FC Tokyo's 22-year-old forward Yoshinori Muto, according to club president Naoki Ogane. The Japan international, who has played for the J-League side since 2013, will join Chelsea's Dutch partner club Vitesse Arnhem on loan next season if he completes a move to Stamford Bridge this summer. Ogane claims that Chelsea's interest in Muto is not connected to the £200million sponsorship deal they signed with Japanese company Yokohama Rubber in February. FC Tokyo forward Yoshinori Muto (centre) brings the ball forward against Albirex Niigata in March . FC Tokyo president Naoki Ogane claims that Chelsea have made a bid for Japan international Muto . Muto tussles with Yuji Nakazawa of Yokohama F.Marinos during a J-League clash last month . Age: 22 . Club: FC Tokyo . Appearances: 37 . Goals: 16 . International caps (Japan): 11 . International goals: 1 . Did you know? Muto graduated from Keio University in Tokyo with an economics degree two weeks ago . Speaking to Sports Nippon, Ogane said: 'It is true that Chelsea sent us an offer for Muto. 'It is a formal offer with conditions. They want to acquire him in the summer.' Muto, who only graduated from Keio University a fortnight ago after completing an economics degree, would be the first Japanese player to represent Chelsea if he moves to west London. He has earned 11 caps for his country after signing his first professional contract in 2014, scoring once for the Samurai Blue. A £4million deal for the youngster has been mooted, but Muto admits that he isn't sure if he will join the Premier League title chasers despite being pleased with their bid. He said: 'I have not decided yet at all. It is an honour for me to receive this offer from a great club.' Muto scored 13 times in his debut season with FC Tokyo and was named in the J-League's best XI. Muto admits it is an 'honour' to receive an offer from Chelsea although he has not yet decided to join the club . Muto, pictured in action against Ventforet Kofu has scored three goals in four games so far this season . The 22-year-old has a shot at goal during Japan's Asian Cup match against Palestine in January this year . He has continued his fine form during the current campaign, helping his club to third place in the division with three goals in four games. Yokohama Rubber - one of the world's largest tyre manufacturers - will become Chelsea's official shirt sponsors from the start of the 2015-16 season. The initial five-year deal is the biggest in the club's history, with the Blues now considering a two-week pre-season tour of Japan this summer.",summeval_valid
4
+ "Babies given antibiotics in the first six months of life are more likely to be fat as toddlers, a large-scale study has found. The researchers said say that just as antibiotics are used to make farm animals put on weight, the may also be fattening our children. Writing in the respected medical journal Pediatrics, they said that the widely-prescribed drugs could be contributing to the obesity epidemic. A third of 10-11 year olds and more than a fifth of 4-5 year olds in England are overweight or obese, leading to fears that today's generation will be the first to die at an earlier age than their parents. Obesity: Babies given antibiotics in the first six months of life are more likely to be fat as toddlers, a large-scale study has found (file photo) The Finnish researchers compared the weight and height of more than 12,000 healthy two year old with records on antibiotic prescription. By two years-old, one in five boys and one in ten girls was overweight or obese. And children who had taken antibiotics as young babies were particularly likely to be overweight. Repeated prescriptions before the age of two also raised the odds of being a fat toddler. Boys seemed particularly prone weight gain after being given antibiotics. They were also slightly taller than boys who hadn't been given the drugs. The study didn't prove that antibiotics were causing weight gain. But if they do, it may be because they kill of bugs in the gut that would normally feed on some of the food eaten. This frees up more food for the body. Killing certain gut bugs may also increase appetite. Lead author Dr Antti Saari, of Kuopio University Hospital, warned: 'Antibiotic exposure before six months of age, or repeatedly during infancy, was associated with increased body mass in healthy children. 'Such effects may play a role in the worldwide childhood obesity epidemic and highlight the importance of judicious use of antibiotics in infancy. The worldwide obesity epidemic is real, and is more pronounced for boys. 'Epidemic': By two years-old, one in five boys and one in ten girls was overweight or obese and children who had taken antibiotics as young babies were particularly likely to be overweight (file photo) 'An increase in the use of antibiotics could be an additional contributing factor to the development of excess weight problems. 'The crucial role of antibiotics in the improvement of human health is unquestionable but their extended use today has undesirable and unexpected consequences.' Previous research has found that babies given antibiotics are at higher risk of eczema and digestive problems. The studies come amid growing concern that the over-prescription of antibiotics is leading to the pills losing their power and making common infections harder to treat. The Government has warned that a new superbug could infect up to 200,000 Britons and kill 80,000 in a single outbreak.",summeval_valid
5
+ "This is the embarrassing moment a Buckingham Palace guard slipped and fell on a manhole cover in front of hundreds of shocked tourists as he took up position in his sentry box. The Queen's Guard was left red-faced after he slipped on a manhole cover during the popular Changing of the Guard - and unfortunately for him the entire incident was caught on camera. He lost his footing and slid sideways, knocking his bearskin on the side of the box and dropping his rifle. The Queen's Guard (pictured) slipped on a manhole cover during the popular Changing of the Guard at Buckingham Palace last week. Unfortunately for him, the entire incident was caught on a tourist's camera . The embarrassed soldier quickly scrambled to his feet as his colleagues marched past as if nothing had happened. But the young guard started to blush as he peered at the crowd from under his bearskin and realised how many people had seen his slapstick moment. Holidaymaker David Meadwell recorded the unscheduled manouevre outside Buckingham Palace on Thursday afternoon. Mr Meadwell, 50, from Newcastle-upon-Tyne, said: 'I was with my family for a trip to London and I thought I'd take some pictures of the changing of the guards. Tourist David Meadwell shot this footage of the Changing of the Guard last week when the incident unfolded . The moment it all started to go wrong: The guard leans heavily to the side as he appears to slip . The unidentified young guard's legs appear to get a bit tangled as he tries to turn to march away . The guard, wearing full regalia, falls heavily to the floor still clutching his rifle following the slip up . 'The first group changed successfully, without any problems, and so I decided to video the next group doing it. 'I didn't expect anything like this to happen - he went flying. There were quite a few people around and there were a lot of gasps as he went down. 'I think he just slipped on a manhole cover, he looked so embarrassed.' The unnamed solider is thought to have slipped because of the metal protectors nailed to the soles of his boots. Tourists gather in their thousands to watch the changing of the guard outside Buckingham Palace at 11.30am every day in the summer and every other day in the winter. The Guard comprises two detachments, one each for Buckingham Palace and St James's Palace, under the command of the Captain of The Queen's Guard. Contrary to popular belief they are not purely ceremonial and are fully operational soldiers. The Ministry of Defence said they would not comment on 'a young man falling over while doing his job'. The embarrassed guard hastily scrambled to his feet following the unfortunate tumble at the palace . The incident took place in front of hundreds of tourists who were watching the Changing of the Guard .",summeval_valid
6
+ "(CNN)One of the biggest TV events of all time is being reimagined for new audiences. ""Roots,"" the epic miniseries about an African-American slave and his descendants, had a staggering audience of over 100 million viewers back in 1977. Now A&E networks are remaking the miniseries, to air in 2016. A&E, Lifetime and History (formerly the History Channel) announced Thursday that the three networks would simulcast a remake of the saga of Kunta Kinte, an African who was captured, shipped to America and sold into slavery to work on a Virginia plantation. LeVar Burton, who portrayed Kinte in the original, will co-executive produce the new miniseries. A press release describes the new version as ""original"" and ""contemporary"" and will draw more from Alex Haley's classic novel, ""Roots: The Saga of an American Family."" Producers will consult scholars in African and African-American history for added authenticity. ""We are proud to bring this saga to fans of the original, as well as to a new generation that will experience this powerful and poignant tale for the first time,"" said Dirk Hoogstra, History's executive vice president and general manager. ""Audiences will once again feel the impact of Kunta Kinte's indomitable spirit."" Executive producer Mark Wolper, son of the original's producer David L. Wolper, added, ""Kunta Kinte began telling his story over 200 years ago and that story went through his family lineage, to Alex Haley, to my father, and now the mantle rests with me. Like Kunta Kinte fought to tell his story over and over again, so must we."" The remade ""Roots"" will encounter a new generation of viewers who have witnessed Barack Obama make history as the nation's first African-American president and ""12 Years a Slave"" win the Oscar for Best Picture, but also widespread racial unrest over police treatment of black suspects in many U.S. cities. ""My career began with 'Roots' and I am proud to be a part of this new adaptation,"" said Burton. ""There is a huge audience of contemporary young Americans who do not know the story of 'Roots' or its importance.""",summeval_valid
7
+ "Police are investigating claims by a former royal footman that palace aides tried to force him into an orgy, it was revealed yesterday. Christopher Lawler said he was pinned to a chair and groped by a male member of staff on his first day working at Clarence House. The ordeal left him in tears and he left the job the same day. He finally decided to contact palace officials again last year after inquiries began into alleged cover-ups of child sex abuse by a VIP paedophile ring in the 1970s and 80s. Police are investigating claims made by a former royal footman that Clarence House aides tried to force him into an orgy in the 1970s . But his complaints were ignored for months before the police were finally notified, he claims. Mr Lawler, now 64, said the attempted abuse at the Queen Mother’s London residence happened in January 1978 when he was 27. He said he was accosted after he walked into a bedroom to look for a pen on his first shift. Two men offered him a drink before two other men joined them – one a senior member of staff, he said. He was asked if he was gay before a younger man took his trousers off and began performing a sex act on himself. ‘That prompted another guy to put his hand on my leg and then he grabbed me,’ he told the Sunday People. ‘I was staggered. The younger man then came up behind me and gripped me, holding me in the chair. Mr Lawler worked at Clarence House when the Queen Mother used it as her London residence . ‘They were trying to undo my trousers but I managed to jump up and burst out of the room.’ Afterwards, Mr Lawler said he was followed by two men and threatened to keep quiet. A complaint he made that day allegedly drew an angry response from Clarence House – so he packed his bags immediately. Mr Lawler, a former Port of Liverpool police officer, said he was ‘haunted’ by the incident for years. After hearing about the probe into historical cases of child abuse last year he wrote to the Palace, but was twice rebuffed. Months later the complaint was finally passed to Scotland Yard. A retired royal aide, who is now in his 80s, was reportedly interviewed but denied he was involved because he was working for the Queen in Balmoral at the time. Mr Lawler has now been told by the Palace that the Royal Household would work ‘exhaustively and co-operatively’ with any police probe. A police spokesman said it would be inappropriate to comment ‘as investigations continue’.",summeval_valid
8
+ "An Oregon couple announced they are expecting a child in a rap video they made set to the theme from '90s television sitcom 'The Fresh Prince of Bel-Air.' The clip, which features Jesse and Melissa Meek rapping as they drive in a car, has been viewed over 1.7 million times on YouTube. 'In Happy Valley, Oregon, livin' it wed, bought our first house and started to build our homestead,' Melissa says in the clip. Parents: Jesse and Melissa Meek announced they are expecting a child in a video set to the theme song from '90s television sitcom 'The Fresh Prince of Bel-Air' The original song for the popular NBC sitcom, sung by star Will Smith, details how his character grew up in West Philadelphia, where he got into a neighborhood fight - and at his mother's insistence, moved to his aunt and uncle's home in their wealthy Los Angeles neighborhood. In the Meeks' parody video, Melissa raps, 'When our family and friends, who were up to no good, started asking questions about parenthood. 'We told one little lie, said ""It's not time yet."" Parents said ""We're ready for grandkids, get a move on it!""' Jesse raps that the couple thought it would take longer than the two months the pair spent trying to conceive. Melissa says in the video 'I woke up in the morning about 7 or 8 and I thought to myself, ""Oh man, I'm late!"" Looked at the test and it was finally there. The little plus sign. We're now three. Not a pair.' At the end of the video, the Meeks smile and share a sonogram of their unborn child. It took five takes to film the clip, the couple told KPTV. After finding out the gender of the child, another video is a possibility for the Meeks, the couple told the Fox affiliate. Original: Will Smith is seen here rapping the theme for 'The Fresh Prince of Bel-Air' during the show's title sequence . Big reveal: At the end of the clip, the Meeks share a sonogram of their unborn child . According to KPTV, the video was made so loved ones living far away could know about the baby. Melissa told the Fox affliate of the video's success 'It was completely unexpected. Like, that was the last thing we thought would happen, that it would blow up to what it had, or what it has.' Jesse told the Oregonian 'It has been a lot of fun, but definitely way more than we ever expected.' He is the great-great-grandson of Oregon pioneer Joseph Lafayette Meek, the newspaper reported. The Oregonian reported that Melissa earlier made a video which captured Jesse's reaction when he found out about the pregnancy. Jesse learned the news after reading a tag Melissa placed on their dog, which indicated a baby was on the way. A Phoenixville, Pennsylvania, couple made a pregnancy announcement video using 'The Fresh Prince of Bel-Air' theme song last year and recreated the show's title sequence, People reported.",summeval_valid
9
+ "A dress worn by Vivien Leigh when she played Scarlett O'Hara in the classic 1939 film Gone With the Wind has fetched $137,000 at auction. Heritage Auctions offered the gray jacket and skirt, featuring a black zigzag applique, plus more than 150 other items from the Academy Award-winning film at auction on Saturday in Beverly Hills, California. The dress - a jacket and full skirt ensemble - was worn in several key scenes in the 1939 movie, including when Scarlett O'Hara encounters Rhett Butler, played by Clark Gable, and when she gets attacked in the shanty town. Scroll down for video . An outfit worn in several scenes of the 1939 film Gone With The Wind by Vivien Leigh as she played Scarlett O'Hara sold for $137,000 at auction on Saturday . The dress - a jacket and full skirt ensemble - was worn in several key scenes in the 1939 movie but has suffered a little with age and has faded to light gray from original slate blue-gray color . The outfit has suffered a little with age, however. When Leigh wore it in the movie, it was slate blue-gray but over the years it has faded to light gray. It was one of more than 150 items that were part of the private collection of James Tumblin, formerly in charge of the hair and makeup department at Universal Studios. Tumblin began collecting onscreen costumes, props and behind-the-scenes artifacts from the film in the 1960s, amassing a collection of more than 300,000 pieces of memorabilia. During a visit to the Western Costume Company he spotted the Scarlett O'Hara dress on the floor. He learned that the dress was about to be thrown away and negotiated a deal to buy it for $20. Tumblin has 'devoted his life and efforts to promoting Hollywood and this film, touring his items throughout the United States,' said Kathleen Guzman, managing director of Heritage Auctions. Gone With The Wind, which celebrated its 75th anniversary last year, was based on Margaret Mitchell's 1936 best-selling book about a spoiled Old South socialite, played by Vivien Leigh, and co-starred Clark gable as Rhett Butler . Hattie McDaniel (left), Olivia DeHavilland (middle), and Vivien Leigh: McDaniel famously became the first African-American actor to be nominated for and win an Academy Award . Other top selling items from the auction were a straw hat worn by Leigh that sold for $52,500; the trousers and jacket from a suit worn by Clark Gable as Rhett Butler, selling for $55,000; and a black bonnet worn by both Leigh and Olivia De Havilland as Melanie Wilkes, which fetched $30,000. Gone With The Wind, which celebrated its 75th anniversary last year, was based on Margaret Mitchell's 1936 best-selling book about a spoiled Old South socialite. Actress Hattie McDaniel, who played Scarlett's devoted nanny Mammy, a slave, famously became the first African-American actor to be nominated for and win an Academy Award.",summeval_valid
10
+ "A two-year-old boy is recovering after falling into a cheetah exhibit at the Cleveland Metroparks Zoo after his parents dangled him over the edge, officials said. The toddler's mother was holding him and another child when he slipped and fell between 10 to 12ft and into the pit on Saturday around 3pm. The child was rescued by his parents before firefighters and paramedics arrived on the scene. Scroll down for video . A mother was holding the two-year-old boy and another child when the toddler slipped and fell into the cheetah exhibit at the Cleveland Metroparks Zoo (file photo of cheetahs at the Cleveland zoo) The boy was rescued by his parents from the pit (pictured) before firefighters and paramedics arrived on the scene. He suffered from minor bumps and bruises and was listed in stable condition at the hospital . He is listed in stable condition after being taken to MetroHealth Medical Center and suffered from minor bruises and bumps. The boy's leg was injured in the fall, but he was not attacked by the animals, Dr. Christopher Kuhar, the zoo's executive director told Fox 8. Michael Lurie and his family were at the Cheetah exhibit when they heard the child scream. 'You saw how far the drop was and you just couldn't believe the kid didn't hurt himself from falling down on the ground,' Lurie told WKYC. 'I was just shocked,' he said. 'I didn't understand how the parents let the kid go over the thing.' The cheetahs did not approach the boy or his parents while in the pit, according to zoo officials. Zoo visitor Terra Lurie believes the boy was not approached by the fast feline because they were frightened. 'I think they were just curious as to what was going on and why somebody was in the pen with them,' she said. 'It's not every day that somebody is just in the pen with them.' 'And everyone else is screaming and they probably got scared.' Kuhar said the zoo had received 'a number of eyewitness accounts' that indicate the 'strong likelihood that the child was dangled over the railing,' he told NewsNet5. Cleveland Metroparks Zoo has plans to press child endangerment charges against the family on Monday. The exhibit was closed following the child's fall. Zoo visitor Michael Lurie was at the cheetah exhibit when he heard the child scream. He said he was 'shocked' and 'didn't understand how the parents let the kid' go over the railing and into the pit . Cleveland Metroparks Zoo plans to press child endangering charges against the child's mother (above file photo of visitors at the Cleveland zoo)",summeval_valid
11
+ "The owners of this house better not leave too quickly, after a speed camera was angled directly on their front door. The bright yellow gatso had previously enforced the 30mph speed limit for motorists along the residential road in Handsworth, Birmingham. However, it has not been working for two years after every single fixed device was switched off in the West Midlands. Big Brother is watching: A speed camera has been turned round and is pointing at this house in Birmingham, West Midlands . The speed camera has not been working for more than two years . Around 300 speed and traffic camera, using old technology, were turned off across the region in March 2013 . In there place, speed enforcement operations have been carried out by a small number of mobile camera units, fixed cameras on motorways and traffic officers on patrol. Mystery surrounds who had re-pointed the camera, but a spokesman for Birmingham City Council said they were aware of it. One of their engineers will now be visiting the site and the camera could be removed completely. 'Fixed location safety cameras have been decommissioned across the West Midlands since 2013 as the technology inside them had become obsolete,' the spokesman said. 'Plans for a pilot at a limited number of sites, using digital technology, is currently in development. 'Now the issue with this camera in Wellington Road has been brought to our attention, we will take any appropriate action at the site.' The spokesman confirmed that there were no plans to include the camera in Wellington Road in the new pilot. The owners of the house were not available for comment.",summeval_valid
src/datasets/~$Items.xlsx ADDED
Binary file (165 Bytes). View file
 
src/datasets/~$Material_Llama2_0603.xlsx ADDED
Binary file (165 Bytes). View file
 
src/display/about.py ADDED
@@ -0,0 +1,162 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ metric: str
8
+ col_name: str
9
+
10
+
11
+ class Tasks(Enum):
12
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
13
+ hallucination_rate = Task("hallucination_rate",
14
+ "hallucination_rate", "Hallucination Rate (%)")
15
+ factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
16
+ answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
17
+ average_summary_length = Task("average_summary_length",
18
+ "average_summary_length", "Average Summary Length")
19
+
20
+
21
+ # Your leaderboard name
22
+ TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model (HHEM) leaderboard</h1>"""
23
+
24
+ # What does your leaderboard evaluate?
25
+ INTRODUCTION_TEXT = """
26
+ This leaderboard (by [Vectara](https://vectara.com)) evaluates how often an LLM introduces hallucinations when summarizing a document. <br>
27
+ The leaderboard utilizes [HHEM](https://huggingface.co/vectara/hallucination_evaluation_model), an open source hallucination detection model.<br>
28
+ An improved version (HHEM v2) is integrated into the [Vectara platform](https://console.vectara.com/signup/?utm_source=huggingface&utm_medium=space&utm_term=integration&utm_content=console&utm_campaign=huggingface-space-integration-console).
29
+
30
+ """
31
+
32
+ # Which evaluations are you running? how can people reproduce what you have?
33
+ LLM_BENCHMARKS_TEXT = """
34
+ ## Introduction
35
+
36
+ The Hughes Hallucination Evaluation Model (HHEM) Leaderboard is dedicated to assessing the frequency of hallucinations in document summaries generated by Large Language Models (LLMs).
37
+
38
+ Hallucinations refer to instances where a model introduces factually incorrect or unrelated content in its summaries.
39
+
40
+ ## How it works
41
+
42
+ Using [Vectara](https://vectara.com)'s HHEM, we measure the occurrence of hallucinations in generated summaries.
43
+ Given a source document and a summary generated by an LLM, HHEM outputs a hallucination score between 0 and 1, with 0 indicating complete hallucination and 1 representing perfect factual consistency.
44
+ The model card for HHEM can be found [here](https://huggingface.co/vectara/hallucination_evaluation_model).
45
+
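To make the scoring step concrete, here is a minimal sketch of scoring one (document, summary) pair. It assumes the model loads as a sentence-transformers cross-encoder, as its model card describes; the example texts are made up.

```python
# Minimal sketch, assuming HHEM loads as a sentence-transformers CrossEncoder.
from sentence_transformers import CrossEncoder

model = CrossEncoder("vectara/hallucination_evaluation_model")

document = "The plane landed safely in Denver after a two-hour delay."  # made-up example
summary = "The plane landed safely after a delay."                      # made-up example

# predict() returns one score per pair: ~1.0 = factually consistent, ~0.0 = hallucinated.
score = model.predict([[document, summary]])[0]
print(score)
```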
46
+ ## Evaluation Dataset
47
+
48
+ Our evaluation dataset consists of 1006 documents from multiple public datasets, primarily [CNN/Daily Mail Corpus](https://huggingface.co/datasets/cnn_dailymail/viewer/1.0.0/test).
49
+ We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
50
+
51
+ ## Metrics Explained
52
+ - Hallucination Rate: Percentage of summaries with a hallucination score below 0.5.
53
+ - Factual Consistency Rate: The complement of the hallucination rate, expressed as a percentage (see the sketch after this list).
54
+ - Answer Rate: Percentage of summaries that are non-empty. A summary can be empty either because the model refuses to generate a response or because it throws an error (e.g. it judges the document to contain inappropriate content).
55
+ - Average Summary Length: The average word count of generated summaries.
56
+
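As the sketch below illustrates, the first three metrics can be derived from per-summary HHEM scores plus a count of attempted documents (all numbers and variable names here are made up, not taken from the backend code):

```python
# Minimal sketch with made-up numbers: derive the rate metrics from per-summary HHEM scores.
scores = [0.92, 0.31, 0.88, 0.47, 0.95]  # one consistency score per non-empty summary
num_documents = 6                        # total documents sent to the model (one refusal)

hallucination_rate = 100.0 * sum(s < 0.5 for s in scores) / len(scores)
factual_consistency_rate = 100.0 - hallucination_rate
answer_rate = 100.0 * len(scores) / num_documents

print(hallucination_rate, factual_consistency_rate, answer_rate)  # 40.0 60.0 83.33...
```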
57
+ ## Note on non-Hugging Face models
58
+ The HHEM leaderboard currently includes models, such as GPT variants, that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
59
+ If you would like to submit your model that is not available on the Hugging Face model hub, please contact us at [email protected].
60
+
61
+ ## Model Submissions and Reproducibility
62
+ You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub (though hosting it on the Hugging Face Hub is recommended).
63
+
64
+ ### For models not available on the Hugging Face model hub:
65
+ 1) Access generated summaries used for evaluation [here](https://github.com/vectara/hallucination-leaderboard) in "leaderboard_summaries.csv".
66
+ 2) The text generation prompt is available under "Prompt Used" section in the repository's README.
67
+ 3) Details on API Integration for evaluations are under "API Integration Details".
68
+
69
+ ### For models available on the Hugging Face model hub:
70
+ To replicate the evaluation result for a Hugging Face model:
71
+
72
+ 1) Clone the Repository
73
+ ```bash
74
+ git lfs install
75
+ git clone https://huggingface.co/spaces/vectara/leaderboard
76
+ ```
77
+ 2) Install the Requirements
78
+ ```bash
79
+ pip install -r requirements.txt
80
+ ```
81
+ 3) Set Up Your Hugging Face Token
82
+ ```bash
83
+ export HF_TOKEN=your_token
84
+ ```
85
+ 4) Run the Evaluation Script
86
+ ```bash
87
+ python main_backend.py --model your_model_id --precision float16
88
+ ```
89
+ 5) Check Results
90
+ After the evaluation, results are saved in "eval-results-bk/your_model_id/results.json".
91
+
92
+ ## Results Format
93
+ The results are structured in JSON as follows:
94
+ ```json
95
+ {
96
+ "config": {
97
+ "model_dtype": "float16",
98
+ "model_name": "your_model_id",
99
+ "model_sha": "main"
100
+ },
101
+ "results": {
102
+ "hallucination_rate": {
103
+ "hallucination_rate": ...
104
+ },
105
+ "factual_consistency_rate": {
106
+ "factual_consistency_rate": ...
107
+ },
108
+ "answer_rate": {
109
+ "answer_rate": ...
110
+ },
111
+ "average_summary_length": {
112
+ "average_summary_length": ...
113
+ }
114
+ }
115
+ }
116
+ ```
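A short illustrative snippet for reading such a file (the path follows the step-5 convention above; substitute your own model id):

```python
import json

# Illustrative path from step 5 above; substitute your own model id.
with open("eval-results-bk/your_model_id/results.json") as f:
    results = json.load(f)["results"]

print("Hallucination rate:", results["hallucination_rate"]["hallucination_rate"])
print("Answer rate:", results["answer_rate"]["answer_rate"])
```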
117
+ For additional queries or model submissions, please contact [email protected].
118
+ """
119
+
120
+ EVALUATION_QUEUE_TEXT = """
121
+ ## Some good practices before submitting a model
122
+
123
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
124
+ ```python
125
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
126
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
127
+ model = AutoModel.from_pretrained("your model name", revision=revision)
128
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
129
+ ```
130
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
131
+
132
+ Note: make sure your model is public!
133
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it. Stay posted!
134
+
135
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
136
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
137
+
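If it helps, here is a minimal sketch of converting an existing checkpoint (the repo ids are placeholders): loading with transformers and re-saving with `safe_serialization=True` writes safetensors weights.

```python
# Minimal sketch: re-save an existing checkpoint in the safetensors format.
from transformers import AutoModel

model = AutoModel.from_pretrained("your model name")  # placeholder id
model.save_pretrained("your-model-safetensors", safe_serialization=True)
# The resulting folder can then be pushed to the Hub, e.g. with model.push_to_hub(...).
```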
138
+ ### 3) Make sure your model has an open license!
139
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
140
+
141
+ ### 4) Fill up your model card
142
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
143
+
144
+ ## In case of model failure
145
+ If your model is displayed in the `FAILED` category, its execution stopped.
146
+ Make sure you have followed the above steps first.
147
+ """
148
+
149
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
150
+ CITATION_BUTTON_TEXT = r"""
151
+ @dataset{HughesBae2023,
152
+ author = {Simon Hughes and Minseok Bae},
153
+ title = {Vectara Hallucination Leaderboard},
154
+ year = {2023},
155
+ month = {11},
156
+ publisher = {Vectara, Inc},
157
+ doi = {},
158
+ url = {https://github.com/vectara/hallucination-leaderboard},
159
+ abstract = {A leaderboard comparing LLM performance at maintaining factual consistency when summarizing a set of facts.},
160
+ keywords = {nlp, llm, hallucination, nli, machine learning},
161
+ license = {Apache-2.0},
162
+ }"""
src/display/css_html_js.py CHANGED
@@ -33,11 +33,17 @@ custom_css = """
33
  background: none;
34
  border: none;
35
  }
36
-
37
  #search-bar {
38
  padding: 0px;
39
  }
40
 
 
 
 
 
 
 
41
  /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
  table td:first-child,
43
  table th:first-child {
 
33
  background: none;
34
  border: none;
35
  }
36
+
37
  #search-bar {
38
  padding: 0px;
39
  }
40
 
41
+ /* Hides the final AutoEvalColumn */
42
+ #llm-benchmark-tab-table table td:last-child,
43
+ #llm-benchmark-tab-table table th:last-child {
44
+ display: none;
45
+ }
46
+
47
  /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
48
  table td:first-child,
49
  table th:first-child {
src/display/formatting.py CHANGED
@@ -1,3 +1,12 @@
 
 
 
 
 
 
 
 
 
1
  def model_hyperlink(link, model_name):
2
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
 
 
1
+ import os
2
+ from datetime import datetime, timezone
3
+
4
+ from huggingface_hub import HfApi
5
+ from huggingface_hub.hf_api import ModelInfo
6
+
7
+
8
+ API = HfApi()
9
+
10
  def model_hyperlink(link, model_name):
11
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
 
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from src.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -19,16 +19,18 @@ class ColumnContent:
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
 
22
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
32
  # Model information
33
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -39,6 +41,8 @@ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B
39
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -91,6 +95,9 @@ class WeightType(Enum):
91
  class Precision(Enum):
92
  float16 = ModelDetails("float16")
93
  bfloat16 = ModelDetails("bfloat16")
 
 
 
94
  Unknown = ModelDetails("?")
95
 
96
  def from_str(precision):
@@ -98,13 +105,32 @@ class Precision(Enum):
98
  return Precision.float16
99
  if precision in ["torch.bfloat16", "bfloat16"]:
100
  return Precision.bfloat16
 
 
 
 
 
 
101
  return Precision.Unknown
102
 
103
  # Column selection
104
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 
 
105
 
106
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  import pandas as pd
5
 
6
+ from src.display.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
22
+ dummy: bool = False
23
 
24
  ## Leaderboard columns
25
  auto_eval_column_dict = []
26
  # Init
27
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
28
+ ColumnContent("T", "str", True, never_hidden=True)])
29
+ auto_eval_column_dict.append(["model", ColumnContent,
30
+ ColumnContent("Model", "markdown", True, never_hidden=True)])
31
  for task in Tasks:
32
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
33
+
34
  # Model information
35
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
36
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
41
  auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
42
  auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
43
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
44
+ # Dummy column for the search bar (hidden by the custom CSS)
45
+ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
46
 
47
  # We use make dataclass to dynamically fill the scores from Tasks
48
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
95
  class Precision(Enum):
96
  float16 = ModelDetails("float16")
97
  bfloat16 = ModelDetails("bfloat16")
98
+ qt_8bit = ModelDetails("8bit")
99
+ qt_4bit = ModelDetails("4bit")
100
+ qt_GPTQ = ModelDetails("GPTQ")
101
  Unknown = ModelDetails("?")
102
 
103
  def from_str(precision):
 
105
  return Precision.float16
106
  if precision in ["torch.bfloat16", "bfloat16"]:
107
  return Precision.bfloat16
108
+ if precision in ["8bit"]:
109
+ return Precision.qt_8bit
110
+ if precision in ["4bit"]:
111
+ return Precision.qt_4bit
112
+ if precision in ["GPTQ", "None"]:
113
+ return Precision.qt_GPTQ
114
  return Precision.Unknown
115
 
116
  # Column selection
117
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
118
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
119
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
120
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
121
 
122
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
123
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
124
 
125
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
126
 
127
+ NUMERIC_INTERVALS = {
128
+ "?": pd.Interval(-1, 0, closed="right"),
129
+ "~1.5": pd.Interval(0, 2, closed="right"),
130
+ "~3": pd.Interval(2, 4, closed="right"),
131
+ "~7": pd.Interval(4, 9, closed="right"),
132
+ "~13": pd.Interval(9, 20, closed="right"),
133
+ "~35": pd.Interval(20, 45, closed="right"),
134
+ "~60": pd.Interval(45, 70, closed="right"),
135
+ "70+": pd.Interval(70, 10000, closed="right"),
136
+ }
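A hedged usage sketch of the interval table added above (the `size_bucket` helper is hypothetical, not part of the diff): a parameter count in billions maps to the display bucket whose right-closed interval contains it.

```python
# Hypothetical helper: map a parameter count (in billions) to a size bucket label.
from src.display.utils import NUMERIC_INTERVALS  # the mapping added above

def size_bucket(num_params_b: float) -> str:
    for label, interval in NUMERIC_INTERVALS.items():
        if num_params_b in interval:  # pd.Interval supports `in`; intervals are right-closed
            return label
    return "?"

print(size_bucket(6.7))   # "~7"
print(size_bucket(70.0))  # "~60" (70 falls in (45, 70], not in (70, 10000])
```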
src/envs.py CHANGED
@@ -1,19 +1,16 @@
1
  import os
2
-
3
  from huggingface_hub import HfApi
4
 
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
-
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
 
 
 
 
 
12
  REPO_ID = f"{OWNER}/leaderboard"
13
  QUEUE_REPO = f"{OWNER}/requests"
14
  RESULTS_REPO = f"{OWNER}/results"
15
-
16
- # If you setup a cache later, just change HF_HOME
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
@@ -21,5 +18,18 @@ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
 
 
25
  API = HfApi(token=TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import torch
3
  from huggingface_hub import HfApi
4
 
 
 
 
 
 
 
5
 
6
+ # replace this with our token
7
+ TOKEN = os.environ.get("HF_TOKEN", None)
8
+ # print(TOKEN)
9
+ OWNER = "vectara"
10
  REPO_ID = f"{OWNER}/leaderboard"
11
  QUEUE_REPO = f"{OWNER}/requests"
12
  RESULTS_REPO = f"{OWNER}/results"
13
+ print(RESULTS_REPO)
 
14
  CACHE_PATH=os.getenv("HF_HOME", ".")
15
 
16
  # Local caches
 
18
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
19
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
20
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
21
+ print(EVAL_RESULTS_PATH_BACKEND)
22
+ # exit()
23
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
24
  API = HfApi(token=TOKEN)
25
+
26
+ DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
27
+ PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
28
+ HEM_PATH = 'vectara/hallucination_evaluation_model'
29
+ HUMAN_DATA = "./src/datasets/human_data.csv" #experiment data
30
+
31
+ # SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
32
+ SYSTEM_PROMPT = "You are a participant of a psycholinguistic experiment. You will do a task on English language use."
33
+ '''prompt'''
34
+ # USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
35
+ USER_PROMPT = ""
src/leaderboard/read_evals.py CHANGED
@@ -1,35 +1,32 @@
1
  import glob
2
  import json
3
- import math
4
  import os
5
  from dataclasses import dataclass
6
 
7
- import dateutil
8
  import numpy as np
 
9
 
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
16
  class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
- org: str
22
  model: str
23
- revision: str # commit hash, "" if main
24
  results: dict
25
- precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
  license: str = "?"
30
  likes: int = 0
31
  num_params: int = 0
32
- date: str = "" # submission date of request file
33
  still_on_hub: bool = False
34
 
35
  @classmethod
@@ -41,43 +38,35 @@ class EvalResult:
41
  config = data.get("config")
42
 
43
  # Precision
44
- precision = Precision.from_str(config.get("model_dtype"))
45
 
46
  # Get model and org
47
- org_and_model = config.get("model_name", config.get("model_args", None))
48
- org_and_model = org_and_model.split("/", 1)
49
 
50
- if len(org_and_model) == 1:
51
- org = None
52
- model = org_and_model[0]
53
- result_key = f"{model}_{precision.value.name}"
54
- else:
55
- org = org_and_model[0]
56
- model = org_and_model[1]
57
  result_key = f"{org}_{model}_{precision.value.name}"
58
- full_model = "/".join(org_and_model)
 
59
 
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
63
- architecture = "?"
64
- if model_config is not None:
65
- architectures = getattr(model_config, "architectures", None)
66
- if architectures:
67
- architecture = ";".join(architectures)
68
 
69
  # Extract results available in this file (some results are split in several files)
70
  results = {}
71
- for task in Tasks:
72
  task = task.value
73
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
- if accs.size == 0 or any([acc is None for acc in accs]):
77
- continue
78
 
79
- mean_acc = np.mean(accs) * 100.0
80
- results[task.benchmark] = mean_acc
81
 
82
  return self(
83
  eval_name=result_key,
@@ -85,7 +74,7 @@ class EvalResult:
85
  org=org,
86
  model=model,
87
  results=results,
88
- precision=precision,
89
  revision= config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
91
  architecture=architecture
@@ -93,40 +82,43 @@ class EvalResult:
93
 
94
  def update_with_request_file(self, requests_path):
95
  """Finds the relevant request file for the current model and updates info with it"""
96
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
97
 
98
  try:
99
  with open(request_file, "r") as f:
100
  request = json.load(f)
101
- self.model_type = ModelType.from_str(request.get("model_type", ""))
102
- self.weight_type = WeightType[request.get("weight_type", "Original")]
103
  self.license = request.get("license", "?")
104
  self.likes = request.get("likes", 0)
105
  self.num_params = request.get("params", 0)
106
  self.date = request.get("submitted_time", "")
107
- except Exception:
108
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
 
109
 
110
  def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
- AutoEvalColumn.average.name: average,
123
- AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
- AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
  }
128
 
129
- for task in Tasks:
130
  data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
 
132
  return data_dict
@@ -160,7 +152,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
160
 
161
  for root, _, files in os.walk(results_path):
162
  # We should only have json files in model results
163
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
  continue
165
 
166
  # Sort the files by date
@@ -169,8 +161,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
169
  except dateutil.parser._parser.ParserError:
170
  files = [files[-1]]
171
 
172
- for file in files:
173
- model_result_filepaths.append(os.path.join(root, file))
174
 
175
  eval_results = {}
176
  for model_result_filepath in model_result_filepaths:
@@ -181,7 +172,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
181
  # Store results of same eval together
182
  eval_name = eval_result.eval_name
183
  if eval_name in eval_results.keys():
184
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
 
185
  else:
186
  eval_results[eval_name] = eval_result
187
 
 
1
  import glob
2
  import json
 
3
  import os
4
  from dataclasses import dataclass
5
 
 
6
  import numpy as np
7
+ import dateutil
8
 
9
+ import src.display.formatting as formatting
10
+ import src.display.utils as utils
11
+ import src.submission.check_validity as check_validity
12
 
13
 
14
  @dataclass
15
  class EvalResult:
16
+ eval_name: str # org_model_precision (uid)
17
+ full_model: str # org/model (path on hub)
18
+ org: str
 
 
19
  model: str
20
+ revision: str # commit hash, "" if main
21
  results: dict
22
+ precision: utils.Precision = utils.Precision.Unknown
23
+ model_type: utils.ModelType = utils.ModelType.Unknown # Pretrained, fine tuned, ...
24
+ weight_type: utils.WeightType = utils.WeightType.Original # Original or Adapter
25
+ architecture: str = "Unknown"
26
  license: str = "?"
27
  likes: int = 0
28
  num_params: int = 0
29
+ date: str = "" # submission date of request file
30
  still_on_hub: bool = False
31
 
32
  @classmethod
 
38
  config = data.get("config")
39
 
40
  # Precision
41
+ precision = utils.Precision.from_str(config.get("model_dtype"))
42
 
43
  # Get model and org
44
+ full_model = config.get("model_name", config.get("model_args", None))
45
+ org, model = full_model.split("/", 1) if "/" in full_model else (None, full_model)
46
 
47
+ if org:
 
 
 
 
 
 
48
  result_key = f"{org}_{model}_{precision.value.name}"
49
+ else:
50
+ result_key = f"{model}_{precision.value.name}"
51
 
52
+ still_on_hub, _, model_config = check_validity.is_model_on_hub(
53
+ full_model, config.get("model_sha", "main"), trust_remote_code=True,
54
+ test_tokenizer=False)
55
+
56
+ if model_config:
57
+ architecture = ";".join(getattr(model_config, "architectures", ["?"]))
58
+ else:
59
+ architecture = "?"
60
 
61
  # Extract results available in this file (some results are split in several files)
62
  results = {}
63
+ for task in utils.Tasks:
64
  task = task.value
65
 
66
  # We average all scores of a given metric (not all metrics are present in all files)
67
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
 
 
68
 
69
+ results[task.benchmark] = accs
 
70
 
71
  return self(
72
  eval_name=result_key,
 
74
  org=org,
75
  model=model,
76
  results=results,
77
+ precision=precision,
78
  revision= config.get("model_sha", ""),
79
  still_on_hub=still_on_hub,
80
  architecture=architecture
 
82
 
83
  def update_with_request_file(self, requests_path):
84
  """Finds the relevant request file for the current model and updates info with it"""
85
+ request_file = get_request_file_for_model(requests_path, self.full_model,
86
+ self.precision.value.name)
87
 
88
  try:
89
  with open(request_file, "r") as f:
90
  request = json.load(f)
91
+ self.model_type = utils.ModelType.from_str(request.get("model_type", ""))
92
+ self.weight_type = utils.WeightType[request.get("weight_type", "Original")]
93
  self.license = request.get("license", "?")
94
  self.likes = request.get("likes", 0)
95
  self.num_params = request.get("params", 0)
96
  self.date = request.get("submitted_time", "")
97
+ except FileNotFoundError:
98
+ print(f"Could not find request file for {self.org}/{self.model}")
99
+ except json.JSONDecodeError:
100
+ print(f"Error decoding JSON in request file for {self.org}/{self.model}")
101
 
102
  def to_dict(self):
103
  """Converts the Eval Result to a dict compatible with our dataframe display"""
104
+
105
  data_dict = {
106
  "eval_name": self.eval_name, # not a column, just a save name,
107
+ utils.AutoEvalColumn.precision.name: self.precision.value.name,
108
+ utils.AutoEvalColumn.model_type.name: self.model_type.value.name,
109
+ utils.AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
110
+ utils.AutoEvalColumn.weight_type.name: self.weight_type.value.name,
111
+ utils.AutoEvalColumn.architecture.name: self.architecture,
112
+ utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
113
+ utils.AutoEvalColumn.dummy.name: self.full_model,
114
+ utils.AutoEvalColumn.revision.name: self.revision,
115
+ utils.AutoEvalColumn.license.name: self.license,
116
+ utils.AutoEvalColumn.likes.name: self.likes,
117
+ utils.AutoEvalColumn.params.name: self.num_params,
118
+ utils.AutoEvalColumn.still_on_hub.name: self.still_on_hub,
119
  }
120
 
121
+ for task in utils.Tasks:
122
  data_dict[task.value.col_name] = self.results[task.value.benchmark]
123
 
124
  return data_dict
 
152
 
153
  for root, _, files in os.walk(results_path):
154
  # We should only have json files in model results
155
+ if not files or any([not f.endswith(".json") for f in files]):
156
  continue
157
 
158
  # Sort the files by date
 
161
  except dateutil.parser._parser.ParserError:
162
  files = [files[-1]]
163
 
164
+ model_result_filepaths.extend([os.path.join(root, file) for file in files])
 
165
 
166
  eval_results = {}
167
  for model_result_filepath in model_result_filepaths:
 
172
  # Store results of same eval together
173
  eval_name = eval_result.eval_name
174
  if eval_name in eval_results.keys():
175
+ eval_results[eval_name].results.update({k: v for k, v in
176
+ eval_result.results.items() if v is not None})
177
  else:
178
  eval_results[eval_name] = eval_result
179
 
src/populate.py CHANGED
@@ -3,27 +3,25 @@ import os
3
 
4
  import pandas as pd
5
 
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
  df = df[cols].round(decimals=2)
19
 
20
  # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
 
24
 
25
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
- """Creates the different dataframes for the evaluation queues requestes"""
27
  entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
  all_evals = []
29
 
@@ -33,8 +31,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
33
  with open(file_path) as fp:
34
  data = json.load(fp)
35
 
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
 
39
  all_evals.append(data)
40
  elif ".md" not in entry:
@@ -45,8 +43,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
45
  with open(file_path) as fp:
46
  data = json.load(fp)
47
 
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
  all_evals.append(data)
51
 
52
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
 
3
 
4
  import pandas as pd
5
 
6
+ import src.display.formatting as formatting
7
+ import src.display.utils as utils
8
+ import src.leaderboard.read_evals as read_evals
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
+ raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
 
13
  all_data_json = [v.to_dict() for v in raw_data]
14
 
15
  df = pd.DataFrame.from_records(all_data_json)
16
+ df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
17
  df = df[cols].round(decimals=2)
18
 
19
  # filter out if any of the benchmarks have not been produced
20
+ df = df[formatting.has_no_nan_values(df, benchmark_cols)]
21
+ return raw_data, df
22
 
23
 
24
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
25
  entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
26
  all_evals = []
27
 
 
31
  with open(file_path) as fp:
32
  data = json.load(fp)
33
 
34
+ data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
35
+ data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
36
 
37
  all_evals.append(data)
38
  elif ".md" not in entry:
 
43
  with open(file_path) as fp:
44
  data = json.load(fp)
45
 
46
+ data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
47
+ data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
48
  all_evals.append(data)
49
 
50
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
src/submission/check_validity.py CHANGED
@@ -1,14 +1,12 @@
1
  import json
2
  import os
3
- import re
4
  from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
 
7
  import huggingface_hub
8
  from huggingface_hub import ModelCard
9
  from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
 
13
  def check_model_card(repo_id: str) -> tuple[bool, str]:
14
  """Checks if the model card and license exist and have been filled"""
@@ -31,8 +29,8 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
31
 
32
  return True, ""
33
 
 
34
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
  try:
37
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
  if test_tokenizer:
@@ -56,7 +54,8 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
56
  )
57
 
58
  except Exception as e:
59
- return False, "was not found on hub!", None
 
60
 
61
 
62
  def get_model_size(model_info: ModelInfo, precision: str):
@@ -75,7 +74,6 @@ def get_model_arch(model_info: ModelInfo):
75
  return model_info.config.get("architectures", "Unknown")
76
 
77
  def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
  depth = 1
80
  file_names = []
81
  users_to_submission_dates = defaultdict(list)
 
1
  import json
2
  import os
 
3
  from collections import defaultdict
 
4
 
5
  import huggingface_hub
6
  from huggingface_hub import ModelCard
7
  from huggingface_hub.hf_api import ModelInfo
8
+ from transformers import AutoConfig, AutoTokenizer
9
+ from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
10
 
11
  def check_model_card(repo_id: str) -> tuple[bool, str]:
12
  """Checks if the model card and license exist and have been filled"""
 
29
 
30
  return True, ""
31
 
32
+
33
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
 
34
  try:
35
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
36
  if test_tokenizer:
 
54
  )
55
 
56
  except Exception as e:
57
+ return False, f"was not found on hub!: {e}", None
58
+
59
 
60
 
61
  def get_model_size(model_info: ModelInfo, precision: str):
 
74
  return model_info.config.get("architectures", "Unknown")
75
 
76
  def already_submitted_models(requested_models_dir: str) -> set[str]:
 
77
  depth = 1
78
  file_names = []
79
  users_to_submission_dates = defaultdict(list)
src/submission/submit.py CHANGED
@@ -2,14 +2,10 @@ import json
2
  import os
3
  from datetime import datetime, timezone
4
 
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
 
14
  REQUESTED_MODELS = None
15
  USERS_TO_SUBMISSION_DATES = None
@@ -25,7 +21,7 @@ def add_new_eval(
25
  global REQUESTED_MODELS
26
  global USERS_TO_SUBMISSION_DATES
27
  if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
 
30
  user_name = ""
31
  model_path = model
@@ -37,7 +33,7 @@ def add_new_eval(
37
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
 
39
  if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
 
42
  # Does the model actually exist?
43
  if revision == "":
@@ -45,32 +41,32 @@ def add_new_eval(
45
 
46
  # Is the model on the hub?
47
  if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
  if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
 
52
  if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
  if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
 
57
  # Is the model info correctly filled?
58
  try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
  except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
 
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
 
65
  # Were the model card and license filled?
66
  try:
67
  license = model_info.cardData["license"]
68
  except Exception:
69
- return styled_error("Please select a license for your model")
70
 
71
- modelcard_OK, error_msg = check_model_card(model)
72
  if not modelcard_OK:
73
- return styled_error(error_msg)
74
 
75
  # Seems good, creating the eval
76
  print("Adding new eval")
@@ -87,15 +83,15 @@ def add_new_eval(
87
  "likes": model_info.likes,
88
  "params": model_size,
89
  "license": license,
90
- "private": False,
91
  }
92
 
93
  # Check for duplicate submission
94
  if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
 
97
  print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
 
99
  os.makedirs(OUT_DIR, exist_ok=True)
100
  out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
 
@@ -103,10 +99,10 @@ def add_new_eval(
103
  f.write(json.dumps(eval_entry))
104
 
105
  print("Uploading eval file")
106
- API.upload_file(
107
  path_or_fileobj=out_path,
108
  path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
  repo_type="dataset",
111
  commit_message=f"Add {model} to eval queue",
112
  )
@@ -114,6 +110,6 @@ def add_new_eval(
114
  # Remove the local file
115
  os.remove(out_path)
116
 
117
- return styled_message(
118
  "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
  )
 
2
  import os
3
  from datetime import datetime, timezone
4
 
5
+ import src.display.formatting as formatting
6
+ import src.envs as envs
7
+ import src.submission.check_validity as check_validity
8
+
 
 
 
 
9
 
10
  REQUESTED_MODELS = None
11
  USERS_TO_SUBMISSION_DATES = None
 
21
  global REQUESTED_MODELS
22
  global USERS_TO_SUBMISSION_DATES
23
  if not REQUESTED_MODELS:
24
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = check_validity.already_submitted_models(envs.EVAL_REQUESTS_PATH)
25
 
26
  user_name = ""
27
  model_path = model
 
33
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
34
 
35
  if model_type is None or model_type == "":
36
+ return formatting.styled_error("Please select a model type.")
37
 
38
  # Does the model actually exist?
39
  if revision == "":
 
41
 
42
  # Is the model on the hub?
43
  if weight_type in ["Delta", "Adapter"]:
44
+ base_model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=base_model, revision=revision, token=envs.TOKEN, test_tokenizer=True)
45
  if not base_model_on_hub:
46
+ return formatting.styled_error(f'Base model "{base_model}" {error}')
47
 
48
  if not weight_type == "Adapter":
49
+ model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=model, revision=revision, token=envs.TOKEN, test_tokenizer=True)
50
  if not model_on_hub:
51
+ return formatting.styled_error(f'Model "{model}" {error}')
52
 
53
  # Is the model info correctly filled?
54
  try:
55
+ model_info = envs.API.model_info(repo_id=model, revision=revision)
56
  except Exception:
57
+ return formatting.styled_error("Could not get your model information. Please fill it up properly.")
58
 
59
+ model_size = check_validity.get_model_size(model_info=model_info, precision=precision)
60
 
61
  # Were the model card and license filled?
62
  try:
63
  license = model_info.cardData["license"]
64
  except Exception:
65
+ return formatting.styled_error("Please select a license for your model")
66
 
67
+ modelcard_OK, error_msg = check_validity.check_model_card(model)
68
  if not modelcard_OK:
69
+ return formatting.styled_error(error_msg)
70
 
71
  # Seems good, creating the eval
72
  print("Adding new eval")
 
83
  "likes": model_info.likes,
84
  "params": model_size,
85
  "license": license,
 
86
  }
87
 
88
  # Check for duplicate submission
89
  if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
90
+ return formatting.styled_warning("This model has already been submitted.")
91
 
92
  print("Creating eval file")
93
+
94
+ OUT_DIR = f"{envs.EVAL_REQUESTS_PATH}/{user_name}"
95
  os.makedirs(OUT_DIR, exist_ok=True)
96
  out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
97
 
 
99
  f.write(json.dumps(eval_entry))
100
 
101
  print("Uploading eval file")
102
+ envs.API.upload_file(
103
  path_or_fileobj=out_path,
104
  path_in_repo=out_path.split("eval-queue/")[1],
105
+ repo_id=envs.QUEUE_REPO,
106
  repo_type="dataset",
107
  commit_message=f"Add {model} to eval queue",
108
  )
 
110
  # Remove the local file
111
  os.remove(out_path)
112
 
113
+ return formatting.styled_message(
114
  "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
115
  )
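
A practical consequence of replacing the from-imports with module-level imports (import src.envs as envs, import src.submission.check_validity as check_validity) is that submit.py now resolves envs.API, envs.EVAL_REQUESTS_PATH and the check_validity helpers at call time, so tests can swap them out without reloading the module. A minimal pytest-style sketch under stated assumptions: add_new_eval keeps the usual leaderboard signature (model, base_model, revision, precision, weight_type, model_type), styled_error embeds its message in the returned string, and the stubbed repo id and values are placeholders:

import src.envs as envs
import src.submission.check_validity as check_validity
import src.submission.submit as submit


def test_add_new_eval_reports_missing_model(monkeypatch, tmp_path):
    # submit.py looks these up through the module objects at call time, so
    # monkeypatching the attributes is enough; no module reload is needed.
    monkeypatch.setattr(envs, "EVAL_REQUESTS_PATH", str(tmp_path))
    monkeypatch.setattr(
        check_validity,
        "is_model_on_hub",
        lambda **kwargs: (False, "was not found on the hub: stubbed", None),
    )

    result = submit.add_new_eval(
        model="someorg/missing-model",  # placeholder repo id
        base_model="",
        revision="main",
        precision="float16",
        weight_type="Original",
        model_type="pretrained",
    )

    # The stubbed error message should be surfaced in the styled response.
    assert "was not found" in result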