mangopy committed
Commit cdfebc6 · verified · 1 parent: 39b5b1c

Upload 6 files

Files changed (6)
  1. .gitattributes +1 -0
  2. README.md +4 -4
  3. app.py +156 -0
  4. requirements.txt +152 -0
  5. src/jsonl_Indexer.py +125 -0
  6. tool-embedding.jsonl +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tool-embedding.jsonl filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: ToolRet Demo
-emoji: 📉
-colorFrom: green
-colorTo: gray
+title: Tool Retriever
+emoji: 😻
+colorFrom: blue
+colorTo: pink
 sdk: streamlit
 sdk_version: 1.42.2
 app_file: app.py
app.py ADDED
@@ -0,0 +1,156 @@
+import os
+import sys
+import faiss
+import numpy as np
+import streamlit as st
+import pandas as pd
+from text2vec import SentenceModel
+from src.jsonl_Indexer import JSONLIndexer
+
+def get_cli_args():
+    args = {}
+    argv = sys.argv[2:] if len(sys.argv) > 2 else []
+    for arg in argv:
+        if '=' in arg:
+            key, value = arg.split('=', 1)
+            args[key.strip()] = value.strip()
+    return args
+
+cli_args = get_cli_args()
+
+DEFAULT_CONFIG = {
+    'model_path': 'BAAI/bge-base-en-v1.5',
+    'dataset_path': 'tool-embedding.jsonl',
+    'vector_size': 768,
+    'embedding_field': 'embedding',
+    'id_field': 'id'
+}
+
+config = DEFAULT_CONFIG.copy()
+config.update(cli_args)
+config['vector_size'] = int(config['vector_size'])
+
+# ---------------------------
+# Cached dataset loader (avoids re-downloading the data on every rerun)
+# ---------------------------
+@st.cache_data
+def load_tools_datasets():
+    from datasets import load_dataset, concatenate_datasets
+    ds1 = load_dataset("mangopy/ToolRet-Tools", "code")
+    ds2 = load_dataset("mangopy/ToolRet-Tools", "customized")
+    ds3 = load_dataset("mangopy/ToolRet-Tools", "web")
+    ds = concatenate_datasets([ds1['tools'], ds2['tools'], ds3['tools']])
+    # Rename the 'id' column to 'tool'
+    ds = ds.rename_columns({'id': 'tool'})
+    return ds
+
+ds = load_tools_datasets()
+df2 = ds.to_pandas()
+# With a large dataset, setting an index speeds up the later join
+df2.set_index('tool', inplace=True)
+
+# ---------------------------
+# Cached model loader
+# ---------------------------
+@st.cache_resource
+def get_model(model_path: str = config['model_path']):
+    return SentenceModel(model_path)
+
+# Cached retriever factory
+@st.cache_resource
+def create_retriever(vector_sz: int, dataset_path: str, embedding_field: str, id_field: str, _model):
+    retriever = JSONLIndexer(vector_sz=vector_sz, model=_model)
+    retriever.load_jsonl(dataset_path, embedding_field=embedding_field, id_field=id_field)
+    return retriever
+
+# ---------------------------
+# Sidebar configuration
+# ---------------------------
+st.sidebar.markdown("<div style='text-align: center;'><h3>📄 Model Configuration</h3></div>", unsafe_allow_html=True)
+model_options = ["BAAI/bge-base-en-v1.5"]
+selected_model = st.sidebar.selectbox("Select Model", model_options)
+st.sidebar.write("Selected model:", selected_model)
+st.sidebar.write("Embedding length: 768")
+
+# Use the model selected in the dropdown (avoids reloading)
+model = get_model(selected_model)
+retriever = create_retriever(
+    config['vector_size'],
+    config['dataset_path'],
+    config['embedding_field'],
+    config['id_field'],
+    _model=model
+)
+
+# ---------------------------
+# Page styling
+# ---------------------------
+st.markdown("""
+<style>
+.search-container {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    gap: 10px;
+    margin-top: 20px;
+}
+.search-box input {
+    width: 500px !important;
+    height: 45px;
+    font-size: 16px;
+    border-radius: 25px;
+    padding-left: 15px;
+}
+.search-btn button {
+    height: 45px;
+    font-size: 16px;
+    border-radius: 25px;
+}
+</style>
+""", unsafe_allow_html=True)
+
+st.markdown("<h1 style='text-align: center;'>🔍 Tool Retrieval</h1>", unsafe_allow_html=True)
+
+# ---------------------------
+# Main search area
+# ---------------------------
+col1, col2 = st.columns([4, 1])
+with col1:
+    query = st.text_input("", placeholder="Enter your search query...", key="search_query", label_visibility="collapsed")
+with col2:
+    search_clicked = st.button("🔎 Search", use_container_width=True)
+
+top_k = st.slider("Top-K tools", 1, 100, 50, help="Choose the number of results to display")
+
+if search_clicked and query:
+    rec_ids, scores = retriever.search_return_id(query, top_k)
+    # Build the retrieval-results DataFrame
+    df1 = pd.DataFrame({"relevance": scores, "tool": rec_ids})
+    # The join is fast because df2 is already indexed on 'tool'
+    results_df = df1.join(df2, on='tool', how='left').reset_index(drop=False)
+
+    st.subheader("🗂️ Retrieval results")
+
+    styled_results = results_df.style.apply(
+        lambda x: [
+            "background-color: #F7F7F7" if i % 2 == 0 else "background-color: #FFFFFF"
+            for i in range(len(x))
+        ],
+        axis=0,
+    ).format({"relevance": "{:.4f}"})
+
+    st.dataframe(
+        styled_results,
+        column_config={
+            "relevance": st.column_config.ProgressColumn(
+                "relevance",
+                help="How well the record matches the query",
+                format="%.4f",
+                min_value=0,
+                max_value=float(max(scores)) if len(scores) > 0 else 1,
+            ),
+            "tool": st.column_config.TextColumn("tool", help="tool help text", width="medium")
+        },
+        hide_index=True,
+        use_container_width=True,
+    )
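A note on the CLI handling above: get_cli_args() starts parsing at sys.argv[2], so the first extra argument passed to the script is skipped. A minimal standalone sketch of that parsing behavior (the argv values here are assumptions for illustration, not part of the commit):

```python
import sys

def get_cli_args():
    # Same parser as app.py: collect key=value pairs from sys.argv[2:]
    args = {}
    argv = sys.argv[2:] if len(sys.argv) > 2 else []
    for arg in argv:
        if '=' in arg:
            key, value = arg.split('=', 1)
            args[key.strip()] = value.strip()
    return args

# Hypothetical argv: the element at index 1 ("skipped") is ignored by design.
sys.argv = ["app.py", "skipped", "vector_size=768", "dataset_path=tool-embedding.jsonl"]
print(get_cli_args())
# -> {'vector_size': '768', 'dataset_path': 'tool-embedding.jsonl'}
```

Parsed values arrive as strings, which is why app.py casts config['vector_size'] back to int after merging the overrides into DEFAULT_CONFIG.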
requirements.txt ADDED
@@ -0,0 +1,152 @@
+aiohappyeyeballs==2.4.3
+aiohttp==3.11.0
+aiosignal==1.3.1
+altair==5.4.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+asttokens==2.4.1
+async-timeout==5.0.1
+attrs==24.2.0
+blessed==1.20.0
+blinker==1.9.0
+branca==0.8.0
+cachetools==5.5.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+comm==0.2.2
+contourpy==1.3.0
+cycler==0.12.1
+datasets==3.1.0
+debugpy==1.8.8
+decorator==5.1.1
+dill==0.3.8
+distro==1.9.0
+et_xmlfile==2.0.0
+exceptiongroup==1.2.2
+executing==2.1.0
+f==0.0.1
+faiss-gpu==1.7.2
+filelock==3.16.1
+folium==0.18.0
+fonttools==4.54.1
+frozenlist==1.5.0
+fsspec==2024.9.0
+geopandas==1.0.1
+gitdb==4.0.11
+GitPython==3.1.43
+gpustat==1.1.1
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.2
+idna==3.10
+importlib_metadata==8.5.0
+importlib_resources==6.4.5
+ipykernel==6.29.5
+ipython==8.18.1
+jedi==0.19.2
+jieba==0.42.1
+Jinja2==3.1.4
+jiter==0.7.1
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+kiwisolver==1.4.7
+loguru==0.7.2
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib==3.9.2
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+narwhals==1.13.5
+nest-asyncio==1.6.0
+networkx==3.2.1
+numpy==1.26.0
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-ml-py==12.560.30
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==0.28.0
+openpyxl==3.1.5
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.0.0
+platformdirs==4.3.6
+prettytable==3.12.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+protobuf==5.28.3
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==18.0.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydeck==0.9.1
+pyecharts==2.0.7
+Pygments==2.18.0
+pyogrio==0.10.0
+pyparsing==3.2.0
+pyproj==3.6.1
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.0
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.21.0
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.13.1
+seaborn==0.13.2
+shapely==2.0.6
+simplejson==3.19.3
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+stack-data==0.6.3
+streamlit==1.40.1
+streamlit-echarts==0.4.0
+streamlit-option-menu==0.4.0
+streamlit_folium==0.23.1
+sympy==1.13.1
+tenacity==9.0.0
+text2vec==1.3.1
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+toml==0.10.2
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.67.0
+traitlets==5.14.3
+transformers==4.46.2
+triton==3.1.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+watchdog==6.0.0
+wcwidth==0.2.13
+wordcloud==1.9.4
+xxhash==3.5.0
+xyzservices==2024.9.0
+yarl==1.17.1
+zipp==3.21.0
src/jsonl_Indexer.py ADDED
@@ -0,0 +1,125 @@
+import os
+import json
+import faiss
+import numpy as np
+from typing import List, Tuple
+from text2vec import SentenceModel
+
+class JSONLIndexer(object):
+    """
+    Retriever over a JSONL file of precomputed embeddings.
+    """
+    def __init__(self, vector_sz: int, n_subquantizers=0, n_bits=8, model: SentenceModel = None, **kwargs):
+        """
+        Initialize the indexer and choose which FAISS index type to use.
+        :param vector_sz: dimensionality of the embedding vectors
+        :param n_subquantizers: number of subquantizers
+        :param n_bits: number of bits per subvector
+        :param model: SentenceModel used to re-embed incoming queries
+        """
+        if n_subquantizers > 0:
+            self.index = faiss.IndexPQ(vector_sz, n_subquantizers, n_bits, faiss.METRIC_INNER_PRODUCT)
+        else:
+            self.index = faiss.IndexFlatIP(vector_sz)
+        self.index_id_to_data = []  # maps FAISS index IDs to JSON record indices
+        self.data = []  # stores all JSON objects
+        self.model = model
+
+        print(f'Initialized FAISS index of type {type(self.index)}')
+
+    def load_jsonl(self, dataset_path: str, embedding_field: str = "embedding", id_field: str = "id") -> None:
+        """
+        Load a JSONL file and build the FAISS index from its precomputed embeddings.
+        :param dataset_path: path to the JSONL file
+        :param embedding_field: name of the field that stores the embedding
+        :param id_field: field treated as the retrievable text (assumed here to be the id)
+        """
+        print(f'📂 Loading JSONL file: {dataset_path}...')
+        # Read the JSONL file line by line
+        with open(dataset_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                record = json.loads(line)
+                self.data.append(record)
+        total = len(self.data)
+        print(f'✅ Loaded {total} records from {dataset_path}.')
+
+        # Extract the precomputed embedding directly from each JSON object
+        embeddings_list = []
+        for rec in self.data:
+            emb = rec.get(embedding_field, [])
+            # Check that the embedding length matches the index dimensionality
+            if len(emb) != self.index.d and self.index.ntotal == 0:
+                # On a first-add length mismatch, handle as needed (e.g. raise or skip)
+                raise ValueError(f"Embedding length mismatch. Expected {self.index.d}, got {len(emb)}.")
+            embeddings_list.append(np.array(emb, dtype=np.float32))
+        embeddings = np.stack(embeddings_list, axis=0)
+        print(f'✅ Embeddings loaded, shape: {embeddings.shape}. Indexing data...')
+
+        # Build the FAISS index over the data
+        ids = list(range(total))
+        self.index_data(ids, embeddings)
+        print('🎉 Indexing complete!')
+
+    def index_data(self, ids: List[int], embeddings: np.array, **kwargs):
+        """
+        Add precomputed embeddings to the FAISS index.
+        :param ids: index number of each record (here list(range(total)))
+        :param embeddings: matrix of embeddings for the records
+        """
+        self._update_id_mapping(ids)
+        embeddings = embeddings.astype('float32')
+
+        # Train the index first if it is untrained
+        if not self.index.is_trained:
+            print('⚙️ Training FAISS index...')
+            self.index.train(embeddings)
+            print('✅ FAISS index trained.')
+        self.index.add(embeddings)
+        print(f'✅ Indexed {len(self.index_id_to_data)} records.')
+
+    def _update_id_mapping(self, row_ids: List[int]):
+        """Update the mapping from FAISS index IDs to JSON record indices"""
+        self.index_id_to_data.extend(row_ids)
+
+    def search_return_id(self, query: str, top_docs: int) -> Tuple[List[str], List[float]]:
+        """
+        Return the ids and similarity scores of the JSON records most similar to the query.
+        :param query: query text
+        :param top_docs: number of nearest neighbors to return
+        :return: (list of record ids, list of scores)
+        """
+        db_indices, scores = self.search(query, top_docs)
+        # Assumes the retrievable text is the 'id' field of each JSON object
+        result_ids = [self.data[i]["id"] for i in db_indices]
+        return result_ids, scores
+
+    def search(self, query: str, top_docs: int) -> Tuple[List[int], List[float]]:
+        """
+        Re-embed the query, then search the FAISS index.
+        :param query: query text
+        :param top_docs: number of nearest neighbors to return
+        :return: (list of JSON record indices, list of similarity scores)
+        """
+        # Only the query embedding is computed at search time
+        query_vector = self.model.encode(query).astype('float32').reshape(1, -1)
+        scores, indexes = self.index.search(query_vector, top_docs)
+        scores = scores[0]
+        indexes = indexes[0]
+        db_indices = [self.index_id_to_data[i] for i in indexes]
+        return db_indices, scores
+
+# Example usage
+if __name__ == '__main__':
+    model = SentenceModel("BAAI/bge-base-en-v1.5")
+    vector_size = 768  # set this to your model's embedding dimensionality
+
+    indexer = JSONLIndexer(vector_sz=vector_size, model=model)
+    jsonl_path = "tool-embedding.jsonl"  # replace with the actual JSONL file path
+    indexer.load_jsonl(jsonl_path)
+
+    query = "your search query here"
+    ids, scores = indexer.search_return_id(query, top_docs=5)
+    print("Retrieval results:", list(zip(ids, scores)))
tool-embedding.jsonl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e172fd628eba5d4d59d1b467e58827a9874e981a4667e3e297ca4c01f1b275f5
+size 674988141
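What is committed here is only the Git LFS pointer; the actual ~675 MB JSONL payload is fetched at checkout. A quick sanity check after fetching (a sketch, with field names and dimensionality assumed from app.py's defaults) confirms each line matches what JSONLIndexer.load_jsonl and IndexFlatIP(768) expect:

```python
import json

# Inspect the first record of the LFS-resolved file (not the pointer lines above).
with open("tool-embedding.jsonl", encoding="utf-8") as f:
    first = json.loads(next(f))

print(sorted(first.keys()))     # expected to include 'embedding' and 'id'
print(len(first["embedding"]))  # expected: 768, matching config['vector_size']
```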