Spaces:

Yyy0530
/

tool_retriever

Running

App Files Files Community

Yyy0530 committed 11 days ago

Commit

9ab9c77

1 Parent(s): 5ad0dae

添加 JSONL 文件检索器和 Streamlit 应用，支持基于预计算 embedding 的相似记录检索

Browse files

Files changed (3) hide show

requirements.txt +152 -0
src/jsonl_Indexer.py +125 -0
streamlit_jsonl_retriever.py +95 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,152 @@

+aiohappyeyeballs==2.4.3
+aiohttp==3.11.0
+aiosignal==1.3.1
+altair==5.4.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+asttokens==2.4.1
+async-timeout==5.0.1
+attrs==24.2.0
+blessed==1.20.0
+blinker==1.9.0
+branca==0.8.0
+cachetools==5.5.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+comm==0.2.2
+contourpy==1.3.0
+cycler==0.12.1
+datasets==3.1.0
+debugpy==1.8.8
+decorator==5.1.1
+dill==0.3.8
+distro==1.9.0
+et_xmlfile==2.0.0
+exceptiongroup==1.2.2
+executing==2.1.0
+f==0.0.1
+faiss-gpu==1.7.2
+filelock==3.16.1
+folium==0.18.0
+fonttools==4.54.1
+frozenlist==1.5.0
+fsspec==2024.9.0
+geopandas==1.0.1
+gitdb==4.0.11
+GitPython==3.1.43
+gpustat==1.1.1
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.2
+idna==3.10
+importlib_metadata==8.5.0
+importlib_resources==6.4.5
+ipykernel==6.29.5
+ipython==8.18.1
+jedi==0.19.2
+jieba==0.42.1
+Jinja2==3.1.4
+jiter==0.7.1
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+kiwisolver==1.4.7
+loguru==0.7.2
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib==3.9.2
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+narwhals==1.13.5
+nest-asyncio==1.6.0
+networkx==3.2.1
+numpy==1.26.0
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-ml-py==12.560.30
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==0.28.0
+openpyxl==3.1.5
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.0.0
+platformdirs==4.3.6
+prettytable==3.12.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+protobuf==5.28.3
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==18.0.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydeck==0.9.1
+pyecharts==2.0.7
+Pygments==2.18.0
+pyogrio==0.10.0
+pyparsing==3.2.0
+pyproj==3.6.1
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.0
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.21.0
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.13.1
+seaborn==0.13.2
+shapely==2.0.6
+simplejson==3.19.3
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+stack-data==0.6.3
+streamlit==1.40.1
+streamlit-echarts==0.4.0
+streamlit-option-menu==0.4.0
+streamlit_folium==0.23.1
+sympy==1.13.1
+tenacity==9.0.0
+text2vec==1.3.1
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+toml==0.10.2
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.67.0
+traitlets==5.14.3
+transformers==4.46.2
+triton==3.1.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+watchdog==6.0.0
+wcwidth==0.2.13
+wordcloud==1.9.4
+xxhash==3.5.0
+xyzservices==2024.9.0
+yarl==1.17.1
+zipp==3.21.0

src/jsonl_Indexer.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import os
+import json
+import faiss
+import numpy as np
+from typing import List, Tuple
+from text2vec import SentenceModel
+class JSONLIndexer(object):
+    """
+    Retriever over a JSONL file whose records carry precomputed embeddings.
+    Loaded records are kept in ``self.data``; their embeddings are added to a
+    FAISS inner-product index, and ``self.index_id_to_data`` maps each FAISS
+    row number to the position of its record in ``self.data``.
+    """
+    def __init__(self, vector_sz: int, n_subquantizers=0, n_bits=8, model: "SentenceModel" = None, **kwargs):
+        """
+        Create the FAISS index.
+        :param vector_sz: dimensionality of the embedding vectors
+        :param n_subquantizers: number of product-quantizer sub-quantizers; 0 selects a flat index
+        :param n_bits: bits per sub-vector code (only used when n_subquantizers > 0)
+        :param model: SentenceModel used to embed the query text at search time
+        """
+        if n_subquantizers > 0:
+            self.index = faiss.IndexPQ(vector_sz, n_subquantizers, n_bits, faiss.METRIC_INNER_PRODUCT)
+        else:
+            self.index = faiss.IndexFlatIP(vector_sz)
+        self.index_id_to_data = []  # FAISS row number -> position in self.data
+        self.data = []  # every loaded JSON object
+        self.id_field = "id"  # key returned by search_return_id; updated by load_jsonl
+        self.model = model
+        print(f'Initialized FAISS index of type {type(self.index)}')
+    def load_jsonl(self, dataset_path: str, embedding_field: str = "embedding", id_field: str = "id") -> None:
+        """
+        Load a JSONL file and add its precomputed embeddings to the index.
+        Only the records read by this call are indexed, so a second call appends
+        to the index instead of adding the earlier records again. Nothing is
+        stored when a record fails validation.
+        :param dataset_path: path to the JSONL file (one JSON object per line; blank lines are skipped)
+        :param embedding_field: key under which each record stores its embedding
+        :param id_field: key whose value search_return_id reports for each hit
+        :raises ValueError: if a record's embedding length differs from the index dimension
+        """
+        print(f'📂 Loading JSONL file: {dataset_path}...')
+        # Parse into a local list first so a bad line cannot leave self.data half-updated
+        records = []
+        with open(dataset_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    records.append(json.loads(line))
+        print(f'✅ Loaded {len(records)} records from {dataset_path}.')
+        if not records:
+            # np.stack rejects an empty list; an empty file simply contributes nothing
+            self.id_field = id_field
+            return
+        # Every record is checked, not only those seen while the index is still empty
+        dim = self.index.d
+        embeddings_list = []
+        for rec in records:
+            emb = rec.get(embedding_field) or []
+            if len(emb) != dim:
+                raise ValueError(f"Embedding length mismatch. Expected {dim}, got {len(emb)}.")
+            embeddings_list.append(np.array(emb, dtype=np.float32))
+        embeddings = np.stack(embeddings_list, axis=0)
+        print(f'✅ Embeddings loaded, shape: {embeddings.shape}. Indexing data...')
+        # New records are appended after the existing ones, so their positions start at len(self.data)
+        start = len(self.data)
+        self.index_data(list(range(start, start + len(records))), embeddings)
+        self.data.extend(records)
+        self.id_field = id_field
+        print('🎉 Indexing complete!')
+    def index_data(self, ids: List[int], embeddings: np.array, **kwargs):
+        """
+        Add precomputed embeddings to the FAISS index.
+        :param ids: position in self.data of the record behind each row of `embeddings`
+        :param embeddings: 2-D array holding one embedding per row
+        :raises ValueError: if `ids` and `embeddings` do not have the same number of entries
+        """
+        embeddings = np.ascontiguousarray(embeddings, dtype='float32')
+        if len(ids) != embeddings.shape[0]:
+            raise ValueError(f"Got {len(ids)} ids for {embeddings.shape[0]} embeddings.")
+        # Train first when the index type requires it
+        if not self.index.is_trained:
+            print('⚙️ Training FAISS index...')
+            self.index.train(embeddings)
+            print('✅ FAISS index trained.')
+        self.index.add(embeddings)
+        # The mapping is extended only after the add succeeded, so index rows and mapping cannot drift apart
+        self._update_id_mapping(ids)
+        print(f'✅ Indexed {len(self.index_id_to_data)} records.')
+    def _update_id_mapping(self, row_ids: List[int]):
+        """Append `row_ids` to the FAISS-row -> record-position mapping."""
+        self.index_id_to_data.extend(row_ids)
+    def search_return_id(self, query: str, top_docs: int) -> Tuple[List[str], np.ndarray]:
+        """
+        Return the id-field values and scores of the records most similar to `query`.
+        :param query: query text
+        :param top_docs: maximum number of nearest neighbours to return
+        :return: (id-field value of each hit, score of each hit)
+        """
+        db_indices, scores = self.search(query, top_docs)
+        # Report the field chosen in load_jsonl rather than a hard-coded "id"
+        result_ids = [self.data[i][self.id_field] for i in db_indices]
+        return result_ids, scores
+    def search(self, query: str, top_docs: int) -> Tuple[List[int], np.ndarray]:
+        """
+        Embed `query` with the model and look it up in the FAISS index.
+        :param query: query text
+        :param top_docs: maximum number of nearest neighbours to return
+        :return: (positions in self.data of the hits, scores of the hits); both may be
+                 shorter than `top_docs` when the index holds fewer vectors
+        :raises ValueError: if no model was supplied to the constructor
+        """
+        if self.model is None:
+            raise ValueError("A SentenceModel is required to embed the query; pass model=... to JSONLIndexer.")
+        # Only the query is embedded here; record embeddings were precomputed
+        query_vector = np.asarray(self.model.encode(query), dtype=np.float32).reshape(1, -1)
+        scores, indexes = self.index.search(query_vector, top_docs)
+        scores = np.asarray(scores[0])
+        indexes = np.asarray(indexes[0])
+        # FAISS fills missing neighbours with label -1; used as a list index that
+        # would silently select the last record, so those slots are dropped
+        found = indexes >= 0
+        db_indices = [self.index_id_to_data[int(i)] for i in indexes[found]]
+        return db_indices, scores[found]
+# Example usage
+if __name__ == '__main__':
+    # vector_sz must equal the embedding dimension of the chosen model
+    indexer = JSONLIndexer(vector_sz=768, model=SentenceModel("BAAI/bge-base-en-v1.5"))
+    # Replace with the path of the real JSONL file
+    indexer.load_jsonl("tool-embedding.jsonl")
+    ids, scores = indexer.search_return_id("your search query here", top_docs=5)
+    print("检索结果：", list(zip(ids, scores)))

streamlit_jsonl_retriever.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os
+# NOTE(review): unconditionally sets CUDA_VISIBLE_DEVICES to '3', replacing any value
+# already present in the environment. Presumably placed before the imports below so it
+# takes effect before GPU-backed libraries load — confirm the target host has that device.
+os.environ['CUDA_VISIBLE_DEVICES'] = '3'
+import os
+import sys
+import faiss
+import numpy as np
+import streamlit as st
+from text2vec import SentenceModel
+# 请确保 JSONLIndexer 在 src 目录下或者已正确安装
+from src.jsonl_Indexer import JSONLIndexer
+# Command-line argument parsing
+def get_cli_args():
+    """
+    Collect ``key=value`` overrides passed to the script.
+    Every element of ``sys.argv`` after the script path is inspected. Under
+    ``streamlit run app.py -- key=value`` Streamlit hands the script
+    ``[script_path, *script_args]`` (the ``run`` sub-command is not part of argv),
+    so skipping two elements would drop the first override.
+    Arguments without ``=`` are ignored. Leading dashes on the key are removed,
+    so ``--vector_size=512`` and ``vector_size=512`` are equivalent.
+    :return: dict mapping each option name to its raw string value
+    """
+    args = {}
+    for arg in sys.argv[1:]:
+        key, sep, value = arg.partition('=')
+        if sep:
+            args[key.strip().lstrip('-')] = value.strip()
+    return args
+# Read the key=value overrides given on the command line
+cli_args = get_cli_args()
+# Defaults (suited to a JSONL file)
+DEFAULT_CONFIG = {
+    'model_path': 'BAAI/bge-base-en-v1.5',
+    'dataset_path': 'src/tool-embedding.jsonl',  # path of the JSONL file
+    'vector_size': 768,
+    'embedding_field': 'embedding',   # JSON key holding the embedding
+    'id_field': 'id'                  # JSON key reported for each hit
+}
+# Command-line values take precedence over the defaults;
+# vector_size arrives as a string from the command line, so it is coerced to int
+config = {**DEFAULT_CONFIG, **cli_args}
+config['vector_size'] = int(config['vector_size'])
+@st.cache_resource
+def get_model(model_path: str = config['model_path']):
+    """Build the SentenceModel for `model_path` (wrapped in st.cache_resource)."""
+    return SentenceModel(model_path)
+@st.cache_resource
+def create_retriever(vector_sz: int, dataset_path: str, embedding_field: str, id_field: str, _model):
+    """
+    Build a JSONLIndexer of dimension `vector_sz` around `_model` and load `dataset_path` into it.
+    :param vector_sz: embedding dimension handed to JSONLIndexer
+    :param dataset_path: JSONL file to load
+    :param embedding_field: JSON key holding the embedding
+    :param id_field: JSON key forwarded to load_jsonl as the id field
+    :param _model: model handed to JSONLIndexer for query embedding
+    :return: the loaded JSONLIndexer
+    """
+    indexer = JSONLIndexer(model=_model, vector_sz=vector_sz)
+    indexer.load_jsonl(dataset_path, id_field=id_field, embedding_field=embedding_field)
+    return indexer
+# Sidebar: optionally list the active configuration
+if st.sidebar.checkbox("Show Configuration"):
+    st.sidebar.write("Current Configuration:")
+    for key, value in config.items():
+        st.sidebar.write(f"{key}: {value}")
+# Create the model and the retriever from the merged configuration
+model = get_model(config['model_path'])
+retriever = create_retriever(
+    vector_sz=config['vector_size'],
+    dataset_path=config['dataset_path'],
+    embedding_field=config['embedding_field'],
+    id_field=config['id_field'],
+    _model=model,
+)
+# Streamlit page
+import html  # stdlib; escapes record ids before they are placed inside raw HTML
+st.title("JSONL Data Retrieval Visualization")
+st.write("该应用基于预计算的 JSONL 文件 embedding，输入查询后将检索相似记录。")
+# Query input
+query = st.text_input("Enter a search query:")
+top_k = st.slider("Select number of results to display", min_value=1, max_value=100, value=5)
+# Run the search and render the hits
+if st.button("Search") and query:
+    # search_return_id yields, per hit, the record's id-field value and its score
+    rec_ids, scores = retriever.search_return_id(query, top_k)
+    st.write("### Results:")
+    with st.expander("Retrieval Results (click to expand)"):
+        for j, (rec_id, score) in enumerate(zip(rec_ids, scores), start=1):
+            # The markup below is rendered with unsafe_allow_html=True, so the id
+            # (which comes from the data file) is escaped to keep characters such
+            # as < and & from being interpreted as HTML
+            safe_id = html.escape(str(rec_id))
+            st.markdown(
+                f"""
+                <div style="border:1px solid #ccc; padding:10px; border-radius:5px; margin-bottom:10px; background-color:#f9f9f9;">
+                    <p><b>Record {j} ID:</b> {safe_id}</p>
+                    <p><b>Score:</b> {score:.4f}</p>
+                </div>
+                """,
+                unsafe_allow_html=True
+            )