Onnx Support (#6)
Browse files- Upload 2 files (68e15535054770a466a956bdfff94cd70e53d393)
- Update README.md (b4595376fce1812665312d0557400026cdeb7739)
- Delete onnx/model_quantized.onnx (9ff2b78f84eedf2d680c1fca6f16544eab75d0b1)
Co-authored-by: Michael <[email protected]>
- README.md +47 -0
- onnx/model.onnx +3 -0
README.md
CHANGED
@@ -2866,6 +2866,53 @@ sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, di
|
|
2866 |
print("Sentence embeddings:", sentence_embeddings)
|
2867 |
```
|
2868 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2869 |
### Usage for Reranker
|
2870 |
|
2871 |
Unlike the embedding model, the reranker takes a question and a document as input and directly outputs a similarity score instead of an embedding.
|
|
|
2866 |
print("Sentence embeddings:", sentence_embeddings)
|
2867 |
```
|
2868 |
|
2869 |
+
|
2870 |
+
#### Usage of the ONNX files
|
2871 |
+
|
2872 |
+
```python
|
2873 |
+
from optimum.onnxruntime import ORTModelForFeatureExtraction # type: ignore
|
2874 |
+
|
2875 |
+
import torch
|
2876 |
+
from transformers import AutoModel, AutoTokenizer
|
2877 |
+
|
2878 |
+
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en-v1.5')
|
2879 |
+
model = AutoModel.from_pretrained('BAAI/bge-large-en-v1.5', revision="refs/pr/13")
|
2880 |
+
model_ort = ORTModelForFeatureExtraction.from_pretrained('BAAI/bge-large-en-v1.5', revision="refs/pr/13",file_name="onnx/model.onnx")
|
2881 |
+
|
2882 |
+
# Sentences we want sentence embeddings for
|
2883 |
+
sentences = ["样例数据-1", "样例数据-2"]
|
2884 |
+
|
2885 |
+
# Tokenize sentences
|
2886 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
2887 |
+
# For s2p (short query to long passage) retrieval tasks, add an instruction to each query (do not add the instruction to passages)
|
2888 |
+
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
|
2889 |
+
|
2890 |
+
model_output_ort = model_ort(**encoded_input)
|
2891 |
+
# Compute token embeddings
|
2892 |
+
with torch.no_grad():
|
2893 |
+
model_output = model(**encoded_input)
|
2894 |
+
|
2895 |
+
# model_output and model_output_ort should match (up to small floating-point differences)
|
2896 |
+
|
2897 |
+
```
|
2898 |
+
|
2899 |
+
#### Usage via infinity
|
2900 |
+
It's also possible to deploy the ONNX files with the [infinity_emb](https://github.com/michaelfeil/infinity) pip package.
|
2901 |
+
```python
|
2902 |
+
import asyncio
|
2903 |
+
from infinity_emb import AsyncEmbeddingEngine, EngineArgs
|
2904 |
+
|
2905 |
+
sentences = ["Embed this is sentence via Infinity.", "Paris is in France."]
|
2906 |
+
engine = AsyncEmbeddingEngine.from_args(
|
2907 |
+
EngineArgs(model_name_or_path = "BAAI/bge-large-en-v1.5", device="cpu", engine="optimum" # or engine="torch"
|
2908 |
+
))
|
2909 |
+
|
2910 |
+
async def main():
|
2911 |
+
async with engine:
|
2912 |
+
embeddings, usage = await engine.embed(sentences=sentences)
|
2913 |
+
asyncio.run(main())
|
2914 |
+
```
|
2915 |
+
|
2916 |
### Usage for Reranker
|
2917 |
|
2918 |
Unlike the embedding model, the reranker takes a question and a document as input and directly outputs a similarity score instead of an embedding.
|
onnx/model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9bc579acdba21c253c62a9bf866891355a63ffa3442b52c8a37d75b2ccb91848
|
3 |
+
size 435811539
|