Update README.md
Browse files
README.md
CHANGED
@@ -18,6 +18,7 @@ A model for mapping abstract sentence descriptions to sentences that fit the des
|
|
18 |
from transformers import AutoTokenizer, AutoModel
|
19 |
import torch
|
20 |
from typing import List
|
|
|
21 |
|
22 |
def load_finetuned_model():
|
23 |
|
@@ -36,4 +37,56 @@ def encode_batch(model, tokenizer, sentences: List[str], device: str):
|
|
36 |
features = torch.sum(features[:,1:,:] * input_ids["attention_mask"][:,1:].unsqueeze(-1), dim=1) / torch.clamp(torch.sum(input_ids["attention_mask"][:,1:], dim=1, keepdims=True), min=1e-9)
|
37 |
return features
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
```
|
|
|
18 |
from transformers import AutoTokenizer, AutoModel
|
19 |
import torch
|
20 |
from typing import List
|
21 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
22 |
|
23 |
def load_finetuned_model():
|
24 |
|
|
|
37 |
features = torch.sum(features[:,1:,:] * input_ids["attention_mask"][:,1:].unsqueeze(-1), dim=1) / torch.clamp(torch.sum(input_ids["attention_mask"][:,1:], dim=1, keepdims=True), min=1e-9)
|
38 |
return features
|
39 |
|
40 |
+
```
|
41 |
+
|
42 |
+
Usage example:
|
43 |
+
|
44 |
+
```python
|
45 |
+
tokenizer, query_encoder, sentence_encoder = load_finetuned_model()
|
46 |
+
relevant_sentences = ["Fingersoft's parent company is the Finger Group.",
|
47 |
+
"WHIRC – a subsidiary company of Wright-Hennepin",
|
48 |
+
"CK Life Sciences International (Holdings) Inc. (), or CK Life Sciences, is a subsidiary of CK Hutchison Holdings",
|
49 |
+
"EM Microelectronic-Marin (subsidiary of The Swatch Group).",
|
50 |
+
"The company is currently a division of the corporate group Jam Industries.",
|
51 |
+
"Volt Technical Resources is a business unit of Volt Workforce Solutions, a subsidiary of Volt Information Sciences (currently trading over-the-counter as VISI.)."
|
52 |
+
]
|
53 |
+
|
54 |
+
irrelevant_sentences = ["The second company is deemed to be a subsidiary of the parent company.",
|
55 |
+
"The company has gone through more than one incarnation.",
|
56 |
+
"The company is owned by its employees.",
|
57 |
+
"Larger companies compete for market share by acquiring smaller companies that may own a particular market sector.",
|
58 |
+
"A parent company is a company that owns 51% or more voting stock in another firm (or subsidiary).",
|
59 |
+
"It is a holding company that provides services through its subsidiaries in the following areas: oil and gas, industrial and infrastructure, government and power."
|
60 |
+
]
|
61 |
+
|
62 |
+
all_sentences = relevant_sentences + irrelevant_sentences
|
63 |
+
query = "<query>: A company is a part of a larger company."
|
64 |
+
|
65 |
+
embeddings = encode_batch(sentence_encoder, tokenizer, all_sentences, "cpu").detach().cpu().numpy()
|
66 |
+
query_embedding = encode_batch(query_encoder, tokenizer, [query], "cpu").detach().cpu().numpy()
|
67 |
+
|
68 |
+
sims = cosine_similarity(query_embedding, embeddings)[0]
|
69 |
+
sentences_sims = list(zip(all_sentences, sims))
|
70 |
+
sentences_sims.sort(key=lambda x: x[1], reverse=True)
|
71 |
+
|
72 |
+
for s, sim in sentences_sims:
|
73 |
+
    print(s, sim)
|
74 |
+
|
75 |
+
```
|
76 |
+
|
77 |
+
Expected output:
|
78 |
+
|
79 |
+
```
|
80 |
+
WHIRC – a subsidiary company of Wright-Hennepin 0.9396286
|
81 |
+
EM Microelectronic-Marin (subsidiary of The Swatch Group). 0.93929046
|
82 |
+
Fingersoft's parent company is the Finger Group. 0.936247
|
83 |
+
CK Life Sciences International (Holdings) Inc. (), or CK Life Sciences, is a subsidiary of CK Hutchison Holdings 0.9350312
|
84 |
+
The company is currently a division of the corporate group Jam Industries. 0.9273489
|
85 |
+
Volt Technical Resources is a business unit of Volt Workforce Solutions, a subsidiary of Volt Information Sciences (currently trading over-the-counter as VISI.). 0.9005086
|
86 |
+
The second company is deemed to be a subsidiary of the parent company. 0.6723645
|
87 |
+
It is a holding company that provides services through its subsidiaries in the following areas: oil and gas, industrial and infrastructure, government and power. 0.60081375
|
88 |
+
A parent company is a company that owns 51% or more voting stock in another firm (or subsidiary). 0.59490484
|
89 |
+
The company is owned by its employees. 0.55286574
|
90 |
+
The company has gone through more than one incarnation. 0.38889483
|
91 |
+
Larger companies compete for market share by acquiring smaller companies that may own a particular market sector. 0.25472647
|
92 |
```
|