ravfogs committed on
Commit
52e8fb9
·
1 Parent(s): 81883d8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +35 -19
README.md CHANGED
@@ -37,12 +37,13 @@ def encode_batch(model, tokenizer, sentences: List[str], device: str):
37
  features = torch.sum(features[:,1:,:] * input_ids["attention_mask"][:,1:].unsqueeze(-1), dim=1) / torch.clamp(torch.sum(input_ids["attention_mask"][:,1:], dim=1, keepdims=True), min=1e-9)
38
  return features
39
 
 
40
 
 
41
 
42
- if __name__ == "__main__":
43
-
44
- tokenizer, query_encoder, sentence_encoder = load_finetuned_model()
45
- relevant_sentences = ["Fingersoft's parent company is the Finger Group.",
46
  "WHIRC – a subsidiary company of Wright-Hennepin",
47
  "CK Life Sciences International (Holdings) Inc. (), or CK Life Sciences, is a subsidiary of CK Hutchison Holdings",
48
  "EM Microelectronic-Marin (subsidiary of The Swatch Group).",
@@ -50,7 +51,7 @@ if __name__ == "__main__":
50
  "Volt Technical Resources is a business unit of Volt Workforce Solutions, a subsidiary of Volt Information Sciences (currently trading over-the-counter as VISI.)."
51
  ]
52
 
53
- irrelevant_sentences = ["The second company is deemed to be a subsidiary of the parent company.",
54
  "The company has gone through more than one incarnation.",
55
  "The company is owned by its employees.",
56
  "Larger companies compete for market share by acquiring smaller companies that may own a particular market sector.",
@@ -58,19 +59,34 @@ if __name__ == "__main__":
58
  "It is a holding company that provides services through its subsidiaries in the following areas: oil and gas, industrial and infrastructure, government and power."
59
  ]
60
 
61
- all_sentences = relevant_sentences + irrelevant_sentences
62
- query = "<query>: A company is a part of a larger company."
63
 
64
- embeddings = encode_batch(sentence_encoder, tokenizer, all_sentences, "cpu").detach().cpu().numpy()
65
- query_embedding = encode_batch(query_encoder, tokenizer, [query], "cpu").detach().cpu().numpy()
66
-
67
- sims = cosine_similarity(query_embedding, embeddings)[0]
68
- sentences_sims = list(zip(all_sentences, sims))
69
- sentences_sims.sort(key=lambda x: x[1], reverse=True)
70
-
71
- for s, sim in sentences_sims:
72
- print(s, sim)
73
-
74
-
75
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  ```
 
37
  features = torch.sum(features[:,1:,:] * input_ids["attention_mask"][:,1:].unsqueeze(-1), dim=1) / torch.clamp(torch.sum(input_ids["attention_mask"][:,1:], dim=1, keepdims=True), min=1e-9)
38
  return features
39
 
40
+ ```
41
 
42
+ Usage example:
43
 
44
+ ```python
45
+ tokenizer, query_encoder, sentence_encoder = load_finetuned_model()
46
+ relevant_sentences = ["Fingersoft's parent company is the Finger Group.",
 
47
  "WHIRC – a subsidiary company of Wright-Hennepin",
48
  "CK Life Sciences International (Holdings) Inc. (), or CK Life Sciences, is a subsidiary of CK Hutchison Holdings",
49
  "EM Microelectronic-Marin (subsidiary of The Swatch Group).",
 
51
  "Volt Technical Resources is a business unit of Volt Workforce Solutions, a subsidiary of Volt Information Sciences (currently trading over-the-counter as VISI.)."
52
  ]
53
 
54
+ irrelevant_sentences = ["The second company is deemed to be a subsidiary of the parent company.",
55
  "The company has gone through more than one incarnation.",
56
  "The company is owned by its employees.",
57
  "Larger companies compete for market share by acquiring smaller companies that may own a particular market sector.",
 
59
  "It is a holding company that provides services through its subsidiaries in the following areas: oil and gas, industrial and infrastructure, government and power."
60
  ]
61
 
62
+ all_sentences = relevant_sentences + irrelevant_sentences
63
+ query = "<query>: A company is a part of a larger company."
64
 
65
+ embeddings = encode_batch(sentence_encoder, tokenizer, all_sentences, "cpu").detach().cpu().numpy()
66
+ query_embedding = encode_batch(query_encoder, tokenizer, [query], "cpu").detach().cpu().numpy()
67
+
68
+ sims = cosine_similarity(query_embedding, embeddings)[0]
69
+ sentences_sims = list(zip(all_sentences, sims))
70
+ sentences_sims.sort(key=lambda x: x[1], reverse=True)
71
+
72
+ for s, sim in sentences_sims:
73
+ print(s, sim)
74
+
75
+ ```
76
+
77
+ Expected output:
78
+
79
+ ```
80
+ WHIRC – a subsidiary company of Wright-Hennepin 0.9396286
81
+ EM Microelectronic-Marin (subsidiary of The Swatch Group). 0.93929046
82
+ Fingersoft's parent company is the Finger Group. 0.936247
83
+ CK Life Sciences International (Holdings) Inc. (), or CK Life Sciences, is a subsidiary of CK Hutchison Holdings 0.9350312
84
+ The company is currently a division of the corporate group Jam Industries. 0.9273489
85
+ Volt Technical Resources is a business unit of Volt Workforce Solutions, a subsidiary of Volt Information Sciences (currently trading over-the-counter as VISI.). 0.9005086
86
+ The second company is deemed to be a subsidiary of the parent company. 0.6723645
87
+ It is a holding company that provides services through its subsidiaries in the following areas: oil and gas, industrial and infrastructure, government and power. 0.60081375
88
+ A parent company is a company that owns 51% or more voting stock in another firm (or subsidiary). 0.59490484
89
+ The company is owned by its employees. 0.55286574
90
+ The company has gone through more than one incarnation. 0.38889483
91
+ Larger companies compete for market share by acquiring smaller companies that may own a particular market sector. 0.25472647
92
  ```