Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files
- Model_define.py +35 -0
- Vocabulary.py +71 -0
- app.py +50 -0
- lstm_model.bin +3 -0
- lstm_model_states.pt +3 -0
- requirements.txt +3 -0
Model_define.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
class Sentiment_LSTM(nn.Module):
    """Binary sentiment classifier: embedding -> LSTM -> hidden linear head -> sigmoid.

    Args:
        embedding_matrix: 2-D float tensor of pretrained embeddings; its last
            dimension is used as the LSTM input size. The embeddings are
            fine-tuned (``freeze=False``).
        lstm_hidden_size: Hidden size of every LSTM layer.
        lstm_layers: Number of stacked LSTM layers.
        linear_hidden_size: Width of the hidden linear layer that sits between
            the LSTM and the single-unit classifier.
    """

    def __init__(self, embedding_matrix:torch.Tensor, lstm_hidden_size, lstm_layers, linear_hidden_size):
        super(Sentiment_LSTM, self).__init__()

        self.input_size = embedding_matrix.size(-1)
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_layers = lstm_layers
        self.linear_hidden_size = linear_hidden_size

        self.embedding_matrix = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.lstm_hidden_size,
                            num_layers=self.lstm_layers, batch_first=True)

        # Only the last hidden state of the last layer is used for the prediction.
        self.fc = nn.Sequential(nn.Linear(self.lstm_hidden_size, self.linear_hidden_size),
                                nn.ReLU(), nn.Dropout(0.3))

        self.classifier = nn.Linear(self.linear_hidden_size, 1)

    def forward(self, x, device=None):
        """Return the sigmoid score for each sequence in the batch.

        Args:
            x: Integer tensor of token indices, batch first (``x.size(0)`` is
                the batch size).
            device: Device on which the initial LSTM states are created.
                Defaults to the device of ``x``.

        Returns:
            Tensor with one value in [0, 1] per sequence (last dimension 1).
        """
        if device is None:
            device = x.device

        state_shape = (self.lstm_layers, x.size(0), self.lstm_hidden_size)
        h_0 = torch.zeros(state_shape, device=device)
        c_0 = torch.zeros(state_shape, device=device)

        embds = self.embedding_matrix(x)
        _, (h_final, _) = self.lstm(embds, (h_0, c_0))
        last_layer_state = h_final[-1]

        # The classifier expects `linear_hidden_size` features, i.e. the output
        # of `self.fc`, not the raw LSTM state (which only happens to fit when
        # lstm_hidden_size == linear_hidden_size).
        # NOTE(review): checkpoints trained while `self.fc` was bypassed should
        # be re-validated or retrained.
        fc_out = self.fc(last_layer_state)
        logits = self.classifier(fc_out)
        return torch.sigmoid(logits)
|
Vocabulary.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy
|
2 |
+
from collections import Counter
|
3 |
+
|
4 |
+
class spacy_tokenizer():
    """Callable that splits text into lower-cased spaCy tokens."""

    def __init__(self):
        try:
            self.spacy_eng = spacy.load("en_core_web_sm")
        except OSError:
            # The "en_core_web_sm" package is not installed. Only the
            # tokenizer is used below, so a blank English pipeline is enough.
            self.spacy_eng = spacy.blank("en")

    def __call__(self, text):
        """Return the lower-cased token strings of ``text``."""
        return [tok.text.lower() for tok in self.spacy_eng.tokenizer(text)]
|
10 |
+
|
11 |
+
|
12 |
+
class Vocabulary:
    """Token-to-index mapping built from a corpus.

    Index 0 is always ``<PAD>``; ``<UNK>``, ``<SOS>`` and ``<EOS>`` are
    registered right after it when the corresponding flag is set.

    Args:
        callable_tokenizer: Callable mapping a string to a list of tokens.
            A ``spacy_tokenizer`` is created when omitted.
        max_freq: Minimum number of occurrences a token needs in the corpus
            to be added by ``build_vocabulary`` (despite the name, this is a
            lower bound).
        unk: Register ``<UNK>`` and map out-of-vocabulary tokens to it.
        sos: Register ``<SOS>``.
        eos: Register ``<EOS>``.
    """

    def __init__(self, callable_tokenizer=None, max_freq=3, unk=True, sos=False, eos=False):
        self.sos = sos
        self.eos = eos
        self.unk = unk
        self.max_freq = max_freq
        self.callable_tokenizer = callable_tokenizer or spacy_tokenizer()

        self.stoi = {"<PAD>": 0}
        for enabled, token in ((self.unk, '<UNK>'), (self.sos, '<SOS>'), (self.eos, '<EOS>')):
            if enabled:
                self.stoi[token] = len(self.stoi)

    def __len__(self):
        return len(self.stoi)

    def get_vocabulary(self):
        """Return the underlying token -> index dictionary."""
        return self.stoi

    def add_token(self, token_name: str):
        """Register ``token_name`` with the next free index if it is new."""
        if token_name not in self.stoi:
            self.stoi[token_name] = len(self.stoi)

    def build_vocabulary(self, sentences_list):
        """Add every token that occurs at least ``max_freq`` times.

        Args:
            sentences_list: Sequence of sentences, either strings or
                pre-split token lists (ex: [['eating', 'apples'],
                ['eating', 'oranges']]), which are joined with spaces
                before tokenizing. An empty sequence is a no-op.
        """
        if len(sentences_list) == 0:
            return

        if not isinstance(sentences_list[0], str):
            sentences_list = [' '.join(sen) for sen in sentences_list]

        word_counts = Counter()
        for sentence in sentences_list:
            word_counts.update(self.callable_tokenizer(sentence))

        # Keep words with max_freq or more occurrences.
        for word, count in word_counts.items():
            if count >= self.max_freq:
                self.add_token(word)

    def get_numerical_tokens(self, text: str):
        """Tokenize ``text`` and map each token to its index.

        Unknown tokens map to the ``<UNK>`` index, or to ``None`` when
        ``<UNK>`` is not registered.
        """
        # NOTE: <SOS>/<EOS> may be registered in the vocabulary, but they are
        # not inserted into the sequence here.
        unk_id = self.stoi.get('<UNK>', None)
        return [self.stoi.get(word, unk_id) for word in self.callable_tokenizer(text)]

    def __call__(self, text: str):
        return self.get_numerical_tokens(text)

    def tokens_to_text(self, tokens_list):
        """Map indices back to tokens and join them with single spaces.

        Raises:
            ValueError: If an index is not part of the vocabulary.
        """
        # Invert the mapping once instead of scanning the values per token.
        itos = {index: word for word, index in self.stoi.items()}
        words = []
        for token in tokens_list:
            try:
                # int() also accepts 0-dim tensors / numpy integers.
                words.append(itos[int(token)])
            except (KeyError, TypeError, ValueError) as err:
                raise ValueError(f"{token!r} is not in the vocabulary") from err
        return ' '.join(words)
|
app.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import gradio as gr
|
3 |
+
from Vocabulary import spacy_tokenizer
|
4 |
+
from Model_define import Sentiment_LSTM
|
5 |
+
|
6 |
+
|
7 |
+
# PyTorch names the GPU backend "cuda"; "gpu" is not a valid device string.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects.
# Only load checkpoint files that come from a trusted source.
model = torch.load("lstm_model.bin", map_location=device, weights_only=False)
# Inference only: switch off dropout.
model.eval()
model_state = torch.load("lstm_model_states.pt", map_location=device, weights_only=False)

# Token -> index mapping stored alongside the checkpoint.
vocab = model_state['vocabulary']
tokenizer = spacy_tokenizer()

# Class name -> index, and the inverse used to label predictions.
cls_to_idx = model_state['class_dict']
idx_to_cls = {value: key for key, value in cls_to_idx.items()}
|
18 |
+
|
19 |
+
def pre_processor(text):
    """Convert raw text into a 1-D ``torch.long`` tensor of vocabulary indices.

    Tokens missing from ``vocab`` map to the ``<UNK>`` index; when the
    vocabulary has no ``<UNK>`` entry they are dropped. Empty text yields an
    empty long tensor.
    """
    tokens = tokenizer(text.lower())
    unk_id = vocab.get('<UNK>', None)
    ids = [vocab.get(word, unk_id) for word in tokens]
    if unk_id is None:
        # Without an <UNK> entry unknown words would become None, which
        # cannot be stored in a tensor.
        ids = [idx for idx in ids if idx is not None]
    # An explicit dtype keeps the empty case an integer tensor as well.
    return torch.tensor(ids, dtype=torch.long)
|
23 |
+
|
24 |
+
def post_processor(raw_output):
    """Turn the model's single sigmoid score into ``(label, confidence)``.

    Args:
        raw_output: Single-element tensor holding the score for class 1.

    Returns:
        The capitalized class name and the probability of that predicted
        class, rounded to two decimals.
    """
    probability = raw_output.item()
    label = int(probability >= 0.5)
    # The score is P(class 1); the confidence in a class-0 prediction is its
    # complement, not the raw score.
    confidence = probability if label == 1 else 1.0 - probability
    return idx_to_cls[label].capitalize(), round(confidence, 2)
|
27 |
+
|
28 |
+
|
29 |
+
@torch.no_grad()
def lunch(raw_input):
    """Classify one review and return ``(class name, confidence)``.

    Blank input, or input that produces no tokens, returns two empty strings
    instead of running the model on an empty sequence.
    """
    if not raw_input or not raw_input.strip():
        return "", ""

    token_ids = pre_processor(raw_input)
    if token_ids.numel() == 0:
        return "", ""

    # Add the batch dimension and place the input on the model's device.
    batch = token_ids.unsqueeze(0).to(device)
    output = model(batch, device)
    return post_processor(output)
|
34 |
+
|
35 |
+
# Styling for the submit button (matched through elem_classes below).
custom_css ='.gr-button {background-color: #bf4b04; color: white;}'

# Two-column layout: review input and examples on the left, prediction on the right.
with gr.Blocks(css=custom_css) as demo:
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label='Input a Review or click an Example')
            gr.Examples(["It is no wonder that the film has such a high rating, it is quite literally breathtaking. What can I say that hasn't said before? Not much, it's the story, the acting, the premise, but most of all, this movie is about how it makes you feel. Sometimes you watch a film, and can't remember it days later, this film loves with you, once you've seen it, you don't forget.",
                         "This film is nothing but one cliche after another. Having seen many of the 100's of prison films made from the early 30's to the 50's, I was able to pull almost every minute of Shawcrap from one of those films."],
                        inputs=input_text, label="Examples: ")
        with gr.Column():
            class_name = gr.Textbox(label="This review is")
            confidence = gr.Textbox(label='Confidence')
            start_btn = gr.Button(value='Submit', elem_classes=["gr-button"])
    start_btn.click(fn=lunch, inputs=input_text, outputs=[class_name, confidence])

# Start the server only when run as a script, so importing this module
# (e.g. from tests) does not block on a running web server.
if __name__ == "__main__":
    demo.launch()
|
lstm_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1dceea1f78112220fedc531cae7815a0808ef2582d53de8149631590db30227
|
3 |
+
size 22590178
|
lstm_model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b4ef1293501e36bb34ba9d8d12cedbdfd2555e3e070baef5cdba91bce8739b6
|
3 |
+
size 1071816
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
spacy
|
3 |
+
gradio
|