lunadebruyne committed
Commit bd33424 · 1 Parent(s): b760f7b

Update app.py

Files changed (1):
  1. app.py +15 -138
app.py CHANGED
@@ -6,84 +6,24 @@ import pandas as pd
 from tqdm import tqdm
 
 from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
-from transformers import TrainingArguments, Trainer
-
-from datasets import load_dataset
 
 
 description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotion in a sentence."
-description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
+description_dataset = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in tsv-format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
 
 inference_modelpath = "model/checkpoint-128"
 
-"""
-output_dir = "model"
-model_config = {
-    "model_weights": "pdelobelle/robbert-v2-dutch-base",
-    "num_labels": 6,
-    "max_length": 128,
-    "device": "cpu"
-}
-
-## Tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained(model_config["model_weights"])
-model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
-
-
-# Function for encoding (tokenizing) data
-def encode_data(data):
-    text = data["text"]
-    label = data["label"]
-    encoded_input = tokenizer(
-        text,
-        add_special_tokens=True,
-        max_length= model_config["max_length"],
-        padding= "max_length",
-        return_overflowing_tokens=True,
-        truncation=True
-    )
-    encoded_input["labels"] = label
-    return encoded_input
-
-
-# Test arguments for Trainer
-test_args = TrainingArguments(
-    output_dir = output_dir,
-    do_train = False,
-    do_predict = True,
-    per_device_eval_batch_size = 64,
-    dataloader_drop_last = False
-)
-trainer = Trainer(
-    model = model,
-    args = test_args)
-
-
-def inference_dataset(file_object):
-    input_file = file_object
-    data_paths = {"train": input_file, "inference": input_file}
-    dataset = load_dataset('csv', skiprows=1, data_files=data_paths, column_names = ['id', 'text', 'label'], delimiter='\t')
-    encoded_dataset = dataset.map(encode_data, batched=True)
-    encoded_dataset.set_format("torch")
-    encoded_dataset["inference"] = encoded_dataset["inference"].remove_columns("label")
-    # Run trainer in prediction mode
-    prediction_output = trainer.predict(encoded_dataset["inference"])
-    predictions = prediction_output[0]
-    ids = dataset["inference"]["id"]
-    texts = dataset["inference"]["text"]
-    preds = np.argmax(predictions, axis=1)
-    preds = [model.config.id2label[pred] for pred in preds]
-    predictions_content = list(zip(ids, texts, preds))
-    # write predictions to file
-    output = "output.txt"
-    f = open(output, 'w')
-    f.write("id\ttext\tprediction\n")
-    for line in predictions_content:
-        f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n')
-    f.close()
-    return output
-"""
-
+def inference_sentence(text):
+    tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
+    model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
+    for text in tqdm([text]):
+        inputs = tokenizer(text, return_tensors="pt")
+        with torch.no_grad(): # run model
+            logits = model(**inputs).logits
+        predicted_class_id = logits.argmax().item()
+        output = model.config.id2label[predicted_class_id]
+    return output
+
 def inference_dataset(file_object, option_list):
     tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
     model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
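
For reference, a dataset matching the format described in description_dataset would look something like this (an illustrative sample; the ids, Dutch texts and dates are invented, not part of the commit):

id	text	date
1	Wat een heerlijke dag!	2022-03-01
2	Dit maakt me echt boos.	2022-03-02
3	Ik mis je nu al.	2022-03-03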
@@ -119,68 +59,6 @@ def inference_dataset(file_object, option_list):
         output5 = "This option was selected."
     return [output1, output2, output3, output4, output5]
 
-def what_happened(text, file_object, option_list):
-    if file_object:
-        output = "You uploaded a file."
-        #if len(option_list) > 0:
-            #output = output + "\nYou selected these options:\n- " + "\n- ".join(option_list)
-    else:
-        output = "Normally, this demo should analyse the emotions in this text:\n" + text
-        if len(option_list) > 0:
-            output = output + "\nYou can only select options when uploading a dataset."
-    return output
-
-def what_happened1(text):
-    output = "Normally, this demo should analyse the emotions in this text:\n" + text
-    return output
-
-def what_happened2(file_object, option_list):
-    #input_file = open(file_object.name, 'r')
-    #lines = input_file.read()
-    #input_file.close()
-    #output_file = open('output.txt', 'w')
-    #output_file.write(lines)
-    #output_file.close()
-    #output1 = 'output.txt'
-    output1 = inference_dataset(file_object.name)
-    output2 = output3 = output4 = output5 = "This option was not selected."
-    if "emotion frequencies" in option_list:
-        output2 = "This option was selected."
-    if "emotion distribution over time" in option_list:
-        output3 = "This option was selected."
-    if "peaks" in option_list:
-        output4 = "This option was selected."
-    if "topics" in option_list:
-        output5 = "This option was selected."
-    return [output1, output2, output3, output4, output5]
-
-def inference_sentence(text):
-    tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
-    model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
-    for text in tqdm([text]):
-        inputs = tokenizer(text, return_tensors="pt")
-        with torch.no_grad(): # run model
-            logits = model(**inputs).logits
-        predicted_class_id = logits.argmax().item()
-        output = model.config.id2label[predicted_class_id]
-    return output
-
-
-iface0 = gr.Interface(
-    fn=what_happened,
-    inputs=[
-        gr.Textbox(
-            label= "Enter a sentence",
-            lines=1,
-            value="Your name"),
-        gr.File(
-            label="Or upload a dataset"),
-        gr.CheckboxGroup(
-            ["emotion frequencies", "emotion distribution over time", "peaks", "topics"],
-            label = "Select options")
-    ],
-    outputs="text")
-
 iface_sentence = gr.Interface(
     fn=inference_sentence,
     description = description_sentence,
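
The four option strings that drive the dataset tab appear both in the removed prototype above and in the CheckboxGroup passed to inference_dataset below. A minimal sketch of the selection logic they feed (illustrative only; selected stands in for a hypothetical user choice):

options = ["emotion frequencies", "emotion distribution over time", "peaks", "topics"]
selected = ["emotion frequencies", "peaks"]  # hypothetical CheckboxGroup selection
# Mirrors the if-chains in inference_dataset and the removed what_happened2:
flags = ["This option was selected." if option in selected
         else "This option was not selected." for option in options]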
@@ -189,10 +67,9 @@ iface_sentence = gr.Interface(
         lines=1),
     outputs="text")
 
-iface2 = gr.Interface(
-    #fn=what_happened2,
+iface_dataset = gr.Interface(
     fn = inference_dataset,
-    description = description2,
+    description = description_dataset,
     inputs=[
         gr.File(
             label="Upload a dataset"),
@@ -210,6 +87,6 @@ iface2 = gr.Interface(
         gr.Textbox(label="Topics")
     ])
 
-iface = gr.TabbedInterface([iface_sentence, iface2], ["Sentence", "Dataset"])
+iface = gr.TabbedInterface([iface_sentence, iface_dataset], ["Sentence", "Dataset"])
 
 iface.queue().launch()
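
With the Trainer-based path gone, single-sentence prediction now runs the model directly. A minimal sketch of the same steps as the new inference_sentence helper, usable outside Gradio (assumes the checkpoint at model/checkpoint-128 exists locally; the example sentence is invented):

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

inference_modelpath = "model/checkpoint-128"
tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)

inputs = tokenizer("Wat een heerlijke dag!", return_tensors="pt")
with torch.no_grad():  # inference only, no gradient tracking
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax().item()])  # one of the model's emotion labels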