import gradio as gr
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotion in a sentence."
description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in TSV format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
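# A minimal sketch of the expected input file (hypothetical contents; columns are tab-separated):
#
#   id	text	date
#   1	Wat een prachtige dag!	2021-01-01
#   2	Ik ben hier echt boos over.	2021-01-02
#
# The 'date' column is optional and only needed for the time-based options.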
inference_modelpath = "model/checkpoint-128"
""" | |
output_dir = "model" | |
model_config = { | |
"model_weights": "pdelobelle/robbert-v2-dutch-base", | |
"num_labels": 6, | |
"max_length": 128, | |
"device": "cpu" | |
} | |
## Tokenizer and model | |
tokenizer = AutoTokenizer.from_pretrained(model_config["model_weights"]) | |
model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath) | |
# Function for encoding (tokenizing) data | |
def encode_data(data): | |
text = data["text"] | |
label = data["label"] | |
encoded_input = tokenizer( | |
text, | |
add_special_tokens=True, | |
max_length= model_config["max_length"], | |
padding= "max_length", | |
return_overflowing_tokens=True, | |
truncation=True | |
) | |
encoded_input["labels"] = label | |
return encoded_input | |
# Test arguments for Trainer | |
test_args = TrainingArguments( | |
output_dir = output_dir, | |
do_train = False, | |
do_predict = True, | |
per_device_eval_batch_size = 64, | |
dataloader_drop_last = False | |
) | |
trainer = Trainer( | |
model = model, | |
args = test_args) | |
def inference_dataset(file_object): | |
#input_file = open(file_object.name, 'r') | |
input_file = file_object | |
data_paths = {"train": input_file, "inference": input_file} | |
dataset = load_dataset('csv', skiprows=1, data_files=data_paths, column_names = ['id', 'text', 'label'], delimiter='\t') | |
encoded_dataset = dataset.map(encode_data, batched=True) | |
encoded_dataset.set_format("torch") | |
encoded_dataset["inference"] = encoded_dataset["inference"].remove_columns("label") | |
# Run trainer in prediction mode | |
prediction_output = trainer.predict(encoded_dataset["inference"]) | |
predictions = prediction_output[0] | |
ids = dataset["inference"]["id"] | |
texts = dataset["inference"]["text"] | |
preds = np.argmax(predictions, axis=1) | |
preds = [model.config.id2label[pred] for pred in preds] | |
predictions_content = list(zip(ids, texts, preds)) | |
# write predictions to file | |
output = "output.txt" | |
f = open(output, 'w') | |
f.write("id\ttext\tprediction\n") | |
for line in predictions_content: | |
f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n') | |
f.close() | |
return output | |
""" | |
def inference_dataset(file_object, option_list):
    tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
    model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
    df = pd.read_csv(file_object, delimiter='\t', header=0, names=['id', 'text'])
    ids = df["id"].tolist()
    texts = df["text"].tolist()
    preds = []
    for text in tqdm(texts):  # progress bar over the individual texts
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():  # run model without gradient tracking
            logits = model(**inputs).logits
        predicted_class_id = logits.argmax().item()
        prediction = model.config.id2label[predicted_class_id]
        preds.append(prediction)
    predictions_content = list(zip(ids, texts, preds))
    # Write predictions to a tab-separated output file
    output = "output.txt"
    with open(output, 'w') as f:
        f.write("id\ttext\tprediction\n")
        for line in predictions_content:
            f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n')
    output1 = output
    output2 = output3 = output4 = output5 = "This option was not selected."
    if "emotion frequencies" in option_list:
        output2 = "This option was selected."
    if "emotion distribution over time" in option_list:
        output3 = "This option was selected."
    if "peaks" in option_list:
        output4 = "This option was selected."
    if "topics" in option_list:
        output5 = "This option was selected."
    return [output1, output2, output3, output4, output5]
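# Hypothetical usage sketch (assumes a file "input.tsv" in the format described above):
#   results = inference_dataset("input.tsv", ["emotion frequencies"])
#   results[0] is the path to the written prediction file ("output.txt");
#   the remaining elements are placeholder strings for the selected options.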
def what_happened(text, file_object, option_list):
    if file_object:
        output = "You uploaded a file."
        #if len(option_list) > 0:
            #output = output + "\nYou selected these options:\n- " + "\n- ".join(option_list)
    else:
        output = "Normally, this demo should analyse the emotions in this text:\n" + text
        if len(option_list) > 0:
            output = output + "\nYou can only select options when uploading a dataset."
    return output

def what_happened1(text):
    output = "Normally, this demo should analyse the emotions in this text:\n" + text
    return output
def what_happened2(file_object, option_list):
    #input_file = open(file_object.name, 'r')
    #lines = input_file.read()
    #input_file.close()
    #output_file = open('output.txt', 'w')
    #output_file.write(lines)
    #output_file.close()
    #output1 = 'output.txt'
    # inference_dataset returns the prediction-file path as its first element
    output1 = inference_dataset(file_object.name, option_list)[0]
    output2 = output3 = output4 = output5 = "This option was not selected."
    if "emotion frequencies" in option_list:
        output2 = "This option was selected."
    if "emotion distribution over time" in option_list:
        output3 = "This option was selected."
    if "peaks" in option_list:
        output4 = "This option was selected."
    if "topics" in option_list:
        output5 = "This option was selected."
    return [output1, output2, output3, output4, output5]
def inference_sentence(text):
    # Load the fine-tuned tokenizer and model on every call
    tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
    model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # run model without gradient tracking
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    output = model.config.id2label[predicted_class_id]
    return output
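# Hypothetical usage sketch: inference_sentence("Wat een prachtige dag!")
# returns one of the model's six emotion labels as a string.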
iface0 = gr.Interface(
    fn=what_happened,
    inputs=[
        gr.Textbox(
            label="Enter a sentence",
            lines=1,
            value="Your name"),
        gr.File(
            label="Or upload a dataset"),
        gr.CheckboxGroup(
            ["emotion frequencies", "emotion distribution over time", "peaks", "topics"],
            label="Select options")
    ],
    outputs="text")
iface_sentence = gr.Interface(
    fn=inference_sentence,
    description=description_sentence,
    inputs=gr.Textbox(
        label="Enter a sentence",
        lines=1),
    outputs="text")

iface2 = gr.Interface(
    fn=inference_dataset,
    description=description2,
    inputs=[
        gr.File(
            label="Upload a dataset"),
        gr.CheckboxGroup(
            ["emotion frequencies", "emotion distribution over time", "peaks", "topics"],
            label="Select options")
    ],
    #outputs=["text", "text", "text", "text", "text"])
    outputs=[
        #gr.Textbox(label="Output file"),
        "file",
        gr.Textbox(label="Emotion frequencies"),
        gr.Textbox(label="Emotion distribution over time"),
        gr.Textbox(label="Peaks"),
        gr.Textbox(label="Topics")
    ])

iface = gr.TabbedInterface([iface_sentence, iface2], ["Sentence", "Dataset"])

iface.launch()