import gradio as gr
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForSequenceClassification
description_sentence = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotion in a sentence."
description2 = "<h3>Demo EmotioNL</h3>\nThis demo allows you to analyse the emotions in a dataset.\nThe data should be in TSV format with two named columns: the first column (id) should contain the sentence IDs, and the second column (text) should contain the actual texts. Optionally, there is a third column named 'date', which specifies the date associated with the text (e.g., tweet date). This column is necessary when the options 'emotion distribution over time' and 'peaks' are selected."
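# A minimal sketch of the expected input file (hypothetical contents; columns are tab-separated):
#
#   id	text	date
#   1	Wat een prachtige dag!	2021-01-01
#   2	Ik ben hier echt boos over.	2021-01-02
#
# The 'date' column is optional and only needed for the time-based options.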
inference_modelpath = "model/checkpoint-128"
""" | |
output_dir = "model" | |
model_config = { | |
"model_weights": "pdelobelle/robbert-v2-dutch-base", | |
"num_labels": 6, | |
"max_length": 128, | |
"device": "cpu" | |
} | |
## Tokenizer and model | |
tokenizer = AutoTokenizer.from_pretrained(model_config["model_weights"]) | |
model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath) | |
# Function for encoding (tokenizing) data | |
def encode_data(data): | |
text = data["text"] | |
label = data["label"] | |
encoded_input = tokenizer( | |
text, | |
add_special_tokens=True, | |
max_length= model_config["max_length"], | |
padding= "max_length", | |
return_overflowing_tokens=True, | |
truncation=True | |
) | |
encoded_input["labels"] = label | |
return encoded_input | |
# Test arguments for Trainer | |
test_args = TrainingArguments( | |
output_dir = output_dir, | |
do_train = False, | |
do_predict = True, | |
per_device_eval_batch_size = 64, | |
dataloader_drop_last = False | |
) | |
trainer = Trainer( | |
model = model, | |
args = test_args) | |
def inference_dataset(file_object): | |
#input_file = open(file_object.name, 'r') | |
input_file = file_object | |
data_paths = {"train": input_file, "inference": input_file} | |
dataset = load_dataset('csv', skiprows=1, data_files=data_paths, column_names = ['id', 'text', 'label'], delimiter='\t') | |
encoded_dataset = dataset.map(encode_data, batched=True) | |
encoded_dataset.set_format("torch") | |
encoded_dataset["inference"] = encoded_dataset["inference"].remove_columns("label") | |
# Run trainer in prediction mode | |
prediction_output = trainer.predict(encoded_dataset["inference"]) | |
predictions = prediction_output[0] | |
ids = dataset["inference"]["id"] | |
texts = dataset["inference"]["text"] | |
preds = np.argmax(predictions, axis=1) | |
preds = [model.config.id2label[pred] for pred in preds] | |
predictions_content = list(zip(ids, texts, preds)) | |
# write predictions to file | |
output = "output.txt" | |
f = open(output, 'w') | |
f.write("id\ttext\tprediction\n") | |
for line in predictions_content: | |
f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n') | |
f.close() | |
return output | |
""" | |
def inference_dataset(file_object, option_list):
    tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
    model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
    df = pd.read_csv(file_object, delimiter='\t', header=0, names=['id', 'text'])
    ids = df["id"].tolist()
    texts = df["text"].tolist()
    preds = []
    for text in tqdm(texts):  # progress bar over the individual texts
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():  # run model without gradient tracking
            logits = model(**inputs).logits
        predicted_class_id = logits.argmax().item()
        prediction = model.config.id2label[predicted_class_id]
        preds.append(prediction)
    predictions_content = list(zip(ids, texts, preds))
    # Write predictions to a tab-separated output file
    output = "output.txt"
    with open(output, 'w') as f:
        f.write("id\ttext\tprediction\n")
        for line in predictions_content:
            f.write(str(line[0]) + '\t' + str(line[1]) + '\t' + str(line[2]) + '\n')
    output1 = output
    output2 = output3 = output4 = output5 = "This option was not selected."
    if "emotion frequencies" in option_list:
        output2 = "This option was selected."
    if "emotion distribution over time" in option_list:
        output3 = "This option was selected."
    if "peaks" in option_list:
        output4 = "This option was selected."
    if "topics" in option_list:
        output5 = "This option was selected."
    return [output1, output2, output3, output4, output5]
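# Hypothetical usage sketch (assumes a file "input.tsv" in the format described above):
#   results = inference_dataset("input.tsv", ["emotion frequencies"])
#   results[0] is the path to the written prediction file ("output.txt");
#   the remaining elements are placeholder strings for the selected options.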
def what_happened(text, file_object, option_list):
    if file_object:
        output = "You uploaded a file."
        #if len(option_list) > 0:
            #output = output + "\nYou selected these options:\n- " + "\n- ".join(option_list)
    else:
        output = "Normally, this demo should analyse the emotions in this text:\n" + text
        if len(option_list) > 0:
            output = output + "\nYou can only select options when uploading a dataset."
    return output

def what_happened1(text):
    output = "Normally, this demo should analyse the emotions in this text:\n" + text
    return output
def what_happened2(file_object, option_list):
    #input_file = open(file_object.name, 'r')
    #lines = input_file.read()
    #input_file.close()
    #output_file = open('output.txt', 'w')
    #output_file.write(lines)
    #output_file.close()
    #output1 = 'output.txt'
    # inference_dataset returns the prediction-file path as its first element
    output1 = inference_dataset(file_object.name, option_list)[0]
    output2 = output3 = output4 = output5 = "This option was not selected."
    if "emotion frequencies" in option_list:
        output2 = "This option was selected."
    if "emotion distribution over time" in option_list:
        output3 = "This option was selected."
    if "peaks" in option_list:
        output4 = "This option was selected."
    if "topics" in option_list:
        output5 = "This option was selected."
    return [output1, output2, output3, output4, output5]
def inference_sentence(text):
    # Load the fine-tuned tokenizer and model on every call
    tokenizer = AutoTokenizer.from_pretrained(inference_modelpath)
    model = AutoModelForSequenceClassification.from_pretrained(inference_modelpath)
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # run model without gradient tracking
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    output = model.config.id2label[predicted_class_id]
    return output
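# Hypothetical usage sketch: inference_sentence("Wat een prachtige dag!")
# returns one of the model's six emotion labels as a string.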
iface0 = gr.Interface(
    fn=what_happened,
    inputs=[
        gr.Textbox(
            label="Enter a sentence",
            lines=1,
            value="Your name"),
        gr.File(
            label="Or upload a dataset"),
        gr.CheckboxGroup(
            ["emotion frequencies", "emotion distribution over time", "peaks", "topics"],
            label="Select options")
    ],
    outputs="text")
iface_sentence = gr.Interface(
    fn=inference_sentence,
    description=description_sentence,
    inputs=gr.Textbox(
        label="Enter a sentence",
        lines=1),
    outputs="text")

iface2 = gr.Interface(
    fn=inference_dataset,
    description=description2,
    inputs=[
        gr.File(
            label="Upload a dataset"),
        gr.CheckboxGroup(
            ["emotion frequencies", "emotion distribution over time", "peaks", "topics"],
            label="Select options")
    ],
    #outputs=["text", "text", "text", "text", "text"])
    outputs=[
        #gr.Textbox(label="Output file"),
        "file",
        gr.Textbox(label="Emotion frequencies"),
        gr.Textbox(label="Emotion distribution over time"),
        gr.Textbox(label="Peaks"),
        gr.Textbox(label="Topics")
    ])

iface = gr.TabbedInterface([iface_sentence, iface2], ["Sentence", "Dataset"])

iface.launch()