Spaces:
Sleeping
Sleeping
import numpy as np | |
import torch | |
from transformers import AutoTokenizer | |
from transformers import AutoModelForSequenceClassification | |
class MovieClassifier:
    """Predict a coarse rating label for movies from title, summary and genres.

    Wraps a sequence-classification model and its tokenizer, both loaded from
    the same ``model_path``. The public entry point is :meth:`predict`; the
    double-underscore methods are the individual pipeline stages.
    """

    # Rating labels, indexed by the class id predicted by the model.
    ratings = ['bad', 'average', 'good']

    def __init__(self, model_path):
        """Load the model and the tokenizer from ``model_path``."""
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def __preProcessInput(self, titles, summaries, genres):
        """Normalize raw movie fields into one dict per movie.

        Args:
            titles: list of strings ``[title_1, title_2, ...]``.
            summaries: list of strings ``[summary_1, summary_2, ...]``.
            genres: list of genre lists ``[[genres_1], [genres_2], ...]``
                where each ``genres_i`` is a list of strings.

        Returns:
            A list of dicts with the keys ``'title'``, ``'summary'`` and
            ``'genres'``, in the same order as ``titles``.

        Raises:
            IndexError: if ``summaries`` or ``genres`` is shorter than
                ``titles``.
        """
        inputs = []
        # ``titles`` drives the iteration; the other two lists are indexed in
        # lockstep so a too-short list still fails loudly with IndexError.
        for i, raw_title in enumerate(titles):
            # Collapse every run of whitespace (including newlines) to a
            # single space and strip the ends.
            title = ' '.join(raw_title.split())
            summary = ' '.join(summaries[i].split())

            movie_genres = genres[i]
            if not movie_genres:
                # Any "no genres" value (empty list/tuple, None, '') maps to
                # the same placeholder instead of crashing or yielding ''.
                genre_text = 'NonGiven'
            else:
                # Genres are encoded as one string separated by '|'.
                genre_text = '|'.join(movie_genres)

            inputs.append({'title': title, 'summary': summary, 'genres': genre_text})
        return inputs

    def __tokenizeInputs(self, inputs):
        """Tokenize preprocessed movies as sequence pairs.

        The first sequence is ``title + '<SEP>' + summary`` and the second is
        the genre string. The tokenizer is called with
        ``padding='max_length'``, ``truncation=True`` and
        ``return_tensors='pt'``.

        Args:
            inputs: list of dicts as produced by ``__preProcessInput``.

        Returns:
            Whatever ``self.tokenizer`` returns for the batch.
        """
        title_and_summary = [movie['title'] + '<SEP>' + movie['summary'] for movie in inputs]
        genres_list = [movie['genres'] for movie in inputs]
        return self.tokenizer(
            title_and_summary,
            genres_list,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __modelPredictions(self, model, tokenized_input):
        """Run ``model`` on the tokenized batch and pick the top class per row.

        Args:
            model: callable accepting the tokenized batch as keyword
                arguments and returning an object with a ``logits`` tensor.
            tokenized_input: mapping of keyword arguments for ``model``.

        Returns:
            A list of Python ``int`` class ids, one per input row.
        """
        # Inference only: gradients are not needed.
        with torch.no_grad():
            logits = model(**tokenized_input).logits
        # Use torch's own argmax rather than np.argmax on a tensor, then
        # convert to plain ints so the result does not depend on the tensor
        # device or on NumPy's implicit tensor conversion.
        return torch.argmax(logits, dim=-1).tolist()

    def __predMovieRating(self, predictions):
        """Pair each predicted class id with its rating label.

        Args:
            predictions: iterable of class ids (anything ``int()`` accepts).

        Returns:
            A list of ``(class_id, rating_label)`` tuples.
        """
        predicted_ratings = []
        for pred in predictions:
            class_id = int(pred)
            predicted_ratings.append((class_id, self.ratings[class_id]))
        return predicted_ratings

    def predict(self, title, summary, genre):
        """Predict ratings for a batch of movies.

        Args:
            title: list of movie titles.
            summary: list of movie summaries, aligned with ``title``.
            genre: list of genre lists, aligned with ``title``.

        Returns:
            A list of ``(class_id, rating_label)`` tuples, one per movie, in
            input order. An empty batch returns an empty list.
        """
        movies = self.__preProcessInput(title, summary, genre)
        if not movies:
            # Nothing to classify; avoid calling the tokenizer on empty lists.
            return []
        tokenized_movies = self.__tokenizeInputs(movies)
        predictions = self.__modelPredictions(self.model, tokenized_movies)
        return self.__predMovieRating(predictions)