Spaces:
Sleeping
Sleeping
File size: 2,771 Bytes
21d8cce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import numpy as np
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
class MovieClassifier:
    """Predict a rating label ('bad', 'average' or 'good') for movies.

    Wraps a sequence-classification model and its tokenizer, both loaded
    from the same path. Each movie is described by a title, a summary and
    a list of genres; `predict` is the public entry point.
    """

    # Rating labels: a predicted class id ``i`` is reported as ``ratings[i]``.
    # NOTE(review): assumes the checkpoint's class ids follow this order - confirm.
    ratings = ['bad', 'average', 'good']

    def __init__(self, model_path):
        """Load the classification model and its tokenizer from *model_path*."""
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def __preProcessInput(self, titles, summaries, genres):
        """Normalise the raw movie fields into one dict per movie.

        Args:
            titles: list of strings, ``[title_1, title_2, ...]``.
            summaries: list of strings, ``[summary_1, summary_2, ...]``.
            genres: list of genre lists, ``[[genre_11, ...], [genre_21, ...], ...]``.

        Returns:
            A list of dicts with the keys ``'title'``, ``'summary'`` and
            ``'genres'``, one per entry of *titles*.

        Raises:
            IndexError: if *summaries* or *genres* is shorter than *titles*.
        """
        inputs = []
        for i, title in enumerate(titles):
            movie_genres = genres[i]
            inputs.append({
                # split() + join collapses every run of whitespace (including
                # newlines) into a single space and trims both ends.
                'title': ' '.join(title.split()),
                'summary': ' '.join(summaries[i].split()),
                # Any empty value (empty list/tuple, None) gets the
                # placeholder; otherwise genres are joined with '|'.
                'genres': '|'.join(movie_genres) if movie_genres else 'NonGiven',
            })
        return inputs

    def __tokenizeInputs(self, inputs):
        """Tokenize the pre-processed movies as sentence pairs.

        The first sequence is ``title + '<SEP>' + summary`` and the second
        sequence is the genre string. Returns whatever the tokenizer
        produces for PyTorch tensors (``return_tensors="pt"``).
        """
        # NOTE(review): '<SEP>' is inserted as plain text; whether the
        # tokenizer treats it as a special token depends on the checkpoint's
        # vocabulary - confirm before changing it.
        title_and_summary = [
            movie['title'] + '<SEP>' + movie['summary'] for movie in inputs
        ]
        genres_list = [movie['genres'] for movie in inputs]
        return self.tokenizer(
            title_and_summary,
            genres_list,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __modelPredictions(self, model, tokenized_input):
        """Run *model* on *tokenized_input* and return the predicted class ids.

        Args:
            model: callable invoked as ``model(**tokenized_input)``; its
                result must expose a ``logits`` tensor.
            tokenized_input: mapping of keyword arguments for *model*.

        Returns:
            A list of plain ``int`` class ids: the index of the largest
            logit along the last dimension.
        """
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            logits = model(**tokenized_input).logits
        # torch.argmax keeps the computation in torch (no implicit NumPy
        # conversion) and tolist() yields built-in ints.
        return torch.argmax(logits, dim=-1).tolist()

    def __predMovieRating(self, predictions):
        """Pair every predicted class id with its rating label.

        Returns:
            A list of ``(class_id, rating)`` tuples, where ``class_id`` is
            an ``int`` and ``rating`` is the matching entry of ``ratings``.
        """
        # int() accepts built-in ints as well as integer scalars from
        # NumPy/torch, so the result type does not depend on the caller.
        return [(int(pred), self.ratings[int(pred)]) for pred in predictions]

    def predict(self, title, summary, genre):
        """Predict the rating of one or more movies.

        Args:
            title: list of movie titles.
            summary: list of movie summaries, aligned with *title*.
            genre: list of genre lists, aligned with *title*.

        Returns:
            A list of ``(class_id, rating)`` tuples, one per movie, in input
            order. An empty input yields an empty list.
        """
        movies = self.__preProcessInput(title, summary, genre)
        if not movies:
            # Nothing to classify: skip the tokenizer and the model entirely.
            return []
        tokenized_movies = self.__tokenizeInputs(movies)
        predictions = self.__modelPredictions(self.model, tokenized_movies)
        return self.__predMovieRating(predictions)
|