# File size: 2,771 Bytes
# 21d8cce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import numpy as np
import torch

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification


class MovieClassifier:
    """Predict a coarse rating label for movies from title, summary and genres.

    The class wraps a sequence-classification model and the tokenizer loaded
    from the same path. Class indices produced by the model are translated to
    human-readable labels through ``ratings``.
    """

    # Label reported for each predicted class index: index ``i`` maps to
    # ``ratings[i]``.
    # NOTE(review): assumes the model was trained with this label order - confirm.
    ratings = ['bad', 'average', 'good']

    def __init__(self, model_path):
        """Load the classification model and its tokenizer from ``model_path``."""
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def __preProcessInput(self, titles, summaries, genres):
        """Normalize the raw movie fields into one dict per movie.

        Args:
            titles: sequence of title strings, ``[title_1, title_2, ...]``.
            summaries: sequence of summary strings aligned with ``titles``.
            genres: sequence aligned with ``titles``. Each item is a list of
                genre strings, a single genre string, or an empty value
                (``[]``, ``()``, ``''`` or ``None``) when no genre is known.

        Returns:
            A list of dicts with the keys ``'title'``, ``'summary'`` and
            ``'genres'``, in input order.

        Raises:
            ValueError: if the three sequences do not have the same length.
        """
        # Fail loudly instead of silently dropping trailing items or raising
        # an opaque IndexError halfway through the batch.
        if not (len(titles) == len(summaries) == len(genres)):
            raise ValueError(
                "titles, summaries and genres must have the same length "
                f"(got {len(titles)}, {len(summaries)} and {len(genres)})"
            )

        inputs = []
        for title, summary, movie_genres in zip(titles, summaries, genres):
            if not movie_genres:
                # Placeholder used when a movie has no genre information.
                genres_text = 'NonGiven'
            elif isinstance(movie_genres, str):
                # A bare string is one genre label; joining it would split it
                # into single characters.
                genres_text = movie_genres
            else:
                # Collapse the list of genres into one '|'-separated string.
                genres_text = '|'.join(movie_genres)

            inputs.append({
                # split()/join collapses runs of whitespace (including
                # newlines) and trims both ends.
                'title': ' '.join(title.split()),
                'summary': ' '.join(summary.split()),
                'genres': genres_text,
            })

        return inputs

    def __tokenizeInputs(self, inputs):
        """Tokenize preprocessed movies as text pairs.

        The first segment of each pair is ``title + '<SEP>' + summary`` and the
        second segment is the genre string. Sequences are padded to the
        tokenizer's maximum length, truncated when longer, and returned as
        PyTorch tensors.
        """
        title_and_summary = [
            movie['title'] + '<SEP>' + movie['summary'] for movie in inputs
        ]
        genres_list = [movie['genres'] for movie in inputs]

        return self.tokenizer(
            title_and_summary,
            genres_list,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __modelPredictions(self, model, tokenized_input):
        """Run ``model`` on the tokenized batch and pick the top class per item.

        Args:
            model: callable invoked as ``model(**tokenized_input)``; its result
                must expose a ``logits`` tensor.
            tokenized_input: mapping of keyword arguments for the model.

        Returns:
            A tensor holding, for each item, the index of the largest logit
            along the last axis.
        """
        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            model_output = model(**tokenized_input)

        # argmax directly in torch rather than routing the tensor through NumPy.
        return torch.argmax(model_output.logits, dim=-1)

    def __predMovieRating(self, predictions):
        """Pair every predicted class index with its label from ``ratings``."""
        return [(pred, self.ratings[pred]) for pred in predictions]

    def predict(self, title, summary, genre):
        """Predict a rating for each movie in the batch.

        Args:
            title: sequence of movie titles.
            summary: sequence of movie summaries aligned with ``title``.
            genre: sequence of per-movie genres aligned with ``title`` (see
                ``__preProcessInput`` for the accepted item forms).

        Returns:
            A list of ``(class_index, label)`` tuples, one per movie and in
            input order; an empty list when no movies are given.

        Raises:
            ValueError: if the three sequences do not have the same length.
        """
        movies = self.__preProcessInput(title, summary, genre)
        if not movies:
            # Nothing to classify; avoid sending an empty batch to the tokenizer.
            return []

        tokenized_movies = self.__tokenizeInputs(movies)
        predictions = self.__modelPredictions(self.model, tokenized_movies)

        return self.__predMovieRating(predictions)