## OLD, NOT USED

from collections import Counter

class Vocabulary:
    def __init__(self, tokenizer, max_freq=3, unk=True, sos=False, eos=False):
        # NOTE: despite the name, max_freq acts as a *minimum* frequency
        # threshold: build_vocabulary drops tokens seen fewer than max_freq times.
        self.sos = sos
        self.eos = eos
        self.unk = unk
        self.tokenizer = tokenizer
        self.max_freq = max_freq

        # Special tokens take the first indices, so <PAD> is always 0.
        self.stoi = {'<PAD>': 0}
        if unk: self.stoi['<UNK>'] = len(self.stoi)
        if sos: self.stoi['<SOS>'] = len(self.stoi)
        if eos: self.stoi['<EOS>'] = len(self.stoi)

    def __len__(self):
        return len(self.stoi)

    def get_vocabulary(self):
        return self.stoi

    def set_vocabulary(self, stoi):
        self.stoi = stoi

    def add_token(self, token_name: str):
        if token_name not in self.stoi:
            self.stoi[token_name] = len(self.stoi)

    def build_vocabulary(self, sentences_list):
        # Accept either raw strings or pre-tokenized lists of words.
        if isinstance(sentences_list[0], list):
            sentences_list = [' '.join(sentence) for sentence in sentences_list]

        # Prefer the tokenizer's batched path when it provides one
        # (e.g. utils.spacy_tokenizer.batch_tokenize).
        if hasattr(self.tokenizer, 'batch_tokenize'):
            tokens_list = self.tokenizer.batch_tokenize(sentences_list)
        else:
            tokens_list = [self.tokenizer(sentence) for sentence in sentences_list]

        # Keep only tokens that occur at least max_freq times, then assign
        # them indices after the special tokens.
        word_counts = Counter(token for tokens in tokens_list for token in tokens)
        filtered_words = [word for word, count in word_counts.items() if count >= self.max_freq]
        offset = len(self.stoi)
        self.stoi.update({word: i + offset for i, word in enumerate(filtered_words)})

    def get_numerical_tokens(self, text):
        tokens = self.tokenizer(text)
        if self.sos: tokens.insert(0, '<SOS>')
        if self.eos: tokens.append('<EOS>')
        # Map out-of-vocabulary tokens to <UNK>. When the vocabulary was built
        # without <UNK> (unk=False), an unknown token raises KeyError instead
        # of silently becoming None.
        if self.unk:
            unk_id = self.stoi['<UNK>']
            return [self.stoi.get(token, unk_id) for token in tokens]
        return [self.stoi[token] for token in tokens]

    def __call__(self, text):
        return self.get_numerical_tokens(text)

    def tokens_to_text(self, tokens_list):
        # Invert stoi once (index -> token) instead of doing a linear search
        # through the values for every token.
        itos = {index: token for token, index in self.stoi.items()}
        return ' '.join(itos[token] for token in tokens_list)
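
# A minimal usage sketch (illustrative only; str.split stands in for the
# project's real tokenizers, and the exact ids depend on insertion order):
if __name__ == "__main__":
    vocab = Vocabulary(tokenizer=str.split, max_freq=1, sos=True, eos=True)
    vocab.build_vocabulary(["the cat sat", "the dog sat"])

    ids = vocab("the cat sat")
    print(ids)                        # [2, 4, 5, 6, 3]: <SOS>, the, cat, sat, <EOS>
    print(vocab.tokens_to_text(ids))  # "<SOS> the cat sat <EOS>"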


# # Arabic Tokenizer
# class camel_tokenizer():
#     def __call__(self, text):
#         return text.split(' ')


# # English Tokenizer
# class spacy_tokenizer:
#     def __init__(self):
#         self.spacy_eng = spacy.load("en_core_web_sm", disable=["ner", "parser"])
#         self.spacy_eng.max_length = 10**6

#     def __call__(self, text):
#         return [tok.text for tok in self.spacy_eng.tokenizer(text)]

#     def batch_tokenize(self, texts):
#         return [[tok.text for tok in doc] for doc in self.spacy_eng.pipe(texts, batch_size=256)]
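

# A hedged sketch of saving and restoring a built vocabulary with
# get_vocabulary / set_vocabulary; the JSON file name is illustrative,
# not part of the original project:
if __name__ == "__main__":
    import json

    vocab = Vocabulary(tokenizer=str.split, max_freq=1)
    vocab.build_vocabulary(["the cat sat", "the dog sat"])

    # stoi is a plain str -> int dict, so it round-trips through JSON as-is.
    with open('vocab.json', 'w') as f:
        json.dump(vocab.get_vocabulary(), f)

    restored = Vocabulary(tokenizer=str.split)
    with open('vocab.json') as f:
        restored.set_vocabulary(json.load(f))

    assert restored("the cat sat") == vocab("the cat sat")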