## OLD, NOT USED
from utils import spacy_tokenizer
from collections import Counter


class Vocabulary:
    def __init__(self, tokenizer, max_freq=3, unk=True, sos=False, eos=False):
        self.sos = sos
        self.eos = eos
        self.unk = unk
        self.tokenizer = tokenizer
        # Despite the name, max_freq is used as a *minimum* count threshold in build_vocabulary.
        self.max_freq = max_freq
        # Index 0 is reserved for the padding token; optional special tokens follow.
        self.stoi = {"<pad>": 0}
        if unk:
            self.stoi['<unk>'] = len(self.stoi)
        if sos:
            self.stoi['<sos>'] = len(self.stoi)
        if eos:
            self.stoi['<eos>'] = len(self.stoi)

    def __len__(self):
        return len(self.stoi)

    def get_vocabulary(self):
        return self.stoi

    def set_vocabulary(self, stoi):
        self.stoi = stoi

    def add_token(self, token_name: str):
        if token_name not in self.stoi:
            self.stoi[token_name] = len(self.stoi)

    def build_vocabulary(self, sentences_list):
        # Accept either raw strings or pre-tokenized lists of tokens.
        if isinstance(sentences_list[0], list):
            sentences_list = [' '.join(sentence) for sentence in sentences_list]
        # Use the faster batch interface when the spaCy tokenizer provides one.
        if isinstance(self.tokenizer, spacy_tokenizer) and hasattr(self.tokenizer, 'batch_tokenize'):
            tokens_list = self.tokenizer.batch_tokenize(sentences_list)
        else:
            tokens_list = [self.tokenizer(sentence) for sentence in sentences_list]
        word_counts = Counter(token for tokens in tokens_list for token in tokens)
        # Keep only words that occur at least `max_freq` times.
        filtered_words = [word for word, count in word_counts.items() if count >= self.max_freq]
        self.stoi.update({word: i + len(self.stoi) for i, word in enumerate(filtered_words)})

    def get_numerical_tokens(self, text):
        tokens = self.tokenizer(text)
        if self.sos:
            tokens.insert(0, '<sos>')
        if self.eos:
            tokens.append('<eos>')
        # Out-of-vocabulary tokens map to the <unk> id (or None if <unk> is disabled).
        unk_id = self.stoi.get('<unk>', None)
        return [self.stoi.get(token, unk_id) for token in tokens]

    def __call__(self, text):
        return self.get_numerical_tokens(text)

    def tokens_to_text(self, tokens_list):
        # Invert the stoi mapping one id at a time (slow, but adequate for debugging output).
        keys = list(self.stoi.keys())
        values = list(self.stoi.values())
        return ' '.join([keys[values.index(token)] for token in tokens_list])


# # Arabic Tokenizer
# class camel_tokenizer():
#     def __call__(self, text):
#         return text.split(' ')

# # English Tokenizer
# class spacy_tokenizer:
#     def __init__(self):
#         self.spacy_eng = spacy.load("en_core_web_sm", disable=["ner", "parser"])
#         self.spacy_eng.max_length = 10**6
#
#     def __call__(self, text):
#         return [tok.text for tok in self.spacy_eng.tokenizer(text)]
#
#     def batch_tokenize(self, texts):
#         return [[tok.text for tok in doc] for doc in self.spacy_eng.pipe(texts, batch_size=256)]
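
# Minimal usage sketch (not part of the original module): it assumes `utils`
# exposes a `spacy_tokenizer` with the interface commented out above and that
# the spaCy model "en_core_web_sm" is installed; the sentences are made-up examples.
if __name__ == "__main__":
    tokenizer = spacy_tokenizer()
    vocab = Vocabulary(tokenizer, max_freq=2, unk=True, sos=True, eos=True)
    vocab.build_vocabulary([
        "the cat sat on the mat",
        "the dog sat on the rug",
    ])
    # Words seen fewer than max_freq times (e.g. "cat") fall back to <unk>.
    ids = vocab("the cat sat")        # e.g. [<sos>, the, <unk>, sat, <eos>] as integer ids
    print(ids)
    print(vocab.tokens_to_text(ids))  # round-trips back to "<sos> the <unk> sat <eos>"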