import spacy
from collections import Counter


class spacy_tokenizer:
    """Lowercasing wrapper around spaCy's English tokenizer."""

    def __init__(self):
        self.spacy_eng = spacy.load("en_core_web_sm")

    def __call__(self, text):
        return [tok.text.lower() for tok in self.spacy_eng.tokenizer(text)]


class Vocabulary:
    def __init__(self, callable_tokenizer=None, max_freq=3, unk=True, sos=False, eos=False):
        self.sos = sos
        self.eos = eos
        self.unk = unk
        # Minimum corpus count a word needs to enter the vocabulary
        self.max_freq = max_freq
        if callable_tokenizer:
            self.callable_tokenizer = callable_tokenizer
        else:
            self.callable_tokenizer = spacy_tokenizer()
        # Special tokens: <pad> is always index 0
        self.stoi = {"<pad>": 0}
        if self.unk:
            self.stoi["<unk>"] = len(self.stoi)
        if self.sos:
            self.stoi["<sos>"] = len(self.stoi)
        if self.eos:
            self.stoi["<eos>"] = len(self.stoi)

    def __len__(self):
        return len(self.stoi)

    def get_vocabulary(self):
        return self.stoi

    def add_token(self, token_name: str):
        if token_name not in self.stoi:
            self.stoi[token_name] = len(self.stoi)

    def build_vocabulary(self, sentences_list):
        # Accept pre-tokenized input, e.g. [['eating', 'apples'], ['eating', 'oranges']]
        if not isinstance(sentences_list[0], str):
            sentences_list = [' '.join(sen) for sen in sentences_list]

        word_counts = Counter()
        for sentence in sentences_list:
            tokens = self.callable_tokenizer(sentence)
            word_counts.update(tokens)

        # Keep words with max_freq or more occurrences
        filtered_words = [word for word, count in word_counts.items() if count >= self.max_freq]
        for word in filtered_words:
            if word not in self.stoi:
                self.stoi[word] = len(self.stoi)

    def get_numerical_tokens(self, text: str):
        tokens = self.callable_tokenizer(text)
        if self.sos:
            tokens.insert(0, '<sos>')
        if self.eos:
            tokens.append('<eos>')
        # Out-of-vocabulary words map to <unk>; None if no <unk> token was created
        unk_id = self.stoi.get('<unk>')
        return [self.stoi.get(word, unk_id) for word in tokens]

    def __call__(self, text: str):
        return self.get_numerical_tokens(text)

    def tokens_to_text(self, tokens_list):
        # Invert the mapping once instead of scanning stoi for every token
        itos = {idx: word for word, idx in self.stoi.items()}
        return ' '.join(itos[token] for token in tokens_list)
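

# Usage sketch (not part of the original module; the corpus and expected output
# below are illustrative assumptions). Shows building a vocabulary from a small
# corpus and round-tripping a sentence through numericalization. Requires the
# spaCy model: python -m spacy download en_core_web_sm
if __name__ == "__main__":
    corpus = [
        "I like eating apples .",
        "She is eating apples too .",
        "They were eating apples yesterday .",
    ]
    vocab = Vocabulary(max_freq=3, unk=True, sos=True, eos=True)
    vocab.build_vocabulary(corpus)

    # Only "eating", "apples", and "." occur >= 3 times, so everything else
    # falls back to <unk> at encoding time.
    ids = vocab("I am eating apples")
    print(ids)
    print(vocab.tokens_to_text(ids))  # "<sos> <unk> <unk> eating apples <eos>"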