import re
import unicodedata

import contractions
import sentencepiece as spm


## Tokenizer
class Callable_tokenizer():
    def __init__(self, tokenizer_path):
        self.path = tokenizer_path
        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.load(tokenizer_path)

    def __call__(self, text):
        return self.tokenizer.Encode(text)

    def get_tokenId(self, token_name):
        return self.tokenizer.piece_to_id(token_name)

    def get_tokenName(self, token_id):
        return self.tokenizer.id_to_piece(token_id)

    def decode(self, tokens_list):
        return self.tokenizer.Decode(tokens_list)

    def __len__(self):
        return len(self.tokenizer)

    def user_tokenization(self, text):
        # Wrap the encoded text with start/end tokens.
        # NOTE: the piece names '<s>' and '</s>' are assumed here (SentencePiece's
        # default BOS/EOS pieces); the original names were lost in extraction.
        return [self.get_tokenId('<s>')] + self(text) + [self.get_tokenId('</s>')]


# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')


## Preprocessing
def preprocess_ar(text):
    text = unicodeToAscii(text)  # Remove diacritics "التشكيل"
    text = text.replace("ة", "ه")
    # Map Latin punctuation to its Arabic counterparts
    text = text.replace(",", "،")
    text = text.replace(".", "۔")
    text = text.replace("?", "؟")
    text = re.sub(r"[;:]", "؛", text)
    # Keep only Arabic-block characters, digits, and sentence punctuation
    text = re.sub(r"[^؀-ۿ0-9.!¿]+", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()  # Trim multiple whitespaces to one
    return text


def preprocess_en(text):
    text = text.lower()
    text = contractions.fix(text)  # Fix contractions "it's" -> "it is"
    # Keep only lowercase letters, digits, and basic punctuation
    text = re.sub(r"[^a-z0-9?.!,¿:;'\"]+", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()  # Trim multiple whitespaces to one
    return text
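

## Usage sketch
# A minimal, hedged example of how the pieces above fit together: preprocess the raw
# text, then tokenize it with the wrapper class. The model path "tokenizer.model" and
# the sample sentences are illustrative assumptions, not part of this module.
if __name__ == "__main__":
    tokenizer = Callable_tokenizer("tokenizer.model")  # assumed path to a trained model

    en = preprocess_en("It's a small test, isn't it?")
    ar = preprocess_ar("هذا مثالٌ صغيرٌ للتجربة.")

    ids = tokenizer.user_tokenization(en)  # BOS id + subword ids + EOS id
    print(en, "->", ids)
    print("decoded:", tokenizer.decode(ids[1:-1]))  # strip BOS/EOS before decoding
    print("vocab size:", len(tokenizer))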