import re
import unicodedata

import contractions
import sentencepiece as spm


## Tokenizer
class Callable_tokenizer():
    def __init__(self, tokenizer_path):
        self.path = tokenizer_path
        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.load(tokenizer_path)

    def __call__(self, text):
        return self.tokenizer.Encode(text)

    def get_tokenId(self, token_name):
        return self.tokenizer.piece_to_id(token_name)

    def get_tokenName(self, token_id):
        return self.tokenizer.id_to_piece(token_id)

    def decode(self, tokens_list):
        return self.tokenizer.Decode(tokens_list)

    def __len__(self):
        return len(self.tokenizer)

    def user_tokenization(self, text):
        # Wrap the encoded text with start/end tokens.
        # NOTE: the piece names '<s>' and '</s>' are assumed here (SentencePiece's
        # default BOS/EOS pieces); the original names were lost in extraction.
        return [self.get_tokenId('<s>')] + self(text) + [self.get_tokenId('</s>')]


# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')


## Preprocessing
def preprocess_ar(text):
    text = unicodeToAscii(text)  # Remove diacritics "التشكيل"
    text = text.replace("ة", "ه")
    # Map Latin punctuation to its Arabic counterparts
    text = text.replace(",", "،")
    text = text.replace(".", "۔")
    text = text.replace("?", "؟")
    text = re.sub(r"[;:]", "؛", text)
    # Keep only Arabic-block characters, digits, and sentence punctuation
    text = re.sub(r"[^؀-ۿ0-9.!¿]+", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()  # Trim multiple whitespaces to one
    return text


def preprocess_en(text):
    text = text.lower()
    text = contractions.fix(text)  # Fix contractions "it's" -> "it is"
    # Keep only lowercase letters, digits, and basic punctuation
    text = re.sub(r"[^a-z0-9?.!,¿:;'\"]+", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()  # Trim multiple whitespaces to one
    return text
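

## Usage sketch
# A minimal, hedged example of how the pieces above fit together: preprocess the raw
# text, then tokenize it with the wrapper class. The model path "tokenizer.model" and
# the sample sentences are illustrative assumptions, not part of this module.
if __name__ == "__main__":
    tokenizer = Callable_tokenizer("tokenizer.model")  # assumed path to a trained model

    en = preprocess_en("It's a small test, isn't it?")
    ar = preprocess_ar("هذا مثالٌ صغيرٌ للتجربة.")

    ids = tokenizer.user_tokenization(en)  # BOS id + subword ids + EOS id
    print(en, "->", ids)
    print("decoded:", tokenizer.decode(ids[1:-1]))  # strip BOS/EOS before decoding
    print("vocab size:", len(tokenizer))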