Spaces:

TheDemond
/

NMT-Seq2Seq-with-Attention

Sleeping

File size: 1,653 Bytes

a9ccfae

import unicodedata
import pyarabic.araby as araby
import contractions
import re
from camel_tools.tokenizers.word import simple_word_tokenize
import spacy

# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return *s* in NFD form with all nonspacing marks removed.

    The string is canonically decomposed so that accents and similar
    marks become separate code points, then every code point whose
    Unicode category is ``Mn`` is dropped. Despite the name, characters
    outside ASCII that are not ``Mn`` marks are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Nonspacing combining mark: drop it.
            continue
        kept.append(ch)
    return ''.join(kept)

def preprocess_ar(text):
    """Normalize an Arabic sentence into a cleaned, space-separated string.

    Steps, in order: lowercase; strip combining marks via
    ``unicodeToAscii``; surround punctuation with spaces; replace every
    run of characters outside the allowed set (the Arabic range written
    in the pattern plus ``? . ! , ¿``) with a single space; strip
    diacritics with pyarabic; collapse whitespace and trim the ends.
    """
    normalized = unicodeToAscii(text.lower())
    # Pad punctuation so each mark ends up separated by spaces.
    spaced = re.sub(r"([?.!؟،,¿])", r" \1 ", normalized)
    # Anything outside the allowed character set becomes a space.
    filtered = re.sub(r"[^؀-ۿ?.!,¿]+", " ", spaced)
    # Remove diacritics "التشكيل"
    undiacritized = araby.strip_diacritics(filtered)
    # Squeeze repeated whitespace to one space and trim both ends.
    return re.sub(r'\s+', ' ', undiacritized).strip()

def preprocess_en(text):
    """Normalize an English sentence into a cleaned, space-separated string.

    Steps, in order: lowercase; expand contractions with
    ``contractions.fix`` (e.g. "it's" -> "it is"); surround
    ``? . ! , ¿`` with spaces; replace every run of characters that are
    neither ASCII letters nor that punctuation with a single space;
    collapse whitespace and trim the ends.
    """
    expanded = contractions.fix(text.lower())
    # Pad punctuation so each mark ends up separated by spaces.
    spaced = re.sub(r"([?.!,¿])", r" \1 ", expanded)
    # Digits, symbols and non-ASCII letters are replaced by a space.
    letters_only = re.sub(r"[^a-zA-Z?.!,¿]+", " ", spaced)
    # Squeeze repeated whitespace to one space and trim both ends.
    return re.sub(r'\s+', ' ', letters_only).strip()

# Arabic Tokenizer
class camel_tokenizer:
    """Callable wrapper around camel_tools' ``simple_word_tokenize``."""

    def __call__(self, text):
        """Return the result of ``simple_word_tokenize(text)`` unchanged."""
        tokens = simple_word_tokenize(text)
        return tokens


# English Tokenizer
class spacy_tokenizer:
    """English tokenizer backed by spaCy's ``en_core_web_sm`` pipeline.

    Both entry points return only token texts, so both go through the
    pipeline's tokenizer directly instead of running the remaining
    pipeline components.
    """

    def __init__(self):
        """Load ``en_core_web_sm`` with ``ner`` and ``parser`` disabled."""
        self.spacy_eng = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        self.spacy_eng.max_length = 10**6

    def __call__(self, text):
        """Tokenize one string and return its token texts as a list."""
        return [tok.text for tok in self.spacy_eng.tokenizer(text)]

    def batch_tokenize(self, texts, batch_size=256):
        """Tokenize many strings and return one list of token texts per input.

        Args:
            texts: Iterable of strings to tokenize.
            batch_size: Number of texts buffered per batch by the
                tokenizer; defaults to the previously hard-coded 256.

        Returns:
            A list with one inner list of token texts for each input,
            in input order.
        """
        # Use the tokenizer's own pipe(): only token texts are needed, so
        # running the other pipeline components (as Language.pipe would)
        # is wasted work, and this keeps batch output consistent with
        # __call__.
        # NOTE(review): the bare tokenizer presumably does not apply the
        # max_length guard that Language.pipe does — confirm if over-long
        # inputs must still be rejected here.
        docs = self.spacy_eng.tokenizer.pipe(texts, batch_size=batch_size)
        return [[tok.text for tok in doc] for doc in docs]