|
import unicodedata
|
|
import pyarabic.araby as araby
|
|
import contractions
|
|
import re
|
|
from camel_tools.tokenizers.word import simple_word_tokenize
|
|
import spacy
|
|
|
|
|
|
def unicodeToAscii(s):
    """Return ``s`` in NFD form with every nonspacing mark removed.

    The input is canonically decomposed (NFD) and each character whose
    Unicode general category is ``Mn`` is dropped. Despite the name, other
    non-ASCII characters are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining accent / diacritic split off by NFD: discard it.
            continue
        kept.append(ch)
    return ''.join(kept)
|
|
|
|
def preprocess_ar(text):
    """Clean an Arabic sentence before tokenization.

    Steps, in order:
      1. Lower-case the text (only affects cased scripts, e.g. Latin).
      2. Apply ``unicodeToAscii`` (NFD decomposition, then removal of
         nonspacing marks).
      3. Surround sentence punctuation with spaces.
      4. Replace every run of characters outside the Arabic block
         (U+0600-U+06FF) and outside ``? . ! , ¿`` with a single space.
      5. Run pyarabic's ``strip_diacritics`` on the result.
      6. Collapse whitespace runs and trim both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with tokens separated by single spaces.
    """
    text = text.lower()

    # NOTE: NFD also splits letters such as U+0623 into a base letter plus a
    # combining hamza/madda (category Mn), so those marks are removed here too.
    text = unicodeToAscii(text)

    # Pad punctuation so it becomes a separate token. U+061F is the Arabic
    # question mark and U+060C the Arabic comma; they are written as escapes
    # so the pattern does not depend on how an editor renders RTL characters.
    text = re.sub(r"([?.!\u061F\u060C,¿])", r" \1 ", text)

    # Keep only the Arabic block and basic punctuation. The range bounds are
    # spelled as escapes on purpose: U+0600 is an invisible format character,
    # and if it is lost from the literal the class degrades to "-", U+06FF and
    # punctuation, which would erase every Arabic letter.
    text = re.sub(r"[^\u0600-\u06FF?.!,¿]+", " ", text)

    text = araby.strip_diacritics(text)

    return re.sub(r"\s+", " ", text).strip()
|
|
|
|
def preprocess_en(text):
    """Clean an English sentence before tokenization.

    The text is lower-cased and passed through ``contractions.fix``
    (presumably expanding forms such as "don't"; see that library for the
    exact rules). Punctuation ``? . ! , ¿`` is then padded with spaces, every
    run of other non-letter characters becomes a single space, and whitespace
    is collapsed and trimmed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with tokens separated by single spaces.
    """
    cleaned = contractions.fix(text.lower())

    # Applied strictly in this order: pad punctuation, drop everything that is
    # neither a letter nor kept punctuation, then squeeze whitespace.
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r"[^a-zA-Z?.!,¿]+", " "),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
|
|
|
|
|
|
class camel_tokenizer:
    """Callable object that tokenizes text with camel_tools.

    Instances hold no state; calling one forwards the text to
    ``simple_word_tokenize``.
    """

    def __call__(self, text):
        """Return whatever ``simple_word_tokenize`` produces for ``text``."""
        tokens = simple_word_tokenize(text)
        return tokens
|
|
|
|
|
|
|
|
|
|
class spacy_tokenizer:
    """English tokenizer backed by spaCy's ``en_core_web_sm`` model.

    Both entry points return plain token strings and use only the model's
    tokenizer, so single-text and batched calls go through the same code path.
    """

    def __init__(self):
        """Load ``en_core_web_sm`` with the ``ner`` and ``parser`` components disabled."""
        self.spacy_eng = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        self.spacy_eng.max_length = 10**6

    def __call__(self, text):
        """Tokenize a single string.

        Args:
            text: The string to tokenize.

        Returns:
            A list with the ``text`` of each token, in order.
        """
        return [tok.text for tok in self.spacy_eng.tokenizer(text)]

    def batch_tokenize(self, texts):
        """Tokenize an iterable of strings.

        Args:
            texts: Iterable of strings.

        Returns:
            A list containing one list of token strings per input text, in
            input order.
        """
        # Only token texts are needed, so stream through the tokenizer alone
        # (as __call__ does) instead of the full pipeline, whose remaining
        # components would compute annotations that are discarded here.
        # NOTE(review): the tokenizer-only path presumably skips the
        # `max_length` check that `Language.pipe` applies — confirm if
        # oversized inputs must still be rejected.
        docs = self.spacy_eng.tokenizer.pipe(texts, batch_size=256)
        return [[tok.text for tok in doc] for doc in docs]