TheDemond's picture
Upload 7 files
a9ccfae verified
raw
history blame
1.65 kB
import unicodedata
import pyarabic.araby as araby
import contractions
import re
from camel_tools.tokenizers.word import simple_word_tokenize
import spacy
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return *s* with every nonspacing combining mark removed.

    The string is first decomposed with NFD normalization so that accented
    characters split into a base character followed by combining marks; all
    characters whose Unicode category is ``Mn`` are then dropped. Characters
    that have no such decomposition are kept as they are, so the result is
    not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def preprocess_ar(text):
    """Clean a raw Arabic string and return the normalized result.

    Steps, in order:
      1. lowercase the text and strip combining marks via ``unicodeToAscii``;
      2. surround Latin and Arabic punctuation marks with spaces;
      3. replace every run of characters outside the Arabic range used in
         the pattern and the kept punctuation ``? . ! , ¿`` with one space;
      4. remove diacritics (tashkeel) with ``araby.strip_diacritics``;
      5. collapse repeated whitespace and trim both ends.
    """
    normalized = unicodeToAscii(text.lower())
    # Pad punctuation so each mark stands apart from neighbouring words.
    spaced = re.sub(r"([?.!؟،,¿])", r" \1 ", normalized)
    # Everything that is neither Arabic-range nor kept punctuation becomes a space.
    arabic_only = re.sub(r"[^؀-ۿ?.!,¿]+", " ", spaced)
    undiacritized = araby.strip_diacritics(arabic_only)
    # Squeeze whitespace runs down to a single space.
    return re.sub(r'\s+', ' ', undiacritized).strip()
def preprocess_en(text):
    """Clean a raw English string and return the normalized result.

    Steps, in order:
      1. lowercase the text;
      2. expand contractions with ``contractions.fix`` (e.g. "it's" -> "it is");
      3. surround the punctuation marks ``? . ! , ¿`` with spaces;
      4. replace every run of characters that are neither ASCII letters nor
         that punctuation with one space;
      5. collapse repeated whitespace and trim both ends.
    """
    expanded = contractions.fix(text.lower())
    # Pad punctuation so each mark stands apart from neighbouring words.
    spaced = re.sub(r"([?.!,¿])", r" \1 ", expanded)
    # Digits, symbols and any other non-letter characters turn into spaces.
    letters_only = re.sub(r"[^a-zA-Z?.!,¿]+", " ", spaced)
    # Squeeze whitespace runs down to a single space.
    return re.sub(r'\s+', ' ', letters_only).strip()
# Arabic Tokenizer
class camel_tokenizer:
    """Callable wrapper around camel_tools' ``simple_word_tokenize``."""

    def __call__(self, text):
        """Tokenize *text* and return the result of ``simple_word_tokenize``."""
        tokens = simple_word_tokenize(text)
        return tokens
# English Tokenizer
class spacy_tokenizer:
    """English tokenizer backed by spaCy's ``en_core_web_sm`` pipeline.

    Both the single-text and the batch entry points apply only the
    pipeline's tokenizer, so they produce tokens the same way.
    """

    def __init__(self):
        """Load ``en_core_web_sm`` with the ``ner`` and ``parser`` components disabled."""
        self.spacy_eng = spacy.load("en_core_web_sm", disable=["ner", "parser"])
        # Per-text character limit on the loaded pipeline object.
        self.spacy_eng.max_length = 10**6

    def __call__(self, text):
        """Return the token strings of a single *text* as a list."""
        return [tok.text for tok in self.spacy_eng.tokenizer(text)]

    def batch_tokenize(self, texts):
        """Tokenize an iterable of texts.

        Returns a list with one list of token strings per input text, in
        input order.

        Only token texts are read, so the texts are streamed through the
        tokenizer's own ``pipe`` instead of the full pipeline's ``pipe``;
        this avoids running the remaining pipeline components whose output
        would be discarded and keeps the result consistent with ``__call__``.
        """
        docs = self.spacy_eng.tokenizer.pipe(texts, batch_size=256)
        return [[tok.text for tok in doc] for doc in docs]