Spaces:

TheDemond
/

NMT-Seq2Seq-with-Attention

Sleeping

Upload 7 files

a9ccfae verified 3 months ago

1.65 kB

	import unicodedata
	import pyarabic.araby as araby
	import contractions
	import re
	from camel_tools.tokenizers.word import simple_word_tokenize
	import spacy

	# https://stackoverflow.com/a/518232/2809427
	def unicodeToAscii(s):
	    """Return *s* with every combining mark removed.

	    The string is first decomposed with Unicode NFD normalization, which
	    splits precomposed characters into a base character followed by its
	    combining marks; every character whose Unicode category is ``Mn``
	    (nonspacing mark) is then dropped. Despite the name, characters that
	    are not ``Mn`` are kept as-is, so the result is not necessarily ASCII.
	    """
	    decomposed = unicodedata.normalize('NFD', s)
	    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
	    return ''.join(kept)

	def preprocess_ar(text):
	    """Normalize an Arabic sentence before tokenization.

	    Steps, in order:
	      1. lowercase the input and strip combining marks via ``unicodeToAscii``;
	      2. surround each punctuation mark listed in the first pattern with
	         spaces so it becomes a separate whitespace-delimited piece;
	      3. replace every run of characters that are neither in the Arabic
	         range given in the second pattern nor one of ``? . ! , ¿`` with a
	         single space;
	      4. remove diacritics (tashkeel) with ``araby.strip_diacritics``;
	      5. collapse whitespace runs to one space and trim both ends.

	    Returns the cleaned string.
	    """
	    normalized = unicodeToAscii(text.lower())
	    spaced = re.sub(r"([?.!؟،,¿])", r" \1 ", normalized)
	    arabic_only = re.sub(r"[^؀-ۿ?.!,¿]+", " ", spaced)
	    bare = araby.strip_diacritics(arabic_only)
	    collapsed = re.sub(r'\s+', ' ', bare)
	    return collapsed.strip()

	def preprocess_en(text):
	    """Normalize an English sentence before tokenization.

	    Steps, in order:
	      1. lowercase the input and expand contractions with
	         ``contractions.fix`` (e.g. "it's" -> "it is");
	      2. surround each of ``? . ! , ¿`` with spaces so it becomes a
	         separate whitespace-delimited piece;
	      3. replace every run of characters that are neither ASCII letters
	         nor one of those punctuation marks with a single space;
	      4. collapse whitespace runs to one space and trim both ends.

	    Returns the cleaned string.
	    """
	    expanded = contractions.fix(text.lower())
	    spaced = re.sub(r"([?.!,¿])", r" \1 ", expanded)
	    letters_only = re.sub(r"[^a-zA-Z?.!,¿]+", " ", spaced)
	    collapsed = re.sub(r'\s+', ' ', letters_only)
	    return collapsed.strip()

	# Arabic Tokenizer
	class camel_tokenizer:
	    """Callable wrapper around camel_tools' ``simple_word_tokenize``."""

	    def __call__(self, text):
	        """Tokenize *text* and return whatever ``simple_word_tokenize`` yields."""
	        tokens = simple_word_tokenize(text)
	        return tokens


	# English Tokenizer
	class spacy_tokenizer:
	    """Callable English tokenizer backed by spaCy's ``en_core_web_sm`` model.

	    Both entry points use only the pipeline's tokenizer, so a sentence
	    yields the same tokens whether it is tokenized on its own or as part
	    of a batch.
	    """

	    def __init__(self):
	        """Load ``en_core_web_sm`` with the ``ner`` and ``parser`` components disabled."""
	        self.spacy_eng = spacy.load("en_core_web_sm", disable=["ner", "parser"])
	        self.spacy_eng.max_length = 10**6

	    def __call__(self, text):
	        """Return the list of token strings for a single *text*."""
	        return [tok.text for tok in self.spacy_eng.tokenizer(text)]

	    def batch_tokenize(self, texts):
	        """Return one list of token strings per entry of *texts*.

	        Only token text is needed, so the texts are streamed through the
	        tokenizer's own ``pipe`` instead of the full ``Language.pipe``;
	        the latter would also run every still-enabled pipeline component
	        on each document for no benefit here.

	        NOTE(review): ``max_length`` is presumably enforced by the Language
	        object rather than by the bare tokenizer, so over-long texts are no
	        longer rejected on this path (matching ``__call__``) -- confirm.
	        """
	        tokenizer = self.spacy_eng.tokenizer
	        return [
	            [tok.text for tok in doc]
	            for doc in tokenizer.pipe(texts, batch_size=256)
	        ]