import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS, HYPHENS
from spacy.util import compile_infix_regex
from spacy.lang.en import English

# Blank English pipeline; only its tokenizer is used below.
nlp = English()
def get_tokenizer_gec(nlp):
    # Tokenizer with custom infix rules: split on ellipses, icons, arithmetic
    # operators between digits, periods between letter-case changes, and
    # commas/colons/slashes between letters. The hyphen rule is commented out,
    # so hyphenated words (e.g. "well-known") stay as single tokens.
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
    )
    infix_re = compile_infix_regex(infixes)
    return Tokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )
def get_tokenizer_bea19(nlp):
    # Same infix rules as get_tokenizer_gec, except the hyphen rule is enabled,
    # so hyphenated words are split (e.g. "well-known" -> "well", "-", "known").
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ]
    )
    infix_re = compile_infix_regex(infixes)
    return Tokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )
tokenizer_gec = get_tokenizer_gec(nlp)
tokenizer_bea19 = get_tokenizer_bea19(nlp)


def spacy_tokenize_gec(text):
    # Swap the GEC tokenizer into the shared pipeline, then tokenize.
    nlp.tokenizer = tokenizer_gec
    return [str(w) for w in nlp(text)]


def spacy_tokenize_bea19(text):
    # Swap the BEA19 tokenizer into the shared pipeline, then tokenize.
    nlp.tokenizer = tokenizer_bea19
    return [str(w) for w in nlp(text)]