File size: 619 Bytes
456b206 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
import string
import nltk
# Import-time side effect: asks NLTK to download the 'punkt' resource every
# time this module is imported. Presumably this is the tokenizer data that
# word_tokenize relies on -- verify the resource name against the installed
# NLTK version.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
def preprocess_text(texts):
    """
    Preprocess texts by lowercasing, removing punctuation, and tokenizing.

    Each text is lowercased, every character listed in ``string.punctuation``
    is deleted, and the remaining string is split into tokens with
    ``word_tokenize``.

    Args:
        texts (list): List of text strings to preprocess.

    Returns:
        list: One list of tokens per input text, in the same order as the
            input. An empty input yields an empty list.
    """
    # The deletion table is identical for every text, so build it once
    # instead of rebuilding it on each loop iteration.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return [
        word_tokenize(text.lower().translate(strip_punctuation))
        for text in texts
    ]
|