File size: 619 Bytes
456b206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import string

import nltk

# Module-level side effect: requests the 'punkt' NLTK resource every time this module is imported.
# NOTE(review): presumably this is the data word_tokenize needs; newer NLTK releases may ask for 'punkt_tab' instead — confirm.
nltk.download('punkt')
from nltk.tokenize import word_tokenize


def preprocess_text(texts):
    """
    Lowercase each text, delete its punctuation, and split it into tokens.

    Punctuation means the characters in ``string.punctuation``; other
    characters are passed through to the tokenizer unchanged.

    Args:
        texts (list): List of text strings to preprocess.

    Returns:
        list: One list of tokens per input text, in the same order as ``texts``.
    """
    # The deletion table is identical for every text, so build it once
    # instead of once per iteration.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return [
        word_tokenize(text.lower().translate(strip_punctuation))
        for text in texts
    ]