Spaces:

RakeshUtekar
/

RAG-based-PDF-Query-System

Running

File size: 619 Bytes

456b206

import string

import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize


def preprocess_text(texts):
    """
    Preprocesses a list of texts by converting to lowercase, removing punctuation, and tokenizing.

    Args:
    texts (list): List of text strings to preprocess.

    Returns:
    list: List of preprocessed and tokenized texts.
    """
    processed_texts = []
    for text in texts:
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        processed_texts.append(tokens)
    return processed_texts