# Page-scrape residue (not part of the module), kept as comments:
# Spaces: Runtime error
# File size: 1,058 Bytes
# 456b206
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def create_vectorizer(processed_texts):
    """
    Fit a TF-IDF model on a collection of tokenized documents.

    Args:
        processed_texts (list): Documents, each given as an iterable of
            string tokens (already preprocessed).

    Returns:
        tuple: ``(vectorizer, matrix)`` -- the fitted TfidfVectorizer and the
            result of ``fit_transform`` on the re-joined documents.
    """
    # TfidfVectorizer consumes raw strings, so each token sequence is glued
    # back together with single spaces before fitting.
    documents = []
    for tokens in processed_texts:
        documents.append(' '.join(tokens))

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(documents)
    return tfidf, matrix
def retrieve(query, X, vectorizer, top_k=5):
    """
    Retrieves the top-k most relevant texts for a given query.

    Relevance is the dot product between each row of ``X`` and the
    vectorized query; results are ordered from highest to lowest score.

    Args:
        query (str): Query string.
        X (matrix): TF-IDF transformed text matrix (one row per text), as
            returned by ``create_vectorizer``.
        vectorizer (TfidfVectorizer): Fitted vectorizer; only its
            ``transform`` method is used here.
        top_k (int): Number of top results to retrieve. Values larger than
            the number of rows return every row; values <= 0 return an
            empty result.

    Returns:
        numpy.ndarray: 1-D integer array with the row indices of the top-k
            most relevant texts, best match first.
    """
    # Guard first: with top_k == 0 the slice ``[-0:]`` would select *every*
    # row instead of none, and a negative top_k would slice from the wrong end.
    if top_k <= 0:
        return np.array([], dtype=np.intp)

    query_vec = vectorizer.transform([query])

    # ``@`` is the supported matrix product for SciPy sparse operands;
    # ``np.dot`` is not sparse-aware and only works by accident.
    product = X @ query_vec.T

    # The product is sparse when both operands are sparse; densify it.
    # NOTE(review): a dense ``X`` is assumed to yield a dense product -- confirm
    # if callers ever pass one.
    if hasattr(product, "toarray"):
        product = product.toarray()
    scores = np.asarray(product).ravel()

    # argsort is ascending: keep the last top_k entries, then flip them so
    # the best-scoring row comes first.
    ranked = np.argsort(scores)[-top_k:][::-1]
    return ranked
|