RakeshUtekar
committed on
added files to run
- extract.py +21 -0
- preprocess.py +25 -0
- requirements.txt +5 -0
- retrieve.py +35 -0
extract.py
ADDED
@@ -0,0 +1,21 @@
+import pdfplumber
+
+
+def extract_text_from_pdfs(pdf_files):
+    """
+    Extracts text from a list of PDF files.
+
+    Args:
+        pdf_files (list): List of paths to PDF files.
+
+    Returns:
+        list: List of extracted text from each PDF.
+    """
+    all_texts = []
+    for pdf_file in pdf_files:
+        with pdfplumber.open(pdf_file) as pdf:
+            text = ""
+            for page in pdf.pages:
+                text += page.extract_text() or ""  # extract_text() returns None for pages with no text layer
+            all_texts.append(text)
+    return all_texts
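A minimal usage sketch for extract_text_from_pdfs (the paths below are placeholders; pdfplumber must be installed):

from extract import extract_text_from_pdfs

texts = extract_text_from_pdfs(["docs/report1.pdf", "docs/report2.pdf"])  # placeholder paths
print(len(texts), "documents extracted")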
preprocess.py
ADDED
@@ -0,0 +1,25 @@
+import string
+
+import nltk
+
+nltk.download('punkt')
+from nltk.tokenize import word_tokenize
+
+
+def preprocess_text(texts):
+    """
+    Preprocesses a list of texts by converting to lowercase, removing punctuation, and tokenizing.
+
+    Args:
+        texts (list): List of text strings to preprocess.
+
+    Returns:
+        list: List of preprocessed and tokenized texts.
+    """
+    processed_texts = []
+    for text in texts:
+        text = text.lower()
+        text = text.translate(str.maketrans('', '', string.punctuation))
+        tokens = word_tokenize(text)
+        processed_texts.append(tokens)
+    return processed_texts
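A quick sketch of how preprocess_text behaves (the input strings are only examples; nltk's punkt tokenizer data is downloaded when the module is imported):

from preprocess import preprocess_text

tokens = preprocess_text(["Hello, World!", "TF-IDF retrieval demo."])
# -> [['hello', 'world'], ['tfidf', 'retrieval', 'demo']]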
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+streamlit
+PyMuPDF
+nltk
+scikit-learn
+openai
retrieve.py
ADDED
@@ -0,0 +1,35 @@
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+def create_vectorizer(processed_texts):
+    """
+    Creates a TF-IDF vectorizer and transforms the texts.
+
+    Args:
+        processed_texts (list): List of preprocessed and tokenized texts.
+
+    Returns:
+        tuple: TF-IDF vectorizer and transformed text matrix.
+    """
+    vectorizer = TfidfVectorizer()
+    X = vectorizer.fit_transform([' '.join(text) for text in processed_texts])
+    return vectorizer, X
+
+def retrieve(query, X, vectorizer, top_k=5):
+    """
+    Retrieves the top-k most relevant texts for a given query.
+
+    Args:
+        query (str): Query string.
+        X (matrix): TF-IDF transformed text matrix.
+        vectorizer (TfidfVectorizer): TF-IDF vectorizer.
+        top_k (int): Number of top results to retrieve.
+
+    Returns:
+        list: Indices of the top-k most relevant texts.
+    """
+    query_vec = vectorizer.transform([query])
+    scores = (X @ query_vec.T).toarray()  # sparse matrix product; TF-IDF rows are L2-normalized, so this is cosine similarity
+    top_indices = np.argsort(scores, axis=0)[-top_k:][::-1]
+    return top_indices.flatten()
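Putting the modules together, a hedged end-to-end sketch (file names and the query are placeholders; note that extract.py needs pdfplumber, which requirements.txt does not list):

from extract import extract_text_from_pdfs
from preprocess import preprocess_text
from retrieve import create_vectorizer, retrieve

texts = extract_text_from_pdfs(["docs/a.pdf", "docs/b.pdf"])  # placeholder paths
processed = preprocess_text(texts)
vectorizer, X = create_vectorizer(processed)
top_indices = retrieve("what does the report say about revenue?", X, vectorizer, top_k=3)
for i in top_indices:
    print(texts[i][:200])  # preview of each top-ranked document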