RakeshUtekar committed (verified)
Commit 456b206 · Parent: 459ab69

added files to run

Files changed (4)
  1. extract.py +21 -0
  2. preprocess.py +25 -0
  3. requirements.txt +5 -0
  4. retrieve.py +35 -0
extract.py ADDED
@@ -0,0 +1,21 @@
+ import pdfplumber
+
+
+ def extract_text_from_pdfs(pdf_files):
+     """
+     Extracts text from a list of PDF files.
+
+     Args:
+         pdf_files (list): List of paths to PDF files.
+
+     Returns:
+         list: List of extracted text from each PDF.
+     """
+     all_texts = []
+     for pdf_file in pdf_files:
+         with pdfplumber.open(pdf_file) as pdf:
+             text = ""
+             for page in pdf.pages:
+                 text += page.extract_text() or ""  # extract_text() returns None for image-only pages
+             all_texts.append(text)
+     return all_texts
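
For context, a minimal usage sketch of the function above (the PDF paths are hypothetical; any readable, text-based PDFs work):

    from extract import extract_text_from_pdfs

    # Hypothetical input paths for illustration
    texts = extract_text_from_pdfs(["report1.pdf", "report2.pdf"])
    print(f"Extracted {len(texts)} documents")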
preprocess.py ADDED
@@ -0,0 +1,25 @@
+ import string
+
+ import nltk
+ from nltk.tokenize import word_tokenize
+
+ nltk.download('punkt')  # fetch the tokenizer model on first run
+
+
+ def preprocess_text(texts):
+     """
+     Preprocesses a list of texts by converting to lowercase, removing punctuation, and tokenizing.
+
+     Args:
+         texts (list): List of text strings to preprocess.
+
+     Returns:
+         list: List of preprocessed and tokenized texts.
+     """
+     processed_texts = []
+     for text in texts:
+         text = text.lower()
+         text = text.translate(str.maketrans('', '', string.punctuation))
+         tokens = word_tokenize(text)
+         processed_texts.append(tokens)
+     return processed_texts
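
Continuing the sketch above, preprocess_text consumes the extractor's output directly (path again hypothetical):

    from extract import extract_text_from_pdfs
    from preprocess import preprocess_text

    texts = extract_text_from_pdfs(["report1.pdf"])  # hypothetical path
    tokens = preprocess_text(texts)
    print(tokens[0][:10])  # first ten tokens of the first document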
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ pdfplumber
+ nltk
+ scikit-learn
+ openai
retrieve.py ADDED
@@ -0,0 +1,35 @@
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+ def create_vectorizer(processed_texts):
+     """
+     Creates a TF-IDF vectorizer and transforms the texts.
+
+     Args:
+         processed_texts (list): List of preprocessed and tokenized texts.
+
+     Returns:
+         tuple: TF-IDF vectorizer and transformed text matrix.
+     """
+     vectorizer = TfidfVectorizer()
+     X = vectorizer.fit_transform([' '.join(text) for text in processed_texts])  # re-join tokens; the vectorizer expects raw strings
+     return vectorizer, X
+
+ def retrieve(query, X, vectorizer, top_k=5):
+     """
+     Retrieves the top-k most relevant texts for a given query.
+
+     Args:
+         query (str): Query string.
+         X (matrix): TF-IDF transformed text matrix.
+         vectorizer (TfidfVectorizer): TF-IDF vectorizer.
+         top_k (int): Number of top results to retrieve.
+
+     Returns:
+         list: Indices of the top-k most relevant texts.
+     """
+     query_vec = vectorizer.transform([query])
+     scores = (X @ query_vec.T).toarray().ravel()  # sparse-safe product; np.dot mishandles scipy sparse inputs
+     top_indices = np.argsort(scores)[-top_k:][::-1]  # highest-scoring documents first
+     return top_indices
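
Putting the three modules together, a hedged end-to-end sketch (the paths and query string are hypothetical; retrieve returns indices into the original list of texts):

    from extract import extract_text_from_pdfs
    from preprocess import preprocess_text
    from retrieve import create_vectorizer, retrieve

    texts = extract_text_from_pdfs(["report1.pdf", "report2.pdf"])  # hypothetical paths
    processed = preprocess_text(texts)
    vectorizer, X = create_vectorizer(processed)

    # Indices of the most similar documents, best match first
    for i in retrieve("quarterly revenue growth", X, vectorizer, top_k=2):
        print(texts[i][:200])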