RakeshUtekar committed on
Commit
459ab69
·
verified ·
1 Parent(s): 50b9efe
Files changed (1) hide show
  1. app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import tempfile
import time

import streamlit as st

from extract import extract_text_from_pdfs
from generate import generate_response
from preprocess import preprocess_text
from retrieve import create_vectorizer, retrieve

# Streamlit UI: upload PDFs, index their text, then answer a free-text query
# using the retrieved passages.
#
# NOTE: Streamlit re-executes this script on every widget interaction, so the
# uploaded PDFs are re-extracted and re-vectorized each time the query changes.
st.title("RAG-based PDF Query System")

uploaded_files = st.file_uploader(
    "Upload PDFs", type=["pdf"], accept_multiple_files=True
)

if uploaded_files:
    st.write("Processing the uploaded PDFs...")

    # Initialize progress bar
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Stage uploads in a private temporary directory instead of the working
    # directory: the file name comes from the client and must not be trusted
    # as a path, and the context manager removes the files even when a later
    # step raises.
    with tempfile.TemporaryDirectory() as upload_dir:
        pdf_files = []
        for index, uploaded_file in enumerate(uploaded_files):
            # basename() drops any directory components; the index prefix
            # keeps two uploads with the same name from overwriting each other.
            safe_name = f"{index}_{os.path.basename(uploaded_file.name)}"
            pdf_path = os.path.join(upload_dir, safe_name)
            with open(pdf_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            pdf_files.append(pdf_path)

        # Extract text from PDFs; this stage fills the first half of the bar
        # so that the later stages (50%, 75%, 100%) never move it backwards.
        num_files = len(pdf_files)
        texts = []
        for i, pdf_file in enumerate(pdf_files):
            status_text.text(f"Extracting text from file {i + 1} of {num_files}...")
            texts.extend(extract_text_from_pdfs([pdf_file]))
            progress_bar.progress(0.5 * (i + 1) / num_files)

        if not texts:
            # Nothing to index (e.g. image-only PDFs); stop before building a
            # vectorizer on an empty corpus.
            status_text.text("No text could be extracted from the uploaded PDFs.")
            st.stop()

        # Preprocess text
        status_text.text("Preprocessing text...")
        progress_bar.progress(0.5)
        processed_texts = preprocess_text(texts)

        # Create vectorizer and transform texts
        status_text.text("Creating vectorizer and transforming texts...")
        progress_bar.progress(0.75)
        vectorizer, X = create_vectorizer(processed_texts)

        # Finalize progress
        progress_bar.progress(1.0)
        status_text.text("Processing complete!")

        query = st.text_input("Enter your query:")

        if query:
            # Retrieve relevant texts.
            # NOTE(review): indexing `texts` with the returned indices assumes
            # preprocess_text() keeps one output per input, in order — confirm.
            top_indices = retrieve(query, X, vectorizer)
            retrieved_texts = [texts[i] for i in top_indices]

            # Generate response
            response = generate_response(retrieved_texts, query)

            st.write("Response:")
            st.write(response)
    # Leaving the `with` block deletes the temporary directory and every
    # staged PDF, replacing the manual os.remove() loop.