RakeshUtekar committed on
Commit
56b0710
·
verified ·
1 Parent(s): 613efdf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -53
app.py CHANGED
@@ -1,71 +1,107 @@
1
  import os
2
  import time
3
 
 
4
  import streamlit as st
 
5
 
6
  from extract import extract_text_from_pdfs
7
  from generate import generate_response
8
  from preprocess import preprocess_text
9
  from retrieve import create_vectorizer, retrieve
10
 
11
- # Streamlit UI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  st.title("RAG-based PDF Query System")
13
 
 
14
  uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
15
 
16
  if uploaded_files:
17
- st.write("Processing the uploaded PDFs...")
18
-
19
- # Initialize progress bar
20
- progress_bar = st.progress(0)
21
- status_text = st.empty()
22
-
23
- # Save uploaded files to disk
24
- pdf_files = []
25
- for uploaded_file in uploaded_files:
26
- with open(uploaded_file.name, "wb") as f:
27
- f.write(uploaded_file.getbuffer())
28
- pdf_files.append(uploaded_file.name)
29
-
30
- # Extract text from PDFs with progress updates
31
- num_files = len(pdf_files)
32
- texts = []
33
- for i, pdf_file in enumerate(pdf_files):
34
- status_text.text(f"Extracting text from file {i + 1} of {num_files}...")
35
- text = extract_text_from_pdfs([pdf_file])
36
- texts.extend(text)
37
- progress_bar.progress((i + 1) / num_files)
38
- time.sleep(0.1) # Simulate time taken for processing
39
-
40
- # Preprocess text with progress updates
41
- status_text.text("Preprocessing text...")
42
- progress_bar.progress(0.5)
43
- processed_texts = preprocess_text(texts)
44
- time.sleep(0.1) # Simulate time taken for processing
45
-
46
- # Create vectorizer and transform texts
47
- status_text.text("Creating vectorizer and transforming texts...")
48
- progress_bar.progress(0.75)
49
- vectorizer, X = create_vectorizer(processed_texts)
50
- time.sleep(0.1) # Simulate time taken for processing
51
-
52
- # Finalize progress
53
- progress_bar.progress(1.0)
54
- status_text.text("Processing complete!")
55
-
56
- query = st.text_input("Enter your query:")
57
-
58
- if query:
59
- # Retrieve relevant texts
60
- top_indices = retrieve(query, X, vectorizer)
61
- retrieved_texts = [texts[i] for i in top_indices]
62
-
63
- # Generate response
64
- response = generate_response(retrieved_texts, query)
65
-
66
- st.write("Response:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  st.write(response)
68
 
69
- # Clean up uploaded files
70
- for pdf_file in pdf_files:
 
71
  os.remove(pdf_file)
 
1
  import os
2
  import time
3
 
4
+ import openai
5
  import streamlit as st
6
+ from dotenv import load_dotenv
7
 
8
  from extract import extract_text_from_pdfs
9
  from generate import generate_response
10
  from preprocess import preprocess_text
11
  from retrieve import create_vectorizer, retrieve
12
 
13
+ # Load environment variables from .env file
14
+ load_dotenv()
15
+
16
+ # Set OpenAI API key
17
+ openai.api_key = os.getenv('OPENAI_API_KEY')
18
+
19
+ # Initialize session state
20
+ if "messages" not in st.session_state:
21
+ st.session_state.messages = []
22
+
23
+ if "pdf_files" not in st.session_state:
24
+ st.session_state.pdf_files = []
25
+
26
+ if "processed_texts" not in st.session_state:
27
+ st.session_state.processed_texts = []
28
+
29
  st.title("RAG-based PDF Query System")
30
 
31
+ # File uploader for PDF files
32
  uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
33
 
34
  if uploaded_files:
35
+ if "uploaded_files" not in st.session_state or uploaded_files != st.session_state.uploaded_files:
36
+ st.session_state.uploaded_files = uploaded_files
37
+ st.session_state.messages = [] # Clear previous messages
38
+ st.session_state.pdf_files = []
39
+ st.session_state.processed_texts = []
40
+
41
+ # Initialize status container
42
+ with st.status("Processing the uploaded PDFs...", state="running") as status:
43
+ # Save uploaded files to disk
44
+ for uploaded_file in uploaded_files:
45
+ with open(uploaded_file.name, "wb") as f:
46
+ f.write(uploaded_file.getbuffer())
47
+ st.session_state.pdf_files.append(uploaded_file.name)
48
+
49
+ # Extract text from PDFs
50
+ num_files = len(st.session_state.pdf_files)
51
+ texts = []
52
+ for i, pdf_file in enumerate(st.session_state.pdf_files):
53
+ st.write(f"Extracting text from file {i + 1} of {num_files}...")
54
+ text = extract_text_from_pdfs([pdf_file])
55
+ texts.extend(text)
56
+ time.sleep(0.1) # Simulate time taken for processing
57
+
58
+ # Preprocess text
59
+ st.write("Preprocessing text...")
60
+ st.session_state.processed_texts = preprocess_text(texts)
61
+ time.sleep(0.1) # Simulate time taken for processing
62
+
63
+ # Create vectorizer and transform texts
64
+ st.write("Creating vectorizer and transforming texts...")
65
+ st.session_state.vectorizer, st.session_state.X = create_vectorizer(st.session_state.processed_texts)
66
+ time.sleep(0.1) # Simulate time taken for processing
67
+
68
+ # Update status to complete
69
+ status.update(label="Processing complete!", state="complete")
70
+
71
+ else:
72
+ st.stop()
73
+
74
+ # Chat interface
75
+ st.write("### Ask a question about the uploaded PDFs")
76
+
77
+ # Display chat messages
78
+ for message in st.session_state.messages:
79
+ with st.chat_message(message["role"]):
80
+ st.write(message["content"])
81
+
82
+ # Chat input
83
+ prompt = st.chat_input("Ask something about the uploaded PDFs")
84
+ if prompt:
85
+ # Add user message to session state
86
+ st.session_state.messages.append({"role": "user", "content": prompt})
87
+
88
+ # Retrieve relevant texts
89
+ top_indices = retrieve(prompt, st.session_state.X, st.session_state.vectorizer)
90
+ retrieved_texts = [" ".join(st.session_state.processed_texts[i]) for i in top_indices]
91
+
92
+ # Generate response
93
+ response = generate_response(retrieved_texts, prompt)
94
+ st.session_state.messages.append({"role": "assistant", "content": response})
95
+
96
+ # Display user message
97
+ with st.chat_message("user"):
98
+ st.write(prompt)
99
+
100
+ # Display assistant message
101
+ with st.chat_message("assistant"):
102
  st.write(response)
103
 
104
+ # Clean up uploaded files
105
+ for pdf_file in st.session_state.pdf_files:
106
+ if os.path.exists(pdf_file):
107
  os.remove(pdf_file)