Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,63 +4,50 @@ import requests
|
|
4 |
import gradio as gr
|
5 |
from huggingface_hub import InferenceClient
|
6 |
from dataclasses import dataclass
|
7 |
-
import speech_recognition as sr
|
8 |
-
import
|
9 |
from PIL import Image
|
10 |
|
11 |
@dataclass
class ChatMessage:
    """A single chat turn: who spoke (``role``) and what was said (``content``)."""

    role: str  # speaker tag, e.g. "system", "user" or "assistant"
    content: str  # message text

    def to_dict(self) -> dict:
        """Return the message as a plain ``{"role": ..., "content": ...}`` dict."""
        return {"role": self.role, "content": self.content}
|
20 |
|
21 |
class XylariaChat:
|
22 |
def __init__(self):
    """Set up API access and empty conversation state.

    Reads the Hugging Face token from the ``HF_TOKEN`` environment
    variable and creates the chat inference client with it.

    Raises:
        ValueError: If ``HF_TOKEN`` is unset or empty.
    """
    self.hf_token = os.getenv("HF_TOKEN")
    if not self.hf_token:
        raise ValueError("HuggingFace token not found in environment variables")

    # Client used for chat completions.
    self.client = InferenceClient(
        model="Qwen/QwQ-32B-Preview",
        api_key=self.hf_token,
    )

    # Endpoint and auth header used when captioning uploaded images.
    self.image_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
    self.image_api_headers = {"Authorization": f"Bearer {self.hf_token}"}

    # Previous turns (role/content dicts) and a free-form key/value store.
    self.conversation_history = []
    self.persistent_memory = {}

    # Sent as the first message of every request.
    self.system_prompt = """You are a helpful and harmless assistant. You are Xylaria developed by Sk Md Saad Amin . You should think step-by-step."""
|
44 |
|
|
|
|
|
45 |
def store_information(self, key, value):
    """Save ``value`` under ``key`` in persistent memory.

    Any value already stored under ``key`` is replaced.

    Returns:
        str: Confirmation text of the form ``"Stored: <key> = <value>"``.
    """
    self.persistent_memory[key] = value
    return f"Stored: {key} = {value}"
|
49 |
|
50 |
def retrieve_information(self, key):
    """Look up ``key`` in persistent memory.

    Returns:
        The stored value, or the string
        ``"No information found for this key."`` when ``key`` is absent.
    """
    return self.persistent_memory.get(key, "No information found for this key.")
|
53 |
|
54 |
def reset_conversation(self):
|
55 |
-
"""
|
56 |
-
Completely reset the conversation history, persistent memory,
|
57 |
-
and clear API-side memory
|
58 |
-
"""
|
59 |
-
# Clear local memory
|
60 |
self.conversation_history = []
|
61 |
self.persistent_memory.clear()
|
62 |
|
63 |
-
# Reinitialize the client (not strictly necessary for the API, but can help with local state)
|
64 |
try:
|
65 |
self.client = InferenceClient(
|
66 |
model="Qwen/QwQ-32B-Preview",
|
@@ -69,39 +56,26 @@ class XylariaChat:
|
|
69 |
except Exception as e:
|
70 |
print(f"Error resetting API client: {e}")
|
71 |
|
72 |
-
return None
|
73 |
|
74 |
def caption_image(self, image):
|
75 |
-
"""
|
76 |
-
Caption an uploaded image using Hugging Face API
|
77 |
-
Args:
|
78 |
-
image (str): Base64 encoded image or file path
|
79 |
-
Returns:
|
80 |
-
str: Image caption or error message
|
81 |
-
"""
|
82 |
try:
|
83 |
-
# If image is a file path, read and encode
|
84 |
if isinstance(image, str) and os.path.isfile(image):
|
85 |
with open(image, "rb") as f:
|
86 |
data = f.read()
|
87 |
-
# If image is already base64 encoded
|
88 |
elif isinstance(image, str):
|
89 |
-
# Remove data URI prefix if present
|
90 |
if image.startswith('data:image'):
|
91 |
image = image.split(',')[1]
|
92 |
data = base64.b64decode(image)
|
93 |
-
# If image is a file-like object (unlikely with Gradio, but good to have)
|
94 |
else:
|
95 |
data = image.read()
|
96 |
|
97 |
-
# Send request to Hugging Face API
|
98 |
response = requests.post(
|
99 |
self.image_api_url,
|
100 |
headers=self.image_api_headers,
|
101 |
data=data
|
102 |
)
|
103 |
|
104 |
-
# Check response
|
105 |
if response.status_code == 200:
|
106 |
caption = response.json()[0].get('generated_text', 'No caption generated')
|
107 |
return caption
|
@@ -112,46 +86,23 @@ class XylariaChat:
|
|
112 |
return f"Error processing image: {str(e)}"
|
113 |
|
114 |
def perform_math_ocr(self, image_path):
    """Extract text from an image file with OCR.

    Args:
        image_path (str): Path to the image file.

    Returns:
        str: The recognised text with surrounding whitespace removed, or a
        message starting with ``"Error during Math OCR:"`` if anything fails.
    """
    try:
        # The context manager closes the image's file handle once OCR is
        # done; the original left it open.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img)
        return text.strip()
    except Exception as e:
        # Report the failure as text rather than raising, so the caller can
        # show it in the chat.
        return f"Error during Math OCR: {e}"
|
134 |
-
|
135 |
def get_response(self, user_input, image=None):
|
136 |
-
"""
|
137 |
-
Generate a response using chat completions with improved error handling
|
138 |
-
Args:
|
139 |
-
user_input (str): User's message
|
140 |
-
image (optional): Uploaded image
|
141 |
-
Returns:
|
142 |
-
Stream of chat completions or error message
|
143 |
-
"""
|
144 |
try:
|
145 |
-
# Prepare messages with conversation context and persistent memory
|
146 |
messages = []
|
147 |
|
148 |
-
# Add system prompt as first message
|
149 |
messages.append(ChatMessage(
|
150 |
role="system",
|
151 |
content=self.system_prompt
|
152 |
).to_dict())
|
153 |
|
154 |
-
# Add persistent memory context if available
|
155 |
if self.persistent_memory:
|
156 |
memory_context = "Remembered Information:\n" + "\n".join(
|
157 |
[f"{k}: {v}" for k, v in self.persistent_memory.items()]
|
@@ -161,29 +112,23 @@ class XylariaChat:
|
|
161 |
content=memory_context
|
162 |
).to_dict())
|
163 |
|
164 |
-
# Convert existing conversation history to ChatMessage objects and then to dictionaries
|
165 |
for msg in self.conversation_history:
|
166 |
messages.append(msg)
|
167 |
|
168 |
-
# Process image if uploaded
|
169 |
if image:
|
170 |
image_caption = self.caption_image(image)
|
171 |
user_input = f"description of an image: {image_caption}\n\nUser's message about it: {user_input}"
|
172 |
|
173 |
-
# Add user input
|
174 |
messages.append(ChatMessage(
|
175 |
role="user",
|
176 |
content=user_input
|
177 |
).to_dict())
|
178 |
|
179 |
-
# Calculate available tokens
|
180 |
input_tokens = sum(len(msg['content'].split()) for msg in messages)
|
181 |
-
max_new_tokens = 16384 - input_tokens - 50
|
182 |
|
183 |
-
# Limit max_new_tokens to prevent exceeding the total limit
|
184 |
max_new_tokens = min(max_new_tokens, 10020)
|
185 |
|
186 |
-
# Generate response with streaming
|
187 |
stream = self.client.chat_completion(
|
188 |
messages=messages,
|
189 |
model="Qwen/QwQ-32B-Preview",
|
@@ -192,20 +137,14 @@ class XylariaChat:
|
|
192 |
top_p=0.9,
|
193 |
stream=True
|
194 |
)
|
195 |
-
|
196 |
return stream
|
197 |
-
|
198 |
except Exception as e:
|
199 |
print(f"Detailed error in get_response: {e}")
|
200 |
return f"Error generating response: {str(e)}"
|
201 |
|
202 |
def messages_to_prompt(self, messages):
|
203 |
-
"""
|
204 |
-
Convert a list of ChatMessage dictionaries to a single prompt string.
|
205 |
-
|
206 |
-
This is a simple implementation and you might need to adjust it
|
207 |
-
based on the specific requirements of the model you are using.
|
208 |
-
"""
|
209 |
prompt = ""
|
210 |
for msg in messages:
|
211 |
if msg["role"] == "system":
|
@@ -214,94 +153,77 @@ class XylariaChat:
|
|
214 |
prompt += f"<|user|>\n{msg['content']}<|end|>\n"
|
215 |
elif msg["role"] == "assistant":
|
216 |
prompt += f"<|assistant|>\n{msg['content']}<|end|>\n"
|
217 |
-
prompt += "<|assistant|>\n"
|
218 |
return prompt
|
219 |
-
|
220 |
def recognize_speech(self, audio_file):
    """Transcribe an audio file to text with Google Speech Recognition.

    Args:
        audio_file (str): Path to the recorded audio file.

    Returns:
        str: The transcription on success. On failure, a message that
        starts with ``"Error"`` so callers can tell it apart from a real
        transcription.
    """
    recognizer = sr.Recognizer()

    try:
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # Every failure message carries the "Error" prefix: the caller checks
        # startswith("Error") and would otherwise send the failure text to
        # the model as if the user had said it.
        return "Error: Could not understand audio"
    except sr.RequestError:
        return "Error: Could not request results from Google Speech Recognition service"
    except (ValueError, OSError) as e:
        # Missing file or an audio format that cannot be read.
        return f"Error reading audio file: {e}"
|
235 |
-
|
236 |
def create_interface(self):
|
237 |
def streaming_response(message, chat_history, image_filepath, math_ocr_image_path, audio_file):
|
238 |
-
|
239 |
-
# Speech Recognition (if audio is uploaded)
|
240 |
if audio_file:
|
241 |
voice_message = self.recognize_speech(audio_file)
|
242 |
if not voice_message.startswith("Error"):
|
243 |
-
message = voice_message
|
244 |
|
245 |
ocr_text = ""
|
246 |
-
# OCR (with output size check)
|
247 |
if math_ocr_image_path:
|
248 |
ocr_text = self.perform_math_ocr(math_ocr_image_path)
|
249 |
if ocr_text.startswith("Error"):
|
250 |
-
updated_history = chat_history + [[message, ocr_text]]
|
251 |
yield "", updated_history, None, None, None
|
252 |
return
|
253 |
-
elif len(ocr_text) > 500:
|
254 |
ocr_text = "OCR output is too large to be processed."
|
255 |
-
updated_history = chat_history + [[message, ocr_text]]
|
256 |
yield "", updated_history, None, None, None
|
257 |
return
|
258 |
else:
|
259 |
message = f"Math OCR Result: {ocr_text}\n\nUser's message: {message}"
|
260 |
|
261 |
-
# Check if an image was actually uploaded
|
262 |
if image_filepath:
|
263 |
response_stream = self.get_response(message, image_filepath)
|
264 |
else:
|
265 |
response_stream = self.get_response(message)
|
266 |
-
|
267 |
|
268 |
-
# Handle errors in get_response
|
269 |
if isinstance(response_stream, str):
|
270 |
-
|
271 |
-
updated_history = chat_history + [[message, response_stream]]
|
272 |
yield "", updated_history, None, None, None
|
273 |
return
|
274 |
|
275 |
-
# Prepare for streaming response
|
276 |
full_response = ""
|
277 |
-
updated_history = chat_history + [[message, ""]]
|
278 |
|
279 |
-
# Streaming output
|
280 |
try:
|
281 |
for chunk in response_stream:
|
282 |
if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
|
283 |
chunk_content = chunk.choices[0].delta.content
|
284 |
full_response += chunk_content
|
285 |
-
|
286 |
-
|
287 |
-
updated_history[-1][1] = full_response
|
288 |
yield "", updated_history, None, None, None
|
289 |
except Exception as e:
|
290 |
print(f"Streaming error: {e}")
|
291 |
-
|
292 |
-
updated_history[-1][1] = f"Error during response: {e}"
|
293 |
yield "", updated_history, None, None, None
|
294 |
return
|
295 |
|
296 |
-
# Update conversation history
|
297 |
self.conversation_history.append(ChatMessage(role="user", content=message).to_dict())
|
298 |
self.conversation_history.append(ChatMessage(role="assistant", content=full_response).to_dict())
|
299 |
|
300 |
-
# Limit conversation history
|
301 |
if len(self.conversation_history) > 10:
|
302 |
self.conversation_history = self.conversation_history[-10:]
|
303 |
|
304 |
-
# Custom CSS for Inter font and improved styling
|
305 |
custom_css = """
|
306 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
|
307 |
body, .gradio-container {
|
@@ -315,7 +237,6 @@ class XylariaChat:
|
|
315 |
.gradio-container button {
|
316 |
font-family: 'Inter', sans-serif !important;
|
317 |
}
|
318 |
-
/* Image Upload Styling */
|
319 |
.image-container {
|
320 |
display: flex;
|
321 |
gap: 10px;
|
@@ -332,11 +253,9 @@ class XylariaChat:
|
|
332 |
max-height: 200px;
|
333 |
border-radius: 8px;
|
334 |
}
|
335 |
-
/* Remove clear image buttons */
|
336 |
.clear-button {
|
337 |
display: none;
|
338 |
}
|
339 |
-
/* Animate chatbot messages */
|
340 |
.chatbot-container .message {
|
341 |
opacity: 0;
|
342 |
animation: fadeIn 0.5s ease-in-out forwards;
|
@@ -351,30 +270,27 @@ class XylariaChat:
|
|
351 |
transform: translateY(0);
|
352 |
}
|
353 |
}
|
354 |
-
/* Accordion Animation */
|
355 |
.gradio-accordion {
|
356 |
overflow: hidden;
|
357 |
-
transition: max-height 0.3s ease-in-out;
|
358 |
-
max-height: 0;
|
359 |
}
|
360 |
-
|
361 |
.gradio-accordion.open {
|
362 |
-
max-height: 500px;
|
363 |
}
|
364 |
"""
|
365 |
|
366 |
with gr.Blocks(theme='soft', css=custom_css) as demo:
|
367 |
-
# Chat interface with improved styling
|
368 |
with gr.Column():
|
369 |
chatbot = gr.Chatbot(
|
370 |
label="Xylaria 1.5 Senoa (EXPERIMENTAL)",
|
371 |
height=500,
|
372 |
show_copy_button=True,
|
|
|
373 |
)
|
374 |
|
375 |
-
# Enhanced Image Upload Section
|
376 |
with gr.Accordion("Image Input", open=False) as accordion:
|
377 |
-
with gr.Row(elem_classes="image-container"):
|
378 |
with gr.Column(elem_classes="image-upload"):
|
379 |
img = gr.Image(
|
380 |
sources=["upload", "webcam"],
|
@@ -389,9 +305,7 @@ class XylariaChat:
|
|
389 |
label="Upload Image for Math OCR",
|
390 |
elem_classes="image-preview"
|
391 |
)
|
392 |
-
# Removed clear buttons as per requirement
|
393 |
|
394 |
-
# Input row with improved layout
|
395 |
with gr.Row():
|
396 |
with gr.Column(scale=4):
|
397 |
txt = gr.Textbox(
|
@@ -401,18 +315,16 @@ class XylariaChat:
|
|
401 |
)
|
402 |
with gr.Column(scale=1):
|
403 |
audio_input = gr.Audio(
|
404 |
-
|
405 |
type="filepath",
|
406 |
label="Voice Input"
|
407 |
)
|
408 |
btn = gr.Button("Send", scale=1)
|
409 |
|
410 |
-
# Clear history and memory buttons
|
411 |
with gr.Row():
|
412 |
clear = gr.Button("Clear Conversation")
|
413 |
clear_memory = gr.Button("Clear Memory")
|
414 |
|
415 |
-
# Submit functionality with streaming and image support
|
416 |
btn.click(
|
417 |
fn=streaming_response,
|
418 |
inputs=[txt, chatbot, img, math_ocr_img, audio_input],
|
@@ -424,7 +336,6 @@ class XylariaChat:
|
|
424 |
outputs=[txt, chatbot, img, math_ocr_img, audio_input]
|
425 |
)
|
426 |
|
427 |
-
# Clear conversation history
|
428 |
clear.click(
|
429 |
fn=lambda: None,
|
430 |
inputs=None,
|
@@ -432,15 +343,13 @@ class XylariaChat:
|
|
432 |
queue=False
|
433 |
)
|
434 |
|
435 |
-
# Clear persistent memory and reset conversation
|
436 |
clear_memory.click(
|
437 |
fn=self.reset_conversation,
|
438 |
inputs=None,
|
439 |
outputs=[chatbot],
|
440 |
queue=False
|
441 |
)
|
442 |
-
|
443 |
-
# Accordion animation JavaScript
|
444 |
demo.load(None, None, None, _js="""
|
445 |
() => {
|
446 |
const accordion = document.querySelector(".gradio-accordion");
|
@@ -455,18 +364,16 @@ class XylariaChat:
|
|
455 |
}
|
456 |
""")
|
457 |
|
458 |
-
# Ensure memory is cleared when the interface is closed
|
459 |
demo.load(self.reset_conversation, None, None)
|
460 |
|
461 |
return demo
|
462 |
|
463 |
-
# Launch the interface
|
464 |
def main():
    """Build the chat application and launch its Gradio interface."""
    chat = XylariaChat()
    interface = chat.create_interface()
    # NOTE(review): share=True asks Gradio for a public link and debug=True
    # keeps the process attached; confirm both are wanted where this runs.
    interface.launch(
        share=True,
        debug=True,
    )
|
471 |
|
472 |
if __name__ == "__main__":
|
|
|
4 |
import gradio as gr
|
5 |
from huggingface_hub import InferenceClient
|
6 |
from dataclasses import dataclass
|
7 |
+
import speech_recognition as sr
|
8 |
+
import easyocr
|
9 |
from PIL import Image
|
10 |
|
11 |
@dataclass
class ChatMessage:
    """A single chat turn: who spoke (``role``) and what was said (``content``)."""

    role: str  # speaker tag, e.g. "system", "user" or "assistant"
    content: str  # message text

    def to_dict(self) -> dict:
        """Return the message as a plain ``{"role": ..., "content": ...}`` dict."""
        return {"role": self.role, "content": self.content}
|
18 |
|
19 |
class XylariaChat:
|
20 |
def __init__(self):
    """Set up API access, the OCR reader and empty conversation state.

    Reads the Hugging Face token from the ``HF_TOKEN`` environment
    variable and creates the chat inference client with it.

    Raises:
        ValueError: If ``HF_TOKEN`` is unset or empty.
    """
    self.hf_token = os.getenv("HF_TOKEN")
    if not self.hf_token:
        raise ValueError("HuggingFace token not found in environment variables")

    # Client used for chat completions.
    self.client = InferenceClient(
        model="Qwen/QwQ-32B-Preview",
        api_key=self.hf_token,
    )

    # Endpoint and auth header used when captioning uploaded images.
    self.image_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
    self.image_api_headers = {"Authorization": f"Bearer {self.hf_token}"}

    # Previous turns (role/content dicts) and a free-form key/value store.
    self.conversation_history = []
    self.persistent_memory = {}

    # Sent as the first message of every request.
    self.system_prompt = """You are a helpful and harmless assistant. You are Xylaria developed by Sk Md Saad Amin . You should think step-by-step."""

    # OCR reader built once and reused by perform_math_ocr.
    # NOTE(review): 'ch_sim' and 'en' are presumably Simplified Chinese and
    # English; confirm against the EasyOCR language list.
    self.reader = easyocr.Reader(['ch_sim','en'])
|
39 |
+
|
40 |
def store_information(self, key, value):
    """Save ``value`` under ``key`` in persistent memory.

    Any value already stored under ``key`` is replaced.

    Returns:
        str: Confirmation text of the form ``"Stored: <key> = <value>"``.
    """
    self.persistent_memory[key] = value
    return f"Stored: {key} = {value}"
|
43 |
|
44 |
def retrieve_information(self, key):
    """Look up ``key`` in persistent memory.

    Returns:
        The stored value, or the string
        ``"No information found for this key."`` when ``key`` is absent.
    """
    return self.persistent_memory.get(key, "No information found for this key.")
|
46 |
|
47 |
def reset_conversation(self):
|
|
|
|
|
|
|
|
|
|
|
48 |
self.conversation_history = []
|
49 |
self.persistent_memory.clear()
|
50 |
|
|
|
51 |
try:
|
52 |
self.client = InferenceClient(
|
53 |
model="Qwen/QwQ-32B-Preview",
|
|
|
56 |
except Exception as e:
|
57 |
print(f"Error resetting API client: {e}")
|
58 |
|
59 |
+
return None
|
60 |
|
61 |
def caption_image(self, image):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
try:
|
|
|
63 |
if isinstance(image, str) and os.path.isfile(image):
|
64 |
with open(image, "rb") as f:
|
65 |
data = f.read()
|
|
|
66 |
elif isinstance(image, str):
|
|
|
67 |
if image.startswith('data:image'):
|
68 |
image = image.split(',')[1]
|
69 |
data = base64.b64decode(image)
|
|
|
70 |
else:
|
71 |
data = image.read()
|
72 |
|
|
|
73 |
response = requests.post(
|
74 |
self.image_api_url,
|
75 |
headers=self.image_api_headers,
|
76 |
data=data
|
77 |
)
|
78 |
|
|
|
79 |
if response.status_code == 200:
|
80 |
caption = response.json()[0].get('generated_text', 'No caption generated')
|
81 |
return caption
|
|
|
86 |
return f"Error processing image: {str(e)}"
|
87 |
|
88 |
def perform_math_ocr(self, image_path):
    """Extract text from an image file with the EasyOCR reader.

    Args:
        image_path (str): Path to the image file.

    Returns:
        str: The recognised text fragments joined by single spaces, with
        surrounding whitespace removed, or a message starting with
        ``"Error during Math OCR:"`` if anything fails.
    """
    try:
        # Opening the file is kept only as an "is this an image?" check.
        # The reader works from the path, so the handle is closed at once
        # instead of being left open as before.
        with Image.open(image_path):
            pass

        result = self.reader.readtext(image_path)
        # Each result item holds the recognised text at index 1.
        text = ' '.join(item[1] for item in result)
        return text.strip()
    except Exception as e:
        # Report the failure as text rather than raising, so the caller can
        # show it in the chat.
        return f"Error during Math OCR: {e}"
|
96 |
+
|
97 |
def get_response(self, user_input, image=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
try:
|
|
|
99 |
messages = []
|
100 |
|
|
|
101 |
messages.append(ChatMessage(
|
102 |
role="system",
|
103 |
content=self.system_prompt
|
104 |
).to_dict())
|
105 |
|
|
|
106 |
if self.persistent_memory:
|
107 |
memory_context = "Remembered Information:\n" + "\n".join(
|
108 |
[f"{k}: {v}" for k, v in self.persistent_memory.items()]
|
|
|
112 |
content=memory_context
|
113 |
).to_dict())
|
114 |
|
|
|
115 |
for msg in self.conversation_history:
|
116 |
messages.append(msg)
|
117 |
|
|
|
118 |
if image:
|
119 |
image_caption = self.caption_image(image)
|
120 |
user_input = f"description of an image: {image_caption}\n\nUser's message about it: {user_input}"
|
121 |
|
|
|
122 |
messages.append(ChatMessage(
|
123 |
role="user",
|
124 |
content=user_input
|
125 |
).to_dict())
|
126 |
|
|
|
127 |
input_tokens = sum(len(msg['content'].split()) for msg in messages)
|
128 |
+
max_new_tokens = 16384 - input_tokens - 50
|
129 |
|
|
|
130 |
max_new_tokens = min(max_new_tokens, 10020)
|
131 |
|
|
|
132 |
stream = self.client.chat_completion(
|
133 |
messages=messages,
|
134 |
model="Qwen/QwQ-32B-Preview",
|
|
|
137 |
top_p=0.9,
|
138 |
stream=True
|
139 |
)
|
140 |
+
|
141 |
return stream
|
142 |
+
|
143 |
except Exception as e:
|
144 |
print(f"Detailed error in get_response: {e}")
|
145 |
return f"Error generating response: {str(e)}"
|
146 |
|
147 |
def messages_to_prompt(self, messages):
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
prompt = ""
|
149 |
for msg in messages:
|
150 |
if msg["role"] == "system":
|
|
|
153 |
prompt += f"<|user|>\n{msg['content']}<|end|>\n"
|
154 |
elif msg["role"] == "assistant":
|
155 |
prompt += f"<|assistant|>\n{msg['content']}<|end|>\n"
|
156 |
+
prompt += "<|assistant|>\n"
|
157 |
return prompt
|
158 |
+
|
159 |
def recognize_speech(self, audio_file):
    """Transcribe an audio file to text with Google Speech Recognition.

    Args:
        audio_file (str): Path to the recorded audio file.

    Returns:
        str: The transcription on success. On failure, a message that
        starts with ``"Error"`` so callers can tell it apart from a real
        transcription.
    """
    recognizer = sr.Recognizer()

    try:
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # Every failure message carries the "Error" prefix: the caller checks
        # startswith("Error") and would otherwise send the failure text to
        # the model as if the user had said it.
        return "Error: Could not understand audio"
    except sr.RequestError:
        return "Error: Could not request results from Google Speech Recognition service"
    except (ValueError, OSError) as e:
        # Missing file or an audio format that cannot be read.
        return f"Error reading audio file: {e}"
|
171 |
+
|
172 |
def create_interface(self):
|
173 |
def streaming_response(message, chat_history, image_filepath, math_ocr_image_path, audio_file):
|
|
|
|
|
174 |
if audio_file:
|
175 |
voice_message = self.recognize_speech(audio_file)
|
176 |
if not voice_message.startswith("Error"):
|
177 |
+
message = voice_message
|
178 |
|
179 |
ocr_text = ""
|
|
|
180 |
if math_ocr_image_path:
|
181 |
ocr_text = self.perform_math_ocr(math_ocr_image_path)
|
182 |
if ocr_text.startswith("Error"):
|
183 |
+
updated_history = chat_history + [[{"role": "user", "content": message}, {"role": "assistant", "content": ocr_text}]]
|
184 |
yield "", updated_history, None, None, None
|
185 |
return
|
186 |
+
elif len(ocr_text) > 500:
|
187 |
ocr_text = "OCR output is too large to be processed."
|
188 |
+
updated_history = chat_history + [[{"role": "user", "content": message}, {"role": "assistant", "content": ocr_text}]]
|
189 |
yield "", updated_history, None, None, None
|
190 |
return
|
191 |
else:
|
192 |
message = f"Math OCR Result: {ocr_text}\n\nUser's message: {message}"
|
193 |
|
|
|
194 |
if image_filepath:
|
195 |
response_stream = self.get_response(message, image_filepath)
|
196 |
else:
|
197 |
response_stream = self.get_response(message)
|
|
|
198 |
|
|
|
199 |
if isinstance(response_stream, str):
|
200 |
+
updated_history = chat_history + [[{"role": "user", "content": message}, {"role": "assistant", "content": response_stream}]]
|
|
|
201 |
yield "", updated_history, None, None, None
|
202 |
return
|
203 |
|
|
|
204 |
full_response = ""
|
205 |
+
updated_history = chat_history + [[{"role": "user", "content": message}, {"role": "assistant", "content": ""}]]
|
206 |
|
|
|
207 |
try:
|
208 |
for chunk in response_stream:
|
209 |
if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
|
210 |
chunk_content = chunk.choices[0].delta.content
|
211 |
full_response += chunk_content
|
212 |
+
|
213 |
+
updated_history[-1][1]["content"] = full_response
|
|
|
214 |
yield "", updated_history, None, None, None
|
215 |
except Exception as e:
|
216 |
print(f"Streaming error: {e}")
|
217 |
+
updated_history[-1][1]["content"] = f"Error during response: {e}"
|
|
|
218 |
yield "", updated_history, None, None, None
|
219 |
return
|
220 |
|
|
|
221 |
self.conversation_history.append(ChatMessage(role="user", content=message).to_dict())
|
222 |
self.conversation_history.append(ChatMessage(role="assistant", content=full_response).to_dict())
|
223 |
|
|
|
224 |
if len(self.conversation_history) > 10:
|
225 |
self.conversation_history = self.conversation_history[-10:]
|
226 |
|
|
|
227 |
custom_css = """
|
228 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
|
229 |
body, .gradio-container {
|
|
|
237 |
.gradio-container button {
|
238 |
font-family: 'Inter', sans-serif !important;
|
239 |
}
|
|
|
240 |
.image-container {
|
241 |
display: flex;
|
242 |
gap: 10px;
|
|
|
253 |
max-height: 200px;
|
254 |
border-radius: 8px;
|
255 |
}
|
|
|
256 |
.clear-button {
|
257 |
display: none;
|
258 |
}
|
|
|
259 |
.chatbot-container .message {
|
260 |
opacity: 0;
|
261 |
animation: fadeIn 0.5s ease-in-out forwards;
|
|
|
270 |
transform: translateY(0);
|
271 |
}
|
272 |
}
|
|
|
273 |
.gradio-accordion {
|
274 |
overflow: hidden;
|
275 |
+
transition: max-height 0.3s ease-in-out;
|
276 |
+
max-height: 0;
|
277 |
}
|
|
|
278 |
.gradio-accordion.open {
|
279 |
+
max-height: 500px;
|
280 |
}
|
281 |
"""
|
282 |
|
283 |
with gr.Blocks(theme='soft', css=custom_css) as demo:
|
|
|
284 |
with gr.Column():
|
285 |
chatbot = gr.Chatbot(
|
286 |
label="Xylaria 1.5 Senoa (EXPERIMENTAL)",
|
287 |
height=500,
|
288 |
show_copy_button=True,
|
289 |
+
type='messages'
|
290 |
)
|
291 |
|
|
|
292 |
with gr.Accordion("Image Input", open=False) as accordion:
|
293 |
+
with gr.Row(elem_classes="image-container"):
|
294 |
with gr.Column(elem_classes="image-upload"):
|
295 |
img = gr.Image(
|
296 |
sources=["upload", "webcam"],
|
|
|
305 |
label="Upload Image for Math OCR",
|
306 |
elem_classes="image-preview"
|
307 |
)
|
|
|
308 |
|
|
|
309 |
with gr.Row():
|
310 |
with gr.Column(scale=4):
|
311 |
txt = gr.Textbox(
|
|
|
315 |
)
|
316 |
with gr.Column(scale=1):
|
317 |
audio_input = gr.Audio(
|
318 |
+
sources=["microphone"],
|
319 |
type="filepath",
|
320 |
label="Voice Input"
|
321 |
)
|
322 |
btn = gr.Button("Send", scale=1)
|
323 |
|
|
|
324 |
with gr.Row():
|
325 |
clear = gr.Button("Clear Conversation")
|
326 |
clear_memory = gr.Button("Clear Memory")
|
327 |
|
|
|
328 |
btn.click(
|
329 |
fn=streaming_response,
|
330 |
inputs=[txt, chatbot, img, math_ocr_img, audio_input],
|
|
|
336 |
outputs=[txt, chatbot, img, math_ocr_img, audio_input]
|
337 |
)
|
338 |
|
|
|
339 |
clear.click(
|
340 |
fn=lambda: None,
|
341 |
inputs=None,
|
|
|
343 |
queue=False
|
344 |
)
|
345 |
|
|
|
346 |
clear_memory.click(
|
347 |
fn=self.reset_conversation,
|
348 |
inputs=None,
|
349 |
outputs=[chatbot],
|
350 |
queue=False
|
351 |
)
|
352 |
+
|
|
|
353 |
demo.load(None, None, None, _js="""
|
354 |
() => {
|
355 |
const accordion = document.querySelector(".gradio-accordion");
|
|
|
364 |
}
|
365 |
""")
|
366 |
|
|
|
367 |
demo.load(self.reset_conversation, None, None)
|
368 |
|
369 |
return demo
|
370 |
|
|
|
371 |
def main():
    """Build the chat application and launch its Gradio interface."""
    chat = XylariaChat()
    interface = chat.create_interface()
    # NOTE(review): share=True asks Gradio for a public link and debug=True
    # keeps the process attached; confirm both are wanted where this runs.
    interface.launch(
        share=True,
        debug=True,
    )
|
378 |
|
379 |
if __name__ == "__main__":
|