Spaces:

Lap1official
/

API

Running

App Files Files Community

Reality123b committed on Dec 20, 2024

Commit

e69c140

verified ·

1 Parent(s): c3dfd41

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -127

app.py CHANGED Viewed

@@ -4,63 +4,50 @@ import requests
 import gradio as gr
 from huggingface_hub import InferenceClient
 from dataclasses import dataclass
-import speech_recognition as sr  # Import speech_recognition
-import pytesseract
 from PIL import Image
 @dataclass
 class ChatMessage:
-    """Custom ChatMessage class since huggingface_hub doesn't provide one"""
     role: str
     content: str
     def to_dict(self):
-        """Converts ChatMessage to a dictionary for JSON serialization."""
         return {"role": self.role, "content": self.content}
 class XylariaChat:
     def __init__(self):
-        # Securely load HuggingFace token
         self.hf_token = os.getenv("HF_TOKEN")
         if not self.hf_token:
             raise ValueError("HuggingFace token not found in environment variables")
-        # Initialize the inference client with the Qwen model
         self.client = InferenceClient(
-            model="Qwen/QwQ-32B-Preview",  # Using the specified model
             api_key=self.hf_token
         )
-        # Image captioning API setup
         self.image_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
         self.image_api_headers = {"Authorization": f"Bearer {self.hf_token}"}
-        # Initialize conversation history and persistent memory
         self.conversation_history = []
         self.persistent_memory = {}
-        # System prompt with more detailed instructions
         self.system_prompt = """You are a helpful and harmless assistant. You are Xylaria developed by Sk Md Saad Amin . You should think step-by-step."""
     def store_information(self, key, value):
-        """Store important information in persistent memory"""
         self.persistent_memory[key] = value
         return f"Stored: {key} = {value}"
     def retrieve_information(self, key):
-        """Retrieve information from persistent memory"""
         return self.persistent_memory.get(key, "No information found for this key.")
     def reset_conversation(self):
-        """
-        Completely reset the conversation history, persistent memory,
-        and clear API-side memory
-        """
-        # Clear local memory
         self.conversation_history = []
         self.persistent_memory.clear()
-        # Reinitialize the client (not strictly necessary for the API, but can help with local state)
         try:
             self.client = InferenceClient(
                 model="Qwen/QwQ-32B-Preview",
@@ -69,39 +56,26 @@ class XylariaChat:
         except Exception as e:
             print(f"Error resetting API client: {e}")
-        return None  # To clear the chatbot interface
     def caption_image(self, image):
-        """
-        Caption an uploaded image using Hugging Face API
-        Args:
-            image (str): Base64 encoded image or file path
-        Returns:
-            str: Image caption or error message
-        """
         try:
-            # If image is a file path, read and encode
             if isinstance(image, str) and os.path.isfile(image):
                 with open(image, "rb") as f:
                     data = f.read()
-            # If image is already base64 encoded
             elif isinstance(image, str):
-                # Remove data URI prefix if present
                 if image.startswith('data:image'):
                     image = image.split(',')[1]
                 data = base64.b64decode(image)
-            # If image is a file-like object (unlikely with Gradio, but good to have)
             else:
                 data = image.read()
-            # Send request to Hugging Face API
             response = requests.post(
                 self.image_api_url,
                 headers=self.image_api_headers,
                 data=data
             )
-            # Check response
             if response.status_code == 200:
                 caption = response.json()[0].get('generated_text', 'No caption generated')
                 return caption
@@ -112,46 +86,23 @@ class XylariaChat:
             return f"Error processing image: {str(e)}"
     def perform_math_ocr(self, image_path):
-        """
-        Perform OCR on an image and return the extracted text.
-        Args:
-            image_path (str): Path to the image file.
-        Returns:
-            str: Extracted text from the image, or an error message.
-        """
         try:
-            # Open the image using Pillow library
             img = Image.open(image_path)
-            # Use Tesseract to do OCR on the image
-            text = pytesseract.image_to_string(img)
-            # Remove leading/trailing whitespace and return
             return text.strip()
         except Exception as e:
             return f"Error during Math OCR: {e}"
     def get_response(self, user_input, image=None):
-        """
-        Generate a response using chat completions with improved error handling
-        Args:
-            user_input (str): User's message
-            image (optional): Uploaded image
-        Returns:
-            Stream of chat completions or error message
-        """
         try:
-            # Prepare messages with conversation context and persistent memory
             messages = []
-            # Add system prompt as first message
             messages.append(ChatMessage(
                 role="system",
                 content=self.system_prompt
             ).to_dict())
-            # Add persistent memory context if available
             if self.persistent_memory:
                 memory_context = "Remembered Information:\n" + "\n".join(
                     [f"{k}: {v}" for k, v in self.persistent_memory.items()]
@@ -161,29 +112,23 @@ class XylariaChat:
                     content=memory_context
                 ).to_dict())
-            # Convert existing conversation history to ChatMessage objects and then to dictionaries
             for msg in self.conversation_history:
                 messages.append(msg)
-            # Process image if uploaded
             if image:
                 image_caption = self.caption_image(image)
                 user_input = f"description of an image: {image_caption}\n\nUser's message about it: {user_input}"
-            # Add user input
             messages.append(ChatMessage(
                 role="user",
                 content=user_input
             ).to_dict())
-            # Calculate available tokens
             input_tokens = sum(len(msg['content'].split()) for msg in messages)
-            max_new_tokens = 16384 - input_tokens - 50 # Reserve some tokens for safety
-            # Limit max_new_tokens to prevent exceeding the total limit
             max_new_tokens = min(max_new_tokens, 10020)
-            # Generate response with streaming
             stream = self.client.chat_completion(
                 messages=messages,
                 model="Qwen/QwQ-32B-Preview",
@@ -192,20 +137,14 @@ class XylariaChat:
                 top_p=0.9,
                 stream=True
             )
             return stream
         except Exception as e:
             print(f"Detailed error in get_response: {e}")
             return f"Error generating response: {str(e)}"
     def messages_to_prompt(self, messages):
-        """
-        Convert a list of ChatMessage dictionaries to a single prompt string.
-        This is a simple implementation and you might need to adjust it
-        based on the specific requirements of the model you are using.
-        """
         prompt = ""
         for msg in messages:
             if msg["role"] == "system":
@@ -214,94 +153,77 @@ class XylariaChat:
                 prompt += f"<|user|>\n{msg['content']}<|end|>\n"
             elif msg["role"] == "assistant":
                 prompt += f"<|assistant|>\n{msg['content']}<|end|>\n"
-        prompt += "<|assistant|>\n"  # Start of assistant's turn
         return prompt
     def recognize_speech(self, audio_file):
-        """
-        Transcribes audio to text using speech_recognition library.
-        """
         recognizer = sr.Recognizer()
         try:
             with sr.AudioFile(audio_file) as source:
                 audio_data = recognizer.record(source)
-                text = recognizer.recognize_google(audio_data)  # Using Google Web Speech API
                 return text
         except sr.UnknownValueError:
             return "Could not understand audio"
         except sr.RequestError:
             return "Could not request results from Google Speech Recognition service"
     def create_interface(self):
         def streaming_response(message, chat_history, image_filepath, math_ocr_image_path, audio_file):
-            # Speech Recognition (if audio is uploaded)
             if audio_file:
                 voice_message = self.recognize_speech(audio_file)
                 if not voice_message.startswith("Error"):
-                    message = voice_message  # Use transcribed text as the message
             ocr_text = ""
-            # OCR (with output size check)
             if math_ocr_image_path:
                 ocr_text = self.perform_math_ocr(math_ocr_image_path)
                 if ocr_text.startswith("Error"):
-                    updated_history = chat_history + [[message, ocr_text]]
                     yield "", updated_history, None, None, None
                     return
-                elif len(ocr_text) > 500:  # Check if OCR output is too large
                     ocr_text = "OCR output is too large to be processed."
-                    updated_history = chat_history + [[message, ocr_text]]
                     yield "", updated_history, None, None, None
                     return
                 else:
                     message = f"Math OCR Result: {ocr_text}\n\nUser's message: {message}"
-            # Check if an image was actually uploaded
             if image_filepath:
                 response_stream = self.get_response(message, image_filepath)
             else:
                 response_stream = self.get_response(message)
-            # Handle errors in get_response
             if isinstance(response_stream, str):
-                # Return immediately with the error message
-                updated_history = chat_history + [[message, response_stream]]
                 yield "", updated_history, None, None, None
                 return
-            # Prepare for streaming response
             full_response = ""
-            updated_history = chat_history + [[message, ""]]
-            # Streaming output
             try:
                 for chunk in response_stream:
                     if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
                         chunk_content = chunk.choices[0].delta.content
                         full_response += chunk_content
-                        # Update the last message in chat history with partial response
-                        updated_history[-1][1] = full_response
                         yield "", updated_history, None, None, None
             except Exception as e:
                 print(f"Streaming error: {e}")
-                # Display error in the chat interface
-                updated_history[-1][1] = f"Error during response: {e}"
                 yield "", updated_history, None, None, None
                 return
-            # Update conversation history
             self.conversation_history.append(ChatMessage(role="user", content=message).to_dict())
             self.conversation_history.append(ChatMessage(role="assistant", content=full_response).to_dict())
-            # Limit conversation history
             if len(self.conversation_history) > 10:
                 self.conversation_history = self.conversation_history[-10:]
-        # Custom CSS for Inter font and improved styling
         custom_css = """
         @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
         body, .gradio-container {
@@ -315,7 +237,6 @@ class XylariaChat:
         .gradio-container button {
             font-family: 'Inter', sans-serif !important;
         }
-        /* Image Upload Styling */
         .image-container {
             display: flex;
             gap: 10px;
@@ -332,11 +253,9 @@ class XylariaChat:
             max-height: 200px;
             border-radius: 8px;
         }
-        /* Remove clear image buttons */
         .clear-button {
             display: none;
         }
-        /* Animate chatbot messages */
         .chatbot-container .message {
             opacity: 0;
             animation: fadeIn 0.5s ease-in-out forwards;
@@ -351,30 +270,27 @@ class XylariaChat:
                 transform: translateY(0);
             }
         }
-        /* Accordion Animation */
         .gradio-accordion {
             overflow: hidden;
-            transition: max-height 0.3s ease-in-out; /* Adjust duration as needed */
-            max-height: 0; /* Initially collapsed */
         }
         .gradio-accordion.open {
-            max-height: 500px; /* Adjust to expected max height of content */
         }
         """
         with gr.Blocks(theme='soft', css=custom_css) as demo:
-            # Chat interface with improved styling
             with gr.Column():
                 chatbot = gr.Chatbot(
                     label="Xylaria 1.5 Senoa (EXPERIMENTAL)",
                     height=500,
                     show_copy_button=True,
                 )
-                # Enhanced Image Upload Section
                 with gr.Accordion("Image Input", open=False) as accordion:
-                    with gr.Row(elem_classes="image-container"):  # Use a Row for side-by-side layout
                         with gr.Column(elem_classes="image-upload"):
                             img = gr.Image(
                                 sources=["upload", "webcam"],
@@ -389,9 +305,7 @@ class XylariaChat:
                                 label="Upload Image for Math OCR",
                                 elem_classes="image-preview"
                             )
-                        # Removed clear buttons as per requirement
-                # Input row with improved layout
                 with gr.Row():
                     with gr.Column(scale=4):
                         txt = gr.Textbox(
@@ -401,18 +315,16 @@ class XylariaChat:
                         )
                     with gr.Column(scale=1):
                         audio_input = gr.Audio(
-                            source="microphone",
                             type="filepath",
                             label="Voice Input"
                         )
                     btn = gr.Button("Send", scale=1)
-                # Clear history and memory buttons
                 with gr.Row():
                     clear = gr.Button("Clear Conversation")
                     clear_memory = gr.Button("Clear Memory")
-                # Submit functionality with streaming and image support
                 btn.click(
                     fn=streaming_response,
                     inputs=[txt, chatbot, img, math_ocr_img, audio_input],
@@ -424,7 +336,6 @@ class XylariaChat:
                     outputs=[txt, chatbot, img, math_ocr_img, audio_input]
                 )
-                # Clear conversation history
                 clear.click(
                     fn=lambda: None,
                     inputs=None,
@@ -432,15 +343,13 @@ class XylariaChat:
                     queue=False
                 )
-                # Clear persistent memory and reset conversation
                 clear_memory.click(
                     fn=self.reset_conversation,
                     inputs=None,
                     outputs=[chatbot],
                     queue=False
                 )
-                # Accordion animation JavaScript
                 demo.load(None, None, None, _js="""
                 () => {
                     const accordion = document.querySelector(".gradio-accordion");
@@ -455,18 +364,16 @@ class XylariaChat:
                 }
                 """)
-                # Ensure memory is cleared when the interface is closed
                 demo.load(self.reset_conversation, None, None)
         return demo
-# Launch the interface
 def main():
     chat = XylariaChat()
     interface = chat.create_interface()
     interface.launch(
-        share=True,  # Optional: create a public link
-        debug=True   # Show detailed errors
     )
 if __name__ == "__main__":

 import gradio as gr
 from huggingface_hub import InferenceClient
 from dataclasses import dataclass
+import speech_recognition as sr
+import easyocr
 from PIL import Image
 @dataclass
 class ChatMessage:
     """A single chat turn: the speaker's ``role`` and the text ``content``."""
     role: str
     content: str
     def to_dict(self):
         """Return this message as a plain dict with ``role`` and ``content`` keys."""
         payload = {}
         payload["role"] = self.role
         payload["content"] = self.content
         return payload
 class XylariaChat:
     def __init__(self):
         self.hf_token = os.getenv("HF_TOKEN")
         if not self.hf_token:
             raise ValueError("HuggingFace token not found in environment variables")
         self.client = InferenceClient(
+            model="Qwen/QwQ-32B-Preview",
             api_key=self.hf_token
         )
         self.image_api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
         self.image_api_headers = {"Authorization": f"Bearer {self.hf_token}"}
         self.conversation_history = []
         self.persistent_memory = {}
         self.system_prompt = """You are a helpful and harmless assistant. You are Xylaria developed by Sk Md Saad Amin . You should think step-by-step."""
+        self.reader = easyocr.Reader(['ch_sim','en'])
     def store_information(self, key, value):
         """Save ``value`` under ``key`` in persistent memory and return a confirmation string."""
         memory = self.persistent_memory
         memory[key] = value
         confirmation = f"Stored: {key} = {value}"
         return confirmation
     def retrieve_information(self, key):
         """Return the value remembered for ``key``, or a fixed "not found" message."""
         try:
             return self.persistent_memory[key]
         except KeyError:
             return "No information found for this key."
     def reset_conversation(self):
         self.conversation_history = []
         self.persistent_memory.clear()
         try:
             self.client = InferenceClient(
                 model="Qwen/QwQ-32B-Preview",
         except Exception as e:
             print(f"Error resetting API client: {e}")
+        return None
     def caption_image(self, image):
         try:
             if isinstance(image, str) and os.path.isfile(image):
                 with open(image, "rb") as f:
                     data = f.read()
             elif isinstance(image, str):
                 if image.startswith('data:image'):
                     image = image.split(',')[1]
                 data = base64.b64decode(image)
             else:
                 data = image.read()
             response = requests.post(
                 self.image_api_url,
                 headers=self.image_api_headers,
                 data=data
             )
             if response.status_code == 200:
                 caption = response.json()[0].get('generated_text', 'No caption generated')
                 return caption
             return f"Error processing image: {str(e)}"
     def perform_math_ocr(self, image_path):
         """Extract text from the image at ``image_path`` using ``self.reader``.

         Args:
             image_path: Path to the image file to read.

         Returns:
             The text of every OCR result joined by single spaces and stripped,
             or a string starting with ``"Error during Math OCR: "`` if anything
             raised along the way.
         """
         try:
             # Opening the file confirms Pillow can identify it as an image. The
             # context manager closes the handle, which was previously opened
             # and never released.
             with Image.open(image_path):
                 pass
             result = self.reader.readtext(image_path)
             # item[1] is taken as the recognised text of each result entry.
             return ' '.join(item[1] for item in result).strip()
         except Exception as e:
             return f"Error during Math OCR: {e}"
     def get_response(self, user_input, image=None):
         try:
             messages = []
             messages.append(ChatMessage(
                 role="system",
                 content=self.system_prompt
             ).to_dict())
             if self.persistent_memory:
                 memory_context = "Remembered Information:\n" + "\n".join(
                     [f"{k}: {v}" for k, v in self.persistent_memory.items()]
                     content=memory_context
                 ).to_dict())
             for msg in self.conversation_history:
                 messages.append(msg)
             if image:
                 image_caption = self.caption_image(image)
                 user_input = f"description of an image: {image_caption}\n\nUser's message about it: {user_input}"
             messages.append(ChatMessage(
                 role="user",
                 content=user_input
             ).to_dict())
             input_tokens = sum(len(msg['content'].split()) for msg in messages)
+            max_new_tokens = 16384 - input_tokens - 50
             max_new_tokens = min(max_new_tokens, 10020)
             stream = self.client.chat_completion(
                 messages=messages,
                 model="Qwen/QwQ-32B-Preview",
                 top_p=0.9,
                 stream=True
             )
             return stream
         except Exception as e:
             print(f"Detailed error in get_response: {e}")
             return f"Error generating response: {str(e)}"
     def messages_to_prompt(self, messages):
         prompt = ""
         for msg in messages:
             if msg["role"] == "system":
                 prompt += f"<|user|>\n{msg['content']}<|end|>\n"
             elif msg["role"] == "assistant":
                 prompt += f"<|assistant|>\n{msg['content']}<|end|>\n"
+        prompt += "<|assistant|>\n"
         return prompt
     def recognize_speech(self, audio_file):
         """Transcribe an audio file to text with ``recognize_google``.

         Args:
             audio_file: Path to the recorded audio (the UI passes a filepath).

         Returns:
             The transcription on success. On failure, a message that begins with
             ``"Error"`` so the caller's ``startswith("Error")`` check can tell a
             failure apart from genuine speech; previously the failure text was
             forwarded to the model as if the user had said it.
         """
         recognizer = sr.Recognizer()
         try:
             with sr.AudioFile(audio_file) as source:
                 audio_data = recognizer.record(source)
             # The audio is fully buffered, so the file can be closed before
             # the network call.
             return recognizer.recognize_google(audio_data)
         except sr.UnknownValueError:
             return "Error: Could not understand audio"
         except sr.RequestError:
             return "Error: Could not request results from Google Speech Recognition service"
         except (ValueError, OSError) as e:
             # Unreadable, missing or unsupported audio files (per the
             # SpeechRecognition docs) should not crash the UI handler.
             return f"Error: Could not read audio file: {e}"
     def create_interface(self):
         def streaming_response(message, chat_history, image_filepath, math_ocr_image_path, audio_file):
             """Handle one "Send" click and stream the assistant's reply into the chat.

             The chatbot is created with ``type='messages'``, so its history is a
             flat list of ``{"role": ..., "content": ...}`` dicts. Each exchange
             therefore appends one user dict and one assistant dict; the earlier
             nested ``[[user, assistant]]`` pair is not a valid messages history.

             Yields:
                 ``("", history, None, None, None)`` tuples for the five outputs
                 (textbox, chatbot, image, math-OCR image, audio): the inputs are
                 cleared and the chatbot shows the latest history.
             """
             # Failure texts recognize_speech may return without an "Error" prefix.
             speech_failures = (
                 "Could not understand audio",
                 "Could not request results from Google Speech Recognition service",
             )
             def turn(user_text, reply):
                 # Build a new history list so the caller's list is never mutated.
                 return list(chat_history or []) + [
                     {"role": "user", "content": user_text},
                     {"role": "assistant", "content": reply},
                 ]
             if audio_file:
                 voice_message = self.recognize_speech(audio_file)
                 # Only a real transcription replaces the typed message; a
                 # recogniser failure must not be sent to the model as user input.
                 if not voice_message.startswith("Error") and voice_message not in speech_failures:
                     message = voice_message
             if math_ocr_image_path:
                 ocr_text = self.perform_math_ocr(math_ocr_image_path)
                 if ocr_text.startswith("Error"):
                     yield "", turn(message, ocr_text), None, None, None
                     return
                 if len(ocr_text) > 500:
                     yield "", turn(message, "OCR output is too large to be processed."), None, None, None
                     return
                 message = f"Math OCR Result: {ocr_text}\n\nUser's message: {message}"
             if image_filepath:
                 response_stream = self.get_response(message, image_filepath)
             else:
                 response_stream = self.get_response(message)
             # get_response returns a plain error string instead of a stream on failure.
             if isinstance(response_stream, str):
                 yield "", turn(message, response_stream), None, None, None
                 return
             full_response = ""
             updated_history = turn(message, "")
             try:
                 for chunk in response_stream:
                     if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
                         full_response += chunk.choices[0].delta.content
                         # The last entry is the assistant message of this exchange.
                         updated_history[-1]["content"] = full_response
                         yield "", updated_history, None, None, None
             except Exception as e:
                 print(f"Streaming error: {e}")
                 updated_history[-1]["content"] = f"Error during response: {e}"
                 yield "", updated_history, None, None, None
                 return
             # Only completed exchanges are remembered for later prompts.
             self.conversation_history.append(ChatMessage(role="user", content=message).to_dict())
             self.conversation_history.append(ChatMessage(role="assistant", content=full_response).to_dict())
             # Keep at most the 10 most recent stored messages.
             if len(self.conversation_history) > 10:
                 self.conversation_history = self.conversation_history[-10:]
         custom_css = """
         @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
         body, .gradio-container {
         .gradio-container button {
             font-family: 'Inter', sans-serif !important;
         }
         .image-container {
             display: flex;
             gap: 10px;
             max-height: 200px;
             border-radius: 8px;
         }
         .clear-button {
             display: none;
         }
         .chatbot-container .message {
             opacity: 0;
             animation: fadeIn 0.5s ease-in-out forwards;
                 transform: translateY(0);
             }
         }
         .gradio-accordion {
             overflow: hidden;
+            transition: max-height 0.3s ease-in-out;
+            max-height: 0;
         }
         .gradio-accordion.open {
+            max-height: 500px;
         }
         """
         with gr.Blocks(theme='soft', css=custom_css) as demo:
             with gr.Column():
                 chatbot = gr.Chatbot(
                     label="Xylaria 1.5 Senoa (EXPERIMENTAL)",
                     height=500,
                     show_copy_button=True,
+                    type='messages'
                 )
                 with gr.Accordion("Image Input", open=False) as accordion:
+                    with gr.Row(elem_classes="image-container"):
                         with gr.Column(elem_classes="image-upload"):
                             img = gr.Image(
                                 sources=["upload", "webcam"],
                                 label="Upload Image for Math OCR",
                                 elem_classes="image-preview"
                             )
                 with gr.Row():
                     with gr.Column(scale=4):
                         txt = gr.Textbox(
                         )
                     with gr.Column(scale=1):
                         audio_input = gr.Audio(
+                            sources=["microphone"],
                             type="filepath",
                             label="Voice Input"
                         )
                     btn = gr.Button("Send", scale=1)
                 with gr.Row():
                     clear = gr.Button("Clear Conversation")
                     clear_memory = gr.Button("Clear Memory")
                 btn.click(
                     fn=streaming_response,
                     inputs=[txt, chatbot, img, math_ocr_img, audio_input],
                     outputs=[txt, chatbot, img, math_ocr_img, audio_input]
                 )
                 clear.click(
                     fn=lambda: None,
                     inputs=None,
                     queue=False
                 )
                 clear_memory.click(
                     fn=self.reset_conversation,
                     inputs=None,
                     outputs=[chatbot],
                     queue=False
                 )
                 demo.load(None, None, None, _js="""
                 () => {
                     const accordion = document.querySelector(".gradio-accordion");
                 }
                 """)
                 demo.load(self.reset_conversation, None, None)
         return demo
 def main():
     """Build the Xylaria chat interface and launch it with sharing and debug enabled."""
     launch_options = {"share": True, "debug": True}
     app = XylariaChat().create_interface()
     app.launch(**launch_options)
 if __name__ == "__main__":