llama-kokoro

Running

App Files Community

khurrameycon committed on Jan 17

Commit

8c4b3e2

verified ·

1 Parent(s): 1c73802

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -2

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import os
 import torch
 # Import eSpeak TTS pipeline
 from tts_cli import (
@@ -14,13 +15,15 @@ from tts_cli_op import (
     generate_long_form_tts as generate_long_form_tts_open,
 )
 from pretrained_models import Kokoro
 # ---------------------------------------------------------------------
 # Path to models and voicepacks
 # ---------------------------------------------------------------------
 MODELS_DIR = "pretrained_models/Kokoro"
 VOICES_DIR = "pretrained_models/Kokoro/voices"
 # ---------------------------------------------------------------------
 # List the models (.pth) and voices (.pt)
@@ -53,6 +56,32 @@ def tts_inference(text, engine, model_file, voice_file, speed=1.0):
     voice_file:  Selected .pt from the voices folder
     speed:       Speech speed
     """
     # 1) Map engine to the correct build_model + generate_long_form_tts
     build_fn, gen_fn = ENGINES[engine]
@@ -76,7 +105,7 @@ def tts_inference(text, engine, model_file, voice_file, speed=1.0):
         voicepack.eval()
     # 6) Generate TTS
-    audio, phonemes = gen_fn(model, text, voicepack, speed=speed)
     sr = 22050  # or your actual sample rate
     return (sr, audio)  # Gradio expects (sample_rate, np_array)

 import gradio as gr
 import os
 import torch
+from huggingface_hub import InferenceClient
 # Import eSpeak TTS pipeline
 from tts_cli import (
     generate_long_form_tts as generate_long_form_tts_open,
 )
 from pretrained_models import Kokoro
+#
 # ---------------------------------------------------------------------
 # Path to models and voicepacks
 # ---------------------------------------------------------------------
 MODELS_DIR = "pretrained_models/Kokoro"
 VOICES_DIR = "pretrained_models/Kokoro/voices"
+client = InferenceClient(api_key=HF_TOKEN)
 # ---------------------------------------------------------------------
 # List the models (.pth) and voices (.pt)
     voice_file:  Selected .pt from the voices folder
     speed:       Speech speed
     """
+    # 0) Get the response to the user's query from Llama
+    messages = [
+	{
+		"role": "user",
+		"content": [
+			{
+				"type": "text",
+				"text": text
+			} #,
+			# {
+			# 	"type": "image_url",
+			# 	"image_url": {
+			# 		"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+			# 	}
+			}
+            ]
+	}
+    ]
+response_from_llama = client.chat.completions.create(
+    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+	messages=messages,
+	max_tokens=500)
     # 1) Map engine to the correct build_model + generate_long_form_tts
     build_fn, gen_fn = ENGINES[engine]
         voicepack.eval()
     # 6) Generate TTS
+    audio, phonemes = gen_fn(model, response_from_llama, voicepack, speed=speed)
     sr = 22050  # or your actual sample rate
     return (sr, audio)  # Gradio expects (sample_rate, np_array)