Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import torch
|
|
|
4 |
|
5 |
# Import eSpeak TTS pipeline
|
6 |
from tts_cli import (
|
@@ -14,13 +15,15 @@ from tts_cli_op import (
|
|
14 |
generate_long_form_tts as generate_long_form_tts_open,
|
15 |
)
|
16 |
from pretrained_models import Kokoro
|
17 |
-
|
18 |
# ---------------------------------------------------------------------
|
19 |
# Path to models and voicepacks
|
20 |
# ---------------------------------------------------------------------
|
21 |
MODELS_DIR = "pretrained_models/Kokoro"
|
22 |
VOICES_DIR = "pretrained_models/Kokoro/voices"
|
23 |
|
|
|
|
|
24 |
|
25 |
# ---------------------------------------------------------------------
|
26 |
# List the models (.pth) and voices (.pt)
|
@@ -53,6 +56,32 @@ def tts_inference(text, engine, model_file, voice_file, speed=1.0):
|
|
53 |
voice_file: Selected .pt from the voices folder
|
54 |
speed: Speech speed
|
55 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
# 1) Map engine to the correct build_model + generate_long_form_tts
|
57 |
build_fn, gen_fn = ENGINES[engine]
|
58 |
|
@@ -76,7 +105,7 @@ def tts_inference(text, engine, model_file, voice_file, speed=1.0):
|
|
76 |
voicepack.eval()
|
77 |
|
78 |
# 6) Generate TTS
|
79 |
-
audio, phonemes = gen_fn(model,
|
80 |
sr = 22050 # or your actual sample rate
|
81 |
|
82 |
return (sr, audio) # Gradio expects (sample_rate, np_array)
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import torch
|
4 |
+
from huggingface_hub import InferenceClient
|
5 |
|
6 |
# Import eSpeak TTS pipeline
|
7 |
from tts_cli import (
|
|
|
15 |
generate_long_form_tts as generate_long_form_tts_open,
|
16 |
)
|
17 |
from pretrained_models import Kokoro
|
18 |
+
#
|
19 |
# ---------------------------------------------------------------------
|
20 |
# Path to models and voicepacks
|
21 |
# ---------------------------------------------------------------------
|
22 |
MODELS_DIR = "pretrained_models/Kokoro"
|
23 |
VOICES_DIR = "pretrained_models/Kokoro/voices"
|
24 |
|
25 |
+
client = InferenceClient(api_key=HF_TOKEN)
|
26 |
+
|
27 |
|
28 |
# ---------------------------------------------------------------------
|
29 |
# List the models (.pth) and voices (.pt)
|
|
|
56 |
voice_file: Selected .pt from the voices folder
|
57 |
speed: Speech speed
|
58 |
"""
|
59 |
+
|
60 |
+
# 0) Get the response to the user's query from Llama
|
61 |
+
|
62 |
+
messages = [
|
63 |
+
{
|
64 |
+
"role": "user",
|
65 |
+
"content": [
|
66 |
+
{
|
67 |
+
"type": "text",
|
68 |
+
"text": text
|
69 |
+
} #,
|
70 |
+
# {
|
71 |
+
# "type": "image_url",
|
72 |
+
# "image_url": {
|
73 |
+
# "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
|
74 |
+
# }
|
75 |
+
}
|
76 |
+
]
|
77 |
+
}
|
78 |
+
]
|
79 |
+
|
80 |
+
response_from_llama = client.chat.completions.create(
|
81 |
+
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
82 |
+
messages=messages,
|
83 |
+
max_tokens=500)
|
84 |
+
|
85 |
# 1) Map engine to the correct build_model + generate_long_form_tts
|
86 |
build_fn, gen_fn = ENGINES[engine]
|
87 |
|
|
|
105 |
voicepack.eval()
|
106 |
|
107 |
# 6) Generate TTS
|
108 |
+
audio, phonemes = gen_fn(model, response_from_llama, voicepack, speed=speed)
|
109 |
sr = 22050 # or your actual sample rate
|
110 |
|
111 |
return (sr, audio) # Gradio expects (sample_rate, np_array)
|