khurrameycon committed on
Commit
8c4b3e2
·
verified ·
1 Parent(s): 1c73802

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -2
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import os
3
  import torch
 
4
 
5
  # Import eSpeak TTS pipeline
6
  from tts_cli import (
@@ -14,13 +15,15 @@ from tts_cli_op import (
14
  generate_long_form_tts as generate_long_form_tts_open,
15
  )
16
  from pretrained_models import Kokoro
17
-
18
  # ---------------------------------------------------------------------
19
  # Path to models and voicepacks
20
  # ---------------------------------------------------------------------
21
  MODELS_DIR = "pretrained_models/Kokoro"
22
  VOICES_DIR = "pretrained_models/Kokoro/voices"
23
 
 
 
24
 
25
  # ---------------------------------------------------------------------
26
  # List the models (.pth) and voices (.pt)
@@ -53,6 +56,32 @@ def tts_inference(text, engine, model_file, voice_file, speed=1.0):
53
  voice_file: Selected .pt from the voices folder
54
  speed: Speech speed
55
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # 1) Map engine to the correct build_model + generate_long_form_tts
57
  build_fn, gen_fn = ENGINES[engine]
58
 
@@ -76,7 +105,7 @@ def tts_inference(text, engine, model_file, voice_file, speed=1.0):
76
  voicepack.eval()
77
 
78
  # 6) Generate TTS
79
- audio, phonemes = gen_fn(model, text, voicepack, speed=speed)
80
  sr = 22050 # or your actual sample rate
81
 
82
  return (sr, audio) # Gradio expects (sample_rate, np_array)
 
1
  import gradio as gr
2
  import os
3
  import torch
4
+ from huggingface_hub import InferenceClient
5
 
6
  # Import eSpeak TTS pipeline
7
  from tts_cli import (
 
15
  generate_long_form_tts as generate_long_form_tts_open,
16
  )
17
  from pretrained_models import Kokoro
18
+ #
19
  # ---------------------------------------------------------------------
20
  # Path to models and voicepacks
21
  # ---------------------------------------------------------------------
22
  MODELS_DIR = "pretrained_models/Kokoro"
23
  VOICES_DIR = "pretrained_models/Kokoro/voices"
24
 
25
+ client = InferenceClient(api_key=HF_TOKEN)
26
+
27
 
28
  # ---------------------------------------------------------------------
29
  # List the models (.pth) and voices (.pt)
 
56
  voice_file: Selected .pt from the voices folder
57
  speed: Speech speed
58
  """
59
+
60
+ # 0) Get the response of user query from LLAMA
61
+
62
+ messages = [
63
+ {
64
+ "role": "user",
65
+ "content": [
66
+ {
67
+ "type": "text",
68
+ "text": text
69
+ } #,
70
+ # {
71
+ # "type": "image_url",
72
+ # "image_url": {
73
+ # "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
74
+ # }
75
+ }
76
+ ]
77
+ }
78
+ ]
79
+
80
+ response_from_llama = client.chat.completions.create(
81
+ model="meta-llama/Llama-3.2-11B-Vision-Instruct",
82
+ messages=messages,
83
+ max_tokens=500)
84
+
85
  # 1) Map engine to the correct build_model + generate_long_form_tts
86
  build_fn, gen_fn = ENGINES[engine]
87
 
 
105
  voicepack.eval()
106
 
107
  # 6) Generate TTS
108
+ audio, phonemes = gen_fn(model, response_from_llama, voicepack, speed=speed)
109
  sr = 22050 # or your actual sample rate
110
 
111
  return (sr, audio) # Gradio expects (sample_rate, np_array)