Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import openai
|
2 |
import whisper
|
|
|
3 |
import gradio as gr
|
4 |
import soundfile as sf
|
5 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
@@ -9,7 +10,7 @@ from datasets import load_dataset
|
|
9 |
model = whisper.load_model("base")
|
10 |
|
11 |
# Set OpenAI API key
|
12 |
-
openai.api_key = os.getenv("OPENAI_API_KEY")
|
13 |
|
14 |
# Function to translate audio to text using Whisper
|
15 |
def translate_audio(audio):
|
@@ -38,10 +39,16 @@ def audio_response(text, output_path="speech.wav"):
|
|
38 |
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
39 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
40 |
inputs = processor(text=text, return_tensors="pt")
|
|
|
|
|
41 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
42 |
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
|
|
|
|
43 |
with torch.no_grad():
|
44 |
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
|
|
|
|
|
45 |
sf.write(output_path, speech.numpy(), samplerate=16000)
|
46 |
return output_path
|
47 |
|
|
|
1 |
import openai
|
2 |
import whisper
|
3 |
+
import torch
|
4 |
import gradio as gr
|
5 |
import soundfile as sf
|
6 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
|
|
10 |
model = whisper.load_model("base")
|
11 |
|
12 |
# Set OpenAI API key
|
13 |
+
openai.api_key = os.getenv("OPENAI_API_KEY")  # Read the key from the OPENAI_API_KEY environment variable (set it as a Space secret; do not hard-code it here)
|
14 |
|
15 |
# Function to translate audio to text using Whisper
|
16 |
def translate_audio(audio):
|
|
|
39 |
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
40 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
41 |
inputs = processor(text=text, return_tensors="pt")
|
42 |
+
|
43 |
+
# Load xvector containing speaker's voice characteristics
|
44 |
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
45 |
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
46 |
+
|
47 |
+
# Generate speech
|
48 |
with torch.no_grad():
|
49 |
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
|
50 |
+
|
51 |
+
# Save the audio to a file
|
52 |
sf.write(output_path, speech.numpy(), samplerate=16000)
|
53 |
return output_path
|
54 |
|