import os
import openai
import whisper
import torch
import gradio as gr
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
# Load Whisper model for transcription
model = whisper.load_model("base")

# Set the OpenAI API key from the environment
openai.api_key = os.getenv("OPENAI_API_KEY")
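
# Note: the key is read from the OPENAI_API_KEY environment variable; set it in
# the Space secrets or in your shell before launching, e.g. (shell syntax):
#     export OPENAI_API_KEY="sk-..."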
# Function to transcribe audio to text using Whisper
def translate_audio(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
    result = whisper.decode(model, mel, options)
    return result.text
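
# Note: a minimal alternative using the high-level API of the same
# openai-whisper package; model.transcribe pads, windows, and decodes
# internally and returns a dict containing the text:
#     def translate_audio(audio):
#         return model.transcribe(audio, language="en")["text"]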
# Function to generate a text response using GPT-4
def text_response(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Replace with "gpt-4o-mini" if needed
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response["choices"][0]["message"]["content"].strip()
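
# Note: openai.ChatCompletion is the legacy openai<1.0 interface. A minimal
# sketch of the equivalent request with the openai>=1.0 client:
#     from openai import OpenAI
#     client = OpenAI()  # reads OPENAI_API_KEY from the environment
#     response = client.chat.completions.create(
#         model="gpt-4", messages=..., max_tokens=150
#     )
#     return response.choices[0].message.content.strip()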
# Function to convert text to speech using SpeechT5
def audio_response(text, output_path="speech.wav"):
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    inputs = processor(text=text, return_tensors="pt")

    # Load an x-vector containing the speaker's voice characteristics
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Generate speech
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save the audio as a 16 kHz WAV file
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path
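
# Note: audio_response reloads the TTS models and the x-vector dataset on every
# call, which is slow. A sketch of the obvious fix (assumed variable names):
# load them once at module level and reuse them inside the function:
#     tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
#     tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
#     tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")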
# Function to handle the full process: transcription -> text generation -> text-to-speech
def transcribe_(audio):
    transcription = translate_audio(audio)   # Step 1: Convert audio to text
    response = text_response(transcription)  # Step 2: Generate text response from GPT-4
    tts_audio = audio_response(response)     # Step 3: Convert text response to speech
    return transcription, response, tts_audio
# Define Gradio interface outputs | |
output_1 = gr.Textbox(label="Speech to Text") | |
output_2 = gr.Textbox(label="GPT-4 Output") | |
output_3 = gr.Audio(label="Text to Speech") | |
# Launch Gradio interface | |
gr.Interface( | |
title='AI Voice Assistant', | |
fn=transcribe_, | |
inputs=[ | |
gr.Audio(source="microphone", type="filepath"), | |
], | |
outputs=[ | |
output_1, output_2, output_3 | |
] | |
).launch(share=True) | |
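
# Note: gr.Audio(source="microphone") is the Gradio 3.x signature; on Gradio
# 4.x and later the argument is a list (version assumption, check your install):
#     gr.Audio(sources=["microphone"], type="filepath")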