import os
import openai
import whisper
import torch
import gradio as gr
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
# Load Whisper model for transcription
model = whisper.load_model("base")
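# The "base" checkpoint balances speed and accuracy; the other Whisper sizes
# ("tiny", "small", "medium", "large") are drop-in replacements if you want
# to trade one for the other.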
# Read the OpenAI API key from the environment (set OPENAI_API_KEY before launching)
openai.api_key = os.getenv("OPENAI_API_KEY")
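# A minimal sketch of supplying the key before launch, assuming a Unix-like
# shell (the key value shown is a placeholder, not a real key):
#   export OPENAI_API_KEY="sk-..."
#   python app.py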
# Function to transcribe audio to English text using Whisper
def translate_audio(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
    result = whisper.decode(model, mel, options)
    return result.text
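# Example usage, as a sketch: "sample.wav" is a hypothetical local recording,
# and whisper.load_audio requires ffmpeg to be installed on the system.
#   text = translate_audio("sample.wav")
#   print(text)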
# Function to generate a text response using the OpenAI chat API (legacy openai<1.0 interface)
def text_response(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Swap in another chat model (e.g. gpt-4o-mini) if needed
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response["choices"][0]["message"]["content"].strip()
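# Example usage, as a sketch (requires a valid OPENAI_API_KEY and network access):
#   reply = text_response("Summarize what a vocoder does in one sentence.")
#   print(reply)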
# Function to convert text to speech using SpeechT5.
# Note: the checkpoints and speaker embeddings are reloaded on every call;
# hoist them to module level if synthesis latency matters.
def audio_response(text, output_path="speech.wav"):
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")  # named to avoid shadowing the Whisper model
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    inputs = processor(text=text, return_tensors="pt")
    # Load an x-vector containing the speaker's voice characteristics
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    # Generate speech
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # Save the audio as a 16 kHz WAV file (SpeechT5's native sample rate)
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path
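# Example usage, as a sketch: the first call downloads the SpeechT5 checkpoints
# and the x-vector dataset from the Hugging Face Hub.
#   wav_path = audio_response("Hello from SpeechT5!")
#   data, sr = sf.read(wav_path)  # sr is 16000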
# Full pipeline: transcription -> text generation -> text-to-speech
def transcribe_(audio):
    transcription = translate_audio(audio)   # Step 1: Convert audio to text
    response = text_response(transcription)  # Step 2: Generate a text response with GPT-4
    tts_audio = audio_response(response)     # Step 3: Convert the text response to speech
    return transcription, response, tts_audio
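# Quick end-to-end smoke test, as a sketch ("sample.wav" is a hypothetical file):
#   transcription, reply, wav_path = transcribe_("sample.wav")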
# Define Gradio interface outputs
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="GPT-4 Output")
output_3 = gr.Audio(label="Text to Speech")
# Launch the Gradio interface
# Note: source="microphone" is Gradio 3.x syntax; Gradio 4+ renamed it to sources=["microphone"].
gr.Interface(
    title="AI Voice Assistant",
    fn=transcribe_,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs=[output_1, output_2, output_3],
).launch(share=True)