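"""AI Voice Assistant.

Pipeline: Whisper speech-to-text -> GPT-4 chat response -> SpeechT5 text-to-speech,
served through a Gradio microphone interface.
"""
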
import os

import openai
import whisper
import torch
import gradio as gr
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# Load the Whisper model for transcription
model = whisper.load_model("base")

# Read the OpenAI API key from the OPENAI_API_KEY environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
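# Fail fast if the key is missing rather than erroring on the first API call
if openai.api_key is None:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")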

# Function to transcribe English audio to text using Whisper
def translate_audio(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(language='en', task="transcribe", temperature=0)
    result = whisper.decode(model, mel, options)
    return result.text

# Function to generate a text response using the OpenAI chat API
def text_response(prompt):
    # Note: openai.ChatCompletion is the legacy (openai<1.0) interface;
    # openai>=1.0 uses client.chat.completions.create instead
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Swap in another chat model (e.g. gpt-4o-mini) if needed
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response['choices'][0]['message']['content'].strip()

# Function to convert text to speech using SpeechT5
def audio_response(text, output_path="speech.wav"):
    # Loading the models on every call is slow; for repeated use they
    # could be loaded once at module level and reused
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    inputs = processor(text=text, return_tensors="pt")
    # Load an x-vector containing the speaker's voice characteristics
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    # Generate speech
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # Save the audio as a 16 kHz WAV file
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path

# Full pipeline: transcription -> text generation -> text-to-speech
def transcribe_(audio):
    transcription = translate_audio(audio)   # Step 1: Convert audio to text
    response = text_response(transcription)  # Step 2: Generate a text response with GPT-4
    tts_audio = audio_response(response)     # Step 3: Convert the text response to speech
    return transcription, response, tts_audio

# Define Gradio interface outputs
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="GPT-4 Output")
output_3 = gr.Audio(label="Text to Speech")
# Launch Gradio interface
gr.Interface(
    title='AI Voice Assistant',
    fn=transcribe_,
    inputs=[
        # Note: source= is the Gradio 3.x argument name; Gradio 4.x
        # renamed it to sources=["microphone"]
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs=[output_1, output_2, output_3],
).launch(share=True)
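
# Run with `python app.py`; share=True serves a temporary public Gradio link
# in addition to the local one.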