import os

import openai
import whisper
import torch
import gradio as gr
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# Load Whisper model for transcription
model = whisper.load_model("base")

# Set OpenAI API key (read from the environment rather than hard-coding it)
openai.api_key = os.getenv("OPENAI_API_KEY")

# Function to transcribe audio to text using Whisper
def transcribe_audio(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
    result = whisper.decode(model, mel, options)
    return result.text

# Function to generate a text response using GPT-4
def text_response(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Swap in "gpt-4o-mini" here if preferred
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response["choices"][0]["message"]["content"].strip()

# Function to convert text to speech using SpeechT5
def audio_response(text, output_path="speech.wav"):
    # Note: the models are reloaded on every call; hoist these three lines
    # to module level to avoid the reload cost in a long-running app.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    inputs = processor(text=text, return_tensors="pt")

    # Load an x-vector containing the speaker's voice characteristics
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Generate speech
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save the audio to a file (SpeechT5 outputs 16 kHz audio)
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path

# Full pipeline: transcription -> text generation -> text-to-speech
def voice_assistant(audio):
    transcription = transcribe_audio(audio)  # Step 1: Convert audio to text
    response = text_response(transcription)  # Step 2: Generate a text response with GPT-4
    tts_audio = audio_response(response)     # Step 3: Convert the text response to speech
    return transcription, response, tts_audio

# Define Gradio interface outputs
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="GPT-4 Output")
output_3 = gr.Audio(label="Text to Speech")

# Launch Gradio interface
gr.Interface(
    title="AI Voice Assistant",
    fn=voice_assistant,
    inputs=[
        # Gradio 3.x API; on Gradio 4.x use sources=["microphone"] instead
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs=[output_1, output_2, output_3],
).launch(share=True)
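
# ----------------------------------------------------------------------
# Note: text_response() above targets the legacy pre-1.0 openai SDK
# (openai.ChatCompletion was removed in openai>=1.0). Below is a minimal
# sketch of the equivalent call against the 1.x client, left commented
# out so the script above still runs unchanged on the legacy SDK:
#
# from openai import OpenAI
#
# client = OpenAI()  # picks up OPENAI_API_KEY from the environment
# completion = client.chat.completions.create(
#     model="gpt-4",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": prompt},
#     ],
#     max_tokens=150,
# )
# reply = completion.choices[0].message.content.strip()
# ----------------------------------------------------------------------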