import os

import openai
import whisper
import torch
import gradio as gr
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# Load Whisper model for transcription
model = whisper.load_model("base")

# Set OpenAI API key (read from the environment rather than hard-coding it)
openai.api_key = os.getenv("OPENAI_API_KEY")

# Function to transcribe audio to text using Whisper
def transcribe_audio(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
    result = whisper.decode(model, mel, options)
    return result.text

# Function to generate a text response using GPT-4
def text_response(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Swap in "gpt-4o-mini" here if preferred
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response["choices"][0]["message"]["content"].strip()

# Function to convert text to speech using SpeechT5
def audio_response(text, output_path="speech.wav"):
    # Note: the models are reloaded on every call; hoist these three lines
    # to module level to avoid the reload cost in a long-running app.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    inputs = processor(text=text, return_tensors="pt")

    # Load an x-vector containing the speaker's voice characteristics
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Generate speech
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save the audio to a file (SpeechT5 outputs 16 kHz audio)
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path

# Full pipeline: transcription -> text generation -> text-to-speech
def voice_assistant(audio):
    transcription = transcribe_audio(audio)  # Step 1: Convert audio to text
    response = text_response(transcription)  # Step 2: Generate a text response with GPT-4
    tts_audio = audio_response(response)     # Step 3: Convert the text response to speech
    return transcription, response, tts_audio

# Define Gradio interface outputs
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="GPT-4 Output")
output_3 = gr.Audio(label="Text to Speech")

# Launch Gradio interface
gr.Interface(
    title="AI Voice Assistant",
    fn=voice_assistant,
    inputs=[
        # Gradio 3.x API; on Gradio 4.x use sources=["microphone"] instead
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs=[output_1, output_2, output_3],
).launch(share=True)
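
# ----------------------------------------------------------------------
# Note: text_response() above targets the legacy pre-1.0 openai SDK
# (openai.ChatCompletion was removed in openai>=1.0). Below is a minimal
# sketch of the equivalent call against the 1.x client, left commented
# out so the script above still runs unchanged on the legacy SDK:
#
# from openai import OpenAI
#
# client = OpenAI()  # picks up OPENAI_API_KEY from the environment
# completion = client.chat.completions.create(
#     model="gpt-4",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": prompt},
#     ],
#     max_tokens=150,
# )
# reply = completion.choices[0].message.content.strip()
# ----------------------------------------------------------------------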