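"""AI Voice Assistant.

Records microphone audio in a Gradio app, transcribes it with Whisper,
generates a reply with an OpenAI chat model, and speaks the reply aloud
with Microsoft's SpeechT5.
"""
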
import os

import gradio as gr
import soundfile as sf
import torch
import whisper
from datasets import load_dataset
from openai import OpenAI
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
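# Assumed dependencies (versions are assumptions, not pinned by the original):
#   pip install "openai>=1.0" openai-whisper torch "gradio>=4" soundfile transformers datasets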

# Load the Whisper model for transcription
whisper_model = whisper.load_model("base")

# Load the SpeechT5 text-to-speech models once at startup so they are not
# re-instantiated on every request
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load an x-vector embedding that encodes the speaker's voice characteristics
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# OpenAI client; reads the API key from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Transcribe spoken audio to English text using Whisper
def transcribe_audio(audio_path):
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
    result = whisper.decode(whisper_model, mel, options)
    return result.text
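
# Example (hypothetical file name and output):
#   transcribe_audio("question.wav") -> "What is the capital of France?"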

# Generate a text reply via the OpenAI Chat Completions API
def text_response(prompt):
    response = client.chat.completions.create(
        model="gpt-4",  # swap in a lighter model such as gpt-4o-mini if preferred
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response.choices[0].message.content.strip()

# Convert text to speech with SpeechT5 and save the result as a WAV file
def audio_response(text, output_path="speech.wav"):
    inputs = processor(text=text, return_tensors="pt")

    # Generate the waveform, conditioning on the speaker embedding loaded above
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # SpeechT5 produces 16 kHz audio
    sf.write(output_path, speech.numpy(), samplerate=16000)
    return output_path
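
# The returned file path feeds the gr.Audio output below, which plays the clip.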

# Full pipeline: transcription -> text generation -> text-to-speech
def run_assistant(audio):
    transcription = transcribe_audio(audio)  # Step 1: convert speech to text
    response = text_response(transcription)  # Step 2: generate a text reply
    tts_audio = audio_response(response)     # Step 3: speak the reply
    return transcription, response, tts_audio

# Define Gradio interface outputs
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="GPT-4 Output")
output_3 = gr.Audio(label="Text to Speech")

# Build and launch the Gradio interface
gr.Interface(
    title="AI Voice Assistant",
    fn=run_assistant,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),  # gradio>=4 uses `sources` (a list)
    ],
    outputs=[output_1, output_2, output_3],
).launch(share=True)
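# share=True additionally exposes the app through a temporary public
# *.gradio.live URL; drop it to serve locally only.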