Adinarayana02 committed
Commit bd7d7a1 · verified · 1 Parent(s): 9ece451

Update app.py

Files changed (1)
  1. app.py +28 -71
app.py CHANGED
@@ -1,104 +1,61 @@
-import torch
-from llama_index.core.prompts import PromptTemplate
-from transformers import AutoTokenizer
-from llama_index.core import Settings
-import os
-import time
-from llama_index.llms.text_generation_inference import TextGenerationInference
+import openai
+import os     # fix: os.getenv() below still needs os imported
+import torch  # fix: torch is still used in audio_response()
 import whisper
 import gradio as gr
-from gtts import gTTS
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 
-# Load Whisper model
+# Load Whisper model for transcription
 model = whisper.load_model("base")
 
-# Load Hugging Face API Token
-HF_API_TOKEN = os.getenv("HF_TOKEN")
+# Set OpenAI API key
+openai.api_key = os.getenv("OPENAI_API_KEY")
 
-# Function to translate audio to text
+# Function to translate audio to text using Whisper
 def translate_audio(audio):
-    # Load and process audio
     audio = whisper.load_audio(audio)
     audio = whisper.pad_or_trim(audio)
-
-    # Convert audio to log-Mel spectrogram
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-    # Decode audio to text
     options = whisper.DecodingOptions(language='en', task="transcribe", temperature=0)
     result = whisper.decode(model, mel, options)
     return result.text
 
-# Function to convert text to speech
+# Function to generate text response using GPT-4
+def text_response(prompt):
+    response = openai.ChatCompletion.create(
+        model="gpt-4",  # Replace with the GPT-4o-mini model if needed
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt},
+        ],
+        max_tokens=150,
+    )
+    return response['choices'][0]['message']['content'].strip()
+
+# Function to convert text to speech using SpeechT5
 def audio_response(text, output_path="speech.wav"):
-    # Load processor, model, and vocoder
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
     model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
-    # Process input text
     inputs = processor(text=text, return_tensors="pt")
-
-    # Load xvector for speaker's voice characteristics
     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-    # Generate speech
     with torch.no_grad():
         speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-
-    # Save audio file
-    sf.write(output_path, speech.numpy(), samplerate=16000)  # Adjust sample rate as necessary
-
+    sf.write(output_path, speech.numpy(), samplerate=16000)
     return output_path
 
-# Function to generate a prompt from conversation history
-def messages_to_prompt(messages):
-    default_system_prompt = "You are an AI chatbot designed to assist with user queries in a friendly and conversational manner."
-    prompt = default_system_prompt + "\n"
-
-    for message in messages:
-        if message.role == 'system':
-            prompt += f"{message.content}</s>\n"
-        elif message.role == 'user':
-            prompt += f"{message.content}</s>\n"
-        elif message.role == 'assistant':
-            prompt += f"{message.content}</s>\n"
-
-    return prompt
-
-# Function to process LLM response into a prompt format
-def completion_to_prompt(completion):
-    return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"
-
-# Configure LLM settings
-Settings.llm = TextGenerationInference(
-    model_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
-    token=HF_API_TOKEN,
-    messages_to_prompt=messages_to_prompt,
-    completion_to_prompt=completion_to_prompt
-)
-
-# Function to generate text response from LLM
-def text_response(t):
-    time.sleep(1)  # Adjust delay as needed
-    response = Settings.llm.complete(t)
-    return response.text
-
-# Function to transcribe audio, generate a text response, and convert it to audio
-def transcribe_(a):
-    t1 = translate_audio(a)  # Transcribe audio to text
-    t2 = text_response(t1)  # Generate text response from LLM
-    t3 = audio_response(t2)  # Convert text response to speech
-    return (t1, t2, t3)
+# Function to handle full process: Transcription -> Text Generation -> Text-to-Speech
+def transcribe_(audio):
+    transcription = translate_audio(audio)  # Step 1: Convert audio to text
+    response = text_response(transcription)  # Step 2: Generate text response from GPT-4
+    tts_audio = audio_response(response)  # Step 3: Convert text response to speech
+    return transcription, response, tts_audio
 
 # Define Gradio interface outputs
 output_1 = gr.Textbox(label="Speech to Text")
-output_2 = gr.Textbox(label="LLM Output")
-output_3 = gr.Audio(label="LLM output to audio")
+output_2 = gr.Textbox(label="GPT-4 Output")
+output_3 = gr.Audio(label="Text to Speech")
 
 # Launch Gradio interface
 gr.Interface(
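
Note on the new text_response: openai.ChatCompletion.create is the legacy interface of the pre-1.0 openai SDK and was removed in openai>=1.0, so the committed code requires pinning openai<1.0. A minimal sketch of the same call against the current SDK, assuming the same model name and the OPENAI_API_KEY environment variable (this variant is not part of the commit):

# Sketch: text_response for the openai>=1.0 SDK
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def text_response(prompt):
    # Same system/user messages and token budget as the committed version
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=150,
    )
    return response.choices[0].message.content.strip()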
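
Also worth noting: audio_response reloads the SpeechT5 processor, model, vocoder, and the xvector dataset on every call, which will dominate per-request latency. A sketch of the usual fix, hoisting the loads to module scope so they run once at startup (same models and speaker embedding as the commit; the tts_* names are illustrative):

import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# Load the TTS components once at import time rather than per request
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

def audio_response(text, output_path="speech.wav"):
    inputs = tts_processor(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=tts_vocoder)
    sf.write(output_path, speech.numpy(), samplerate=16000)  # SpeechT5 generates 16 kHz audio
    return output_path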
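
The diff is truncated at gr.Interface(, so the interface arguments are not part of this view. For orientation only, a hypothetical completion that wires transcribe_ to a microphone input and the three outputs defined above (every argument here is an assumption, not from the commit):

# Hypothetical wiring -- uses the Gradio 4.x API (gr.Audio(sources=...));
# Gradio 3.x used source="microphone" instead.
gr.Interface(
    fn=transcribe_,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[output_1, output_2, output_3],
).launch()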