Athspi committed · Commit 60d2e23 (verified) · 1 Parent(s): dbffdf4

Update app.py

Files changed (1):
  app.py (+233 −83)
app.py CHANGED
@@ -3,7 +3,16 @@ import whisper
 import torch
 import os
 from pydub import AudioSegment, silence
-from faster_whisper import WhisperModel  # Import faster-whisper
+from faster_whisper import WhisperModel
+import numpy as np
+from scipy.io import wavfile
+from scipy.signal import correlate
+import tempfile
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -122,32 +131,48 @@ LANGUAGE_NAME_TO_CODE = {
 # Reverse mapping of language codes to full language names
 CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}
 
+def convert_to_wav(audio_file):
+    """Convert any audio file to WAV format."""
+    audio = AudioSegment.from_file(audio_file)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+        wav_path = temp_wav.name
+    audio.export(wav_path, format="wav")
+    return wav_path
+
+def resample_audio(audio_segment, target_sample_rate):
+    """Resample an audio segment to the target sample rate."""
+    return audio_segment.set_frame_rate(target_sample_rate)
+
 def detect_language(audio_file):
     """Detect the language of the audio file."""
-    # Define device and compute type for faster-whisper
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    compute_type = "float32" if device == "cuda" else "int8"
-
-    # Load the faster-whisper model for language detection
-    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
-
-    # Convert audio to 16kHz mono for better compatibility
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-
-    # Detect the language using faster-whisper
-    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
-    detected_language_code = info.language
-
-    # Get the full language name from the code
-    detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
-
-    return f"Detected Language: {detected_language}"
+    if audio_file is None:
+        return "Error: No audio file uploaded."
+
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Define device and compute type for faster-whisper
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        compute_type = "float32" if device == "cuda" else "int8"
+
+        # Load the faster-whisper model for language detection
+        model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
+
+        # Detect the language using faster-whisper
+        segments, info = model.transcribe(wav_path, task="translate", language=None)
+        detected_language_code = info.language
+
+        # Get the full language name from the code
+        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+
+        # Clean up temporary WAV file
+        os.remove(wav_path)
+
+        return f"Detected Language: {detected_language}"
+    except Exception as e:
+        logger.error(f"Error in detect_language: {str(e)}")
+        return f"Error: {str(e)}"
 
 def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     """
@@ -161,81 +186,188 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     Returns:
         str: Path to the output audio file with silence removed.
     """
-    # Load the audio file
-    audio = AudioSegment.from_file(audio_file)
-
-    # Detect silent chunks
-    silent_chunks = silence.detect_silence(
-        audio,
-        min_silence_len=min_silence_len,
-        silence_thresh=silence_threshold
-    )
-
-    # Remove silent chunks
-    non_silent_audio = AudioSegment.empty()
-    start = 0
-    for chunk in silent_chunks:
-        non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
-        start = chunk[1]  # Move to the end of the silent chunk
-    non_silent_audio += audio[start:]  # Add the remaining part
-
-    # Export the processed audio
-    output_path = "silence_removed_audio.wav"
-    non_silent_audio.export(output_path, format="wav")
-
-    return output_path
+    if audio_file is None:
+        return None
+
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Load the audio file
+        audio = AudioSegment.from_file(wav_path)
+
+        # Detect silent chunks
+        silent_chunks = silence.detect_silence(
+            audio,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_threshold
+        )
+
+        # Remove silent chunks
+        non_silent_audio = AudioSegment.empty()
+        start = 0
+        for chunk in silent_chunks:
+            non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
+            start = chunk[1]  # Move to the end of the silent chunk
+        non_silent_audio += audio[start:]  # Add the remaining part
+
+        # Export the processed audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
+            output_path = temp_output.name
+        non_silent_audio.export(output_path, format="wav")
+
+        # Clean up temporary WAV file
+        os.remove(wav_path)
+
+        return output_path
+    except Exception as e:
+        logger.error(f"Error in remove_silence: {str(e)}")
+        return f"Error: {str(e)}"
+
+def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
+    """
+    Detect the target audio in the main audio and trim the main audio to include only the detected segments.
+
+    Args:
+        main_audio (str): Path to the main audio file.
+        target_audio (str): Path to the target audio file.
+        threshold (float): Detection threshold (0 to 1). Higher values mean stricter detection.
+
+    Returns:
+        str: Path to the trimmed audio file.
+        str: Detected timestamps in the format "start-end (in seconds)".
+    """
+    if main_audio is None or target_audio is None:
+        return None, "Error: Please upload both main and target audio files."
+
+    try:
+        # Convert audio files to WAV format
+        main_wav_path = convert_to_wav(main_audio)
+        target_wav_path = convert_to_wav(target_audio)
+
+        # Load audio files
+        main_rate, main_data = wavfile.read(main_wav_path)
+        target_rate, target_data = wavfile.read(target_wav_path)
+
+        # Ensure both audio files have the same sample rate
+        if main_rate != target_rate:
+            logger.warning(f"Sample rates differ: main_audio={main_rate}, target_audio={target_rate}. Resampling target audio.")
+            target_segment = AudioSegment.from_file(target_wav_path)
+            target_segment = resample_audio(target_segment, main_rate)
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_resampled:
+                resampled_path = temp_resampled.name
+            target_segment.export(resampled_path, format="wav")
+            target_rate, target_data = wavfile.read(resampled_path)
+
+        # Normalize audio data
+        main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
+        target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
+
+        # Perform cross-correlation to detect the target audio in the main audio
+        correlation = correlate(main_data, target_data, mode='valid')
+        correlation = np.abs(correlation)
+        max_corr = np.max(correlation)
+
+        # Find the peak in the cross-correlation result
+        peak_index = np.argmax(correlation)
+        peak_value = correlation[peak_index]
+
+        # Check if the peak value exceeds the threshold
+        if peak_value < threshold * max_corr:
+            return None, "Error: Target audio not detected in the main audio."
+
+        # Calculate the start and end times of the target audio in the main audio
+        start_time = peak_index / main_rate
+        end_time = (peak_index + len(target_data)) / main_rate
+
+        # Trim the main audio to include only the detected segment
+        main_audio_segment = AudioSegment.from_file(main_wav_path)
+        start_ms = int(start_time * 1000)
+        end_ms = int(end_time * 1000)
+        trimmed_audio = main_audio_segment[start_ms:end_ms]
+
+        # Export the trimmed audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
+            output_path = temp_output.name
+        trimmed_audio.export(output_path, format="wav")
+
+        # Format timestamps
+        timestamps_str = f"{start_time:.2f}-{end_time:.2f}"
+
+        # Clean up temporary WAV files
+        os.remove(main_wav_path)
+        os.remove(target_wav_path)
+        if 'resampled_path' in locals():
+            os.remove(resampled_path)
+
+        return output_path, timestamps_str
+    except Exception as e:
+        logger.error(f"Error in detect_and_trim_audio: {str(e)}")
+        return None, f"Error: {str(e)}"
 
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
-    # Convert audio to 16kHz mono for better compatibility
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-
-    # Load the appropriate model
-    if model_size == "Faster Whisper Large v3":
-        # Define device and compute type for faster-whisper
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        compute_type = "float32" if device == "cuda" else "int8"
-
-        # Use faster-whisper for the Systran model
-        model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
-        segments, info = model.transcribe(
-            processed_audio_path,
-            task="transcribe",
-            word_timestamps=True,
-            repetition_penalty=1.1,
-            temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
-        )
-        transcription = " ".join([segment.text for segment in segments])
-        detected_language_code = info.language
-        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-    else:
-        # Use the standard Whisper model
-        model = whisper.load_model(MODELS[model_size])
-
-        # Transcribe the audio
-        if language == "Auto Detect":
-            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
-            detected_language_code = result.get("language", "unknown")
-            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-        else:
-            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
-            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-            detected_language = language
-
-        transcription = result["text"]
-
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
-
-    # Return transcription and detected language
-    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
+    if audio_file is None:
+        return "Error: No audio file uploaded."
+
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Convert audio to 16kHz mono for better compatibility
+        audio = AudioSegment.from_file(wav_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_processed:
+            processed_audio_path = temp_processed.name
+        audio.export(processed_audio_path, format="wav")
+
+        # Load the appropriate model
+        if model_size == "Faster Whisper Large v3":
+            # Define device and compute type for faster-whisper
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            compute_type = "float32" if device == "cuda" else "int8"
+
+            # Use faster-whisper for the Systran model
+            model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
+            segments, info = model.transcribe(
+                processed_audio_path,
+                task="transcribe",
+                word_timestamps=True,
+                repetition_penalty=1.1,
+                temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
+            )
+            transcription = " ".join([segment.text for segment in segments])
+            detected_language_code = info.language
+            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+        else:
+            # Use the standard Whisper model
+            model = whisper.load_model(MODELS[model_size])
+
+            # Transcribe the audio
+            if language == "Auto Detect":
+                result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
+                detected_language_code = result.get("language", "unknown")
+                detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+            else:
+                language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+                result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+                detected_language = language
+
+            transcription = result["text"]
+
+        # Clean up processed audio file
+        os.remove(processed_audio_path)
+        os.remove(wav_path)
+
+        # Return transcription and detected language
+        return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
+    except Exception as e:
+        logger.error(f"Error in transcribe_audio: {str(e)}")
+        return f"Error: {str(e)}"
 
 # Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Transcription and Language Detector")  # Updated title
+    gr.Markdown("# Audio Processing Tool")
 
     with gr.Tab("Detect Language"):
         gr.Markdown("Upload an audio file to detect its language.")
@@ -276,6 +408,19 @@ with gr.Blocks() as demo:
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")
 
+    with gr.Tab("Detect and Trim Audio"):
+        gr.Markdown("Upload a main audio file and a target audio file. The app will detect the target audio in the main audio and trim it.")
+        main_audio_input = gr.Audio(type="filepath", label="Upload Main Audio File")
+        target_audio_input = gr.Audio(type="filepath", label="Upload Target Audio File")
+        threshold_slider = gr.Slider(
+            minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+            label="Detection Threshold",
+            info="Higher values mean stricter detection."
+        )
+        trimmed_audio_output = gr.Audio(label="Trimmed Audio", type="filepath")
+        timestamps_output = gr.Textbox(label="Detected Timestamps (in seconds)")
+        detect_trim_button = gr.Button("Detect and Trim")
+
     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
     transcribe_button.click(
@@ -288,6 +433,11 @@ with gr.Blocks() as demo:
         inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
         outputs=silence_output
     )
+    detect_trim_button.click(
+        detect_and_trim_audio,
+        inputs=[main_audio_input, target_audio_input, threshold_slider],
+        outputs=[trimmed_audio_output, timestamps_output]
+    )
 
 # Launch the Gradio interface
 demo.launch()
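
Review note on the detection check in detect_and_trim_audio: peak_value is read at np.argmax(correlation), so it always equals max_corr, and the guard "if peak_value < threshold * max_corr" can only fire when the correlation is identically zero; the raw correlation is also unbounded, so the 0.1-1.0 slider never maps onto a meaningful scale. A normalized cross-correlation keeps the score in [0, 1]. The sketch below is illustrative rather than part of the commit: find_target and its parameters are hypothetical names, and it assumes the mono float arrays the commit already produces in its normalization step.

import numpy as np
from scipy.signal import correlate

def find_target(main, target, rate, threshold=0.5):
    """Hypothetical helper: locate target inside main using normalized
    cross-correlation; returns (start_s, end_s, score) or None."""
    # Zero-mean both signals so a DC offset cannot inflate the correlation.
    main = main.astype(np.float64) - main.mean()
    target = target.astype(np.float64) - target.mean()

    # Raw sliding dot product: one value per candidate alignment.
    corr = correlate(main, target, mode="valid")

    # Energy of every len(target)-sample window of the main signal,
    # computed in O(n) from a cumulative sum of squares.
    sq = np.concatenate(([0.0], np.cumsum(main ** 2)))
    window_energy = sq[len(target):] - sq[:-len(target)]
    target_energy = np.sum(target ** 2)

    # By Cauchy-Schwarz, |corr| <= sqrt(window_energy * target_energy),
    # so this score lies in [0, 1]; the epsilon guards silent windows.
    score = np.abs(corr) / (np.sqrt(window_energy * target_energy) + 1e-12)

    peak = int(np.argmax(score))
    if score[peak] < threshold:
        return None  # no alignment reaches the requested confidence
    start_s = peak / rate
    return start_s, start_s + len(target) / rate, float(score[peak])

Scored this way, 1.0 means an exact match up to amplitude scaling, so the slider's "higher values mean stricter detection" description holds as written.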
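
Dependency note: this revision's imports now cover numpy, scipy, and faster_whisper in addition to gradio, torch, openai-whisper, and pydub. A requirements.txt for the Space would presumably need at least the following (standard PyPI package names; the list is an assumption, since the commit only touches app.py):

gradio
torch
openai-whisper
faster-whisper
pydub
numpy
scipy

pydub's AudioSegment.from_file also shells out to ffmpeg for anything that is not plain WAV, so the ffmpeg binary must be present on the system for convert_to_wav to accept arbitrary uploads.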