"""Gradio demo: transcribe Spanish speech and translate it to Nahuatl."""

import gradio as gr
import librosa
from transformers import (
    AutoFeatureExtractor,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

ASR_MODEL_ID = "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
TRANSLATION_MODEL_ID = "hackathon-pln-es/t5-small-spanish-nahuatl"


def load_and_fix_data(input_file, model_sampling_rate):
    """Load an audio file as a mono waveform at the model's sampling rate.

    Args:
        input_file: Path to the audio file to read.
        model_sampling_rate: Sampling rate (Hz) the ASR model expects.

    Returns:
        The 1-D waveform returned by ``librosa.load``, resampled to
        ``model_sampling_rate``.
    """
    # Let librosa downmix and resample in a single pass.  The previous code
    # loaded at librosa's default 22050 Hz and then called
    # ``librosa.resample`` with positional rates, which raises TypeError on
    # librosa >= 0.10 (the rates are keyword-only there) and resampled the
    # signal twice.  Its stereo branch was unreachable because ``mono=True``
    # is librosa's default.
    speech, _ = librosa.load(input_file, sr=model_sampling_rate, mono=True)
    return speech


# The feature extractor is loaded only to learn the sampling rate the ASR
# model expects.
feature_extractor = AutoFeatureExtractor.from_pretrained(ASR_MODEL_ID)
sampling_rate = feature_extractor.sampling_rate

asr = pipeline("automatic-speech-recognition", model=ASR_MODEL_ID)

model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL_ID)

# Kept as a name because f-string expressions cannot contain a backslash
# before Python 3.12.
new_line = '\n'


def predict_and_ctc_lm_decode(input_file):
    """Transcribe Spanish audio and translate the transcription to Nahuatl.

    Args:
        input_file: Path to the recorded or uploaded audio file, or ``None``
            when no audio was provided.

    Returns:
        A string containing the Spanish transcription and its Nahuatl
        translation, or a short notice when no audio was provided.
    """
    # Guard against a missing recording, which would otherwise fail inside
    # ``librosa.load``.
    if not input_file:
        return "No audio input received. Please record or upload an audio clip."

    speech = load_and_fix_data(input_file, sampling_rate)

    # Chunked inference keeps long recordings within the ASR model's limits.
    transcribed_text = asr(speech, chunk_length_s=10, stride_length_s=1)
    transcribed_text = transcribed_text["text"]

    # The translation model is prompted with a task prefix.
    input_ids = tokenizer(
        'translate Spanish to Nahuatl: ' + transcribed_text,
        return_tensors='pt',
    ).input_ids
    outputs = model.generate(input_ids, max_length=512)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    return f"Spanish Audio Transcription: {transcribed_text} {new_line} Nahuatl Translation :{outputs}"


description = """
This is a Gradio demo of Spanish Audio Transcriptions to Nahuatl Translation. To use this, simply provide an audio input (audio recording or via microphone), which will subsequently be transcribed and translated to the Nahuatl language.

Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-xls-r-1b-spanish](https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-spanish)

Pre-trained model used for translating Spanish audio transcription to the Nahuatl language: [hackathon-pln-es/t5-small-spanish-nahuatl](https://huggingface.co/hackathon-pln-es/t5-small-spanish-nahuatl)
"""

demo = gr.Interface(
    predict_and_ctc_lm_decode,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio"),
    ],
    outputs=[gr.outputs.Textbox()],
    examples=[["audio1.wav"], ["travel.wav"], ["sample_audio.wav"]],
    title="Spanish-Audio-Transcriptions-to-Nahuatl-Translation",
    description=description,
    layout="horizontal",
    theme="huggingface",
)

demo.launch(enable_queue=True, cache_examples=True)