Spaces:
Sleeping
Sleeping
""" | |
File: app.py | |
Description: Translate text... | |
Author: Didier Guillevic | |
Date: 2024-09-07 | |
""" | |
import spaces | |
import logging | |
logger = logging.getLogger(__name__) | |
logging.basicConfig(level=logging.INFO) | |
import gradio as gr | |
import langdetect | |
from deep_translator import GoogleTranslator | |
from model_spacy import nlp_xx | |
import model_translation | |
m2m100 = model_translation.ModelM2M100() | |
def translate_with_Helsinki(
        chunks, src_lang, tgt_lang, input_max_length, output_max_length) -> str:
    """Translate the chunks with the Helsinki model.

    Args:
        chunks: Sequence of text chunks (strings). Each chunk is tokenized,
            translated and decoded on its own.
        src_lang: Source language code, used to pick the tokenizer/model.
        tgt_lang: Accepted for interface compatibility; not used by this
            function.
        input_max_length: ``max_length`` given to the tokenizer (with
            ``truncation=True``).
        output_max_length: ``max_length`` given to ``model.generate``.

    Returns:
        The translated chunks joined with newlines; an empty string when
        there are no chunks; or an "ISSUE: ..." message when ``src_lang``
        is not among the available source languages.
    """
    # Nothing to translate. Without this guard, the log statement below
    # would raise IndexError on chunks[0].
    if not chunks:
        return ""

    # NOTE(review): `translation` is not imported in the visible part of this
    # file (only `model_translation` is) -- confirm which module is meant.
    if src_lang not in translation.src_langs:
        return (
            f"ISSUE: currently no model for language '{src_lang}'. "
            "If wrong language, please specify language."
        )

    logger.info(f"LANG: {src_lang}, TEXT: {chunks[0][:50]}...")
    tokenizer, model = translation.get_tokenizer_model_for_src_lang(src_lang)

    translated_chunks = []
    for chunk in chunks:
        # NOTE: The 'fa' (Persian) model has multiple target languages to
        # choose from. We need to specify the desired language among:
        # fra ita por ron spa
        # https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-fa-itc
        # Prepend text with >>fra<< in order to translate into French.
        if src_lang == 'fa':
            chunk = ">>fra<< " + chunk

        inputs = tokenizer(
            chunk, return_tensors="pt", max_length=input_max_length,
            truncation=True, padding="longest").to(model.device)
        outputs = model.generate(**inputs, max_length=output_max_length)
        # A single chunk was encoded, so only the first decoded item is used.
        translated_chunk = tokenizer.batch_decode(
            outputs, skip_special_tokens=True)[0]
        translated_chunks.append(translated_chunk)

    return '\n'.join(translated_chunks)
def translate_text(
    text: str,
    src_lang: str,
    tgt_lang: str
) -> tuple:
    """Translate the given text with both M2M100 and Google Translate.

    Args:
        text: Text to translate.
        src_lang: Source language code. When empty/None or "auto", the
            language is detected from the text with ``langdetect``.
        tgt_lang: Target language code. Falls back to 'en' when it is not
            one of ``model_translation.tgt_language_codes`` values.

    Returns:
        A ``(m2m100_translation, google_translation)`` tuple of strings.
        Blank input yields ``("", "")``.
    """
    # Blank input: nothing to translate, and language detection would fail
    # on text without any usable characters.
    if not text or not text.strip():
        return ("", "")

    # src_lang: detect it when not explicitly provided.
    if not src_lang or src_lang == "auto":
        src_lang = langdetect.detect(text)
    if src_lang not in model_translation.language_codes.values():
        # Best effort: report the problem and still attempt the translation.
        logger.error(f"Language detected {src_lang} not among supported language")

    # tgt_lang: make sure it is a supported code. Default to 'en' otherwise.
    if tgt_lang not in model_translation.tgt_language_codes.values():
        tgt_lang = 'en'

    # Translate with both engines, using the same target language for each.
    translated_text_m2m100 = m2m100.translate(text, src_lang, tgt_lang)
    translated_text_google_translate = GoogleTranslator(
        source='auto', target=tgt_lang).translate(text=text)

    return (
        translated_text_m2m100,
        translated_text_google_translate
    )
# | |
# User interface | |
# | |
with gr.Blocks() as demo:
    # Page title
    gr.Markdown("\n## Text translation v0.0.3\n")

    # Text entered by the user
    input_text = gr.Textbox(
        label="Text to translate",
        placeholder="Enter text to translate",
        lines=5, render=True)

    # One result box per translation engine
    output_text_m2m100 = gr.Textbox(
        label="Facebook m2m100 (418M)", lines=4, render=True)
    output_text_google_translate = gr.Textbox(
        label="Google Translate", lines=4, render=True)

    # Language selectors, grouped in a single row
    with gr.Row():
        src_lang = gr.Dropdown(
            label="Source language",
            value="auto",
            choices=model_translation.language_codes.items(),
            render=True)
        tgt_lang = gr.Dropdown(
            label="Target language",
            value="en",
            choices=model_translation.tgt_language_codes.items(),
            render=True)

    # The button feeds the text and both language choices to translate_text
    # and writes its two results into the two output boxes.
    translate_btn = gr.Button("Translate")
    translate_btn.click(
        outputs=[output_text_m2m100, output_text_google_translate],
        inputs=[input_text, src_lang, tgt_lang],
        fn=translate_text,
    )

    # Collapsed-by-default documentation panel
    with gr.Accordion("Documentation", open=False):
        gr.Markdown(
            "\n- Models: serving Facebook M2M100 (418M) and Google Translate.\n")

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()