---
license: mit
license_name: neuralaudioai-license
license_link: LICENSE
language:
- en
- es
- ko
- zh
- pl
- pt
- uk
- de
- fr
- el
- ru
pipeline_tag: text-to-speech
library_name: transformers
tags:
- tts
---

[![Open Demo](https://img.shields.io/badge/🤗-Open%20Demo-blue.svg)](https://neuralaudioai-na-base.hf.space)

## Model Information

**NA_base** is a **state-of-the-art** open-source Text-to-Speech (TTS) model designed for **high-quality, real-time speech synthesis**. Built as a causal language model over discrete speech codec tokens, **NA_base** is optimized for **speed, efficiency, and multilingual support**, making it a strong choice for developers, businesses, and researchers.

**Key Features**:
- Supports **15 languages**
- **Fast real-time inference**
- Natural-sounding, **human-like speech**
- Designed for **deployment in cloud, edge, and offline environments**

## How It Works

**NA_base** synthesizes speech from raw text in two stages: the language model predicts discrete speech tokens from the input text, and the **xcodec2** neural codec decodes those tokens into a 16 kHz waveform. The model is **lightweight**, efficient, and trained on high-quality datasets for robust generalization.

## Usage

Install the required dependencies (the examples below use `transformers`, `torch`, and `soundfile` in addition to `xcodec2`):

```bash
pip install transformers torch soundfile xcodec2
```
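Both examples below place the models on a CUDA GPU. If you are unsure whether one is available, a minimal check (illustrative only; the `device` variable is not part of the original examples) is:

```python
import torch

# The snippets below hard-code "cuda"; substitute this device string
# everywhere if you need a CPU fallback (much slower, but functional).
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device}")
```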
### Synthesizing Speech from Text

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf

from xcodec2.modeling_xcodec2 import XCodec2Model

# Load the language model
model_name = "NeuralAudioAI/NA_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

# Load the codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()

# Input text for synthesis
input_text = "Dealing with family secrets is never easy. Yet, sometimes, omission is a form of protection, intending to safeguard some from the harsh truths. One day, I hope you understand the reasons behind my actions. Until then, please, bear with me."

def extract_speech_ids(speech_tokens_str):
    """Extract speech token IDs from token strings like <|s_12345|>."""
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS generation
with torch.no_grad():
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the input
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
    ]
    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")
    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate speech tokens
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Adjusts the diversity of generated content
        temperature=0.8,  # Controls randomness in output
    )

    # Extract the generated speech tokens, dropping the prompt and the end-of-speech token
    generated_ids = outputs[0][input_ids.shape[1]:-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

# Save generated audio (16 kHz mono)
sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
```
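The script writes a 16 kHz mono file, `gen.wav`. A quick way to sanity-check the output, reusing the `soundfile` dependency already installed:

```python
import soundfile as sf

# Load the generated file and report its basic properties.
wav, sr = sf.read("gen.wav")
print(f"Sample rate: {sr} Hz, duration: {len(wav) / sr:.2f} s")
```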
### Synthesizing Speech with a Voice Prompt

To synthesize speech in a specific voice, provide a short reference recording together with its transcript (`prompt_text`). The prompt's codec tokens are fed to the model as a prefix, so the generated continuation preserves the prompt speaker's voice.

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf

from xcodec2.modeling_xcodec2 import XCodec2Model

# Load the language model
model_name = "NeuralAudioAI/NA_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

# Load the codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()

# Only 16 kHz speech is supported!
prompt_wav, sr = sf.read("prompt.wav")  # Use an appropriate prompt speech file
prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)

# The input text is the prompt transcript followed by the text to synthesize
prompt_text = "This is a sample prompt speech input."
target_text = "This is the generated speech continuation."
input_text = prompt_text + target_text

def ids_to_speech_tokens(speech_ids):
    """Convert speech IDs into token strings like <|s_12345|>."""
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """Extract speech token IDs from token strings like <|s_12345|>."""
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS generation with a speech prompt
with torch.no_grad():
    # Encode the prompt wav into codec codes
    vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
    print("Prompt VQ code shape:", vq_code_prompt.shape)
    vq_code_prompt = vq_code_prompt[0, 0, :]

    # Convert each ID, e.g. 12345, to its token string <|s_12345|>
    speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the text and the speech prefix
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
    ]
    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")
    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate the speech tokens autoregressively
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Adjusts the diversity of generated content
        temperature=0.8,  # Controls randomness in output
    )

    # Extract the speech tokens, keeping the prompt's speech prefix so the codec
    # decodes one seamless waveform
    generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

    # To keep only the generated continuation, trim the prompt portion:
    # gen_wav = gen_wav[:, :, prompt_wav.shape[1]:]

# Save generated audio (16 kHz mono)
sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
```
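The codec operates on 16 kHz audio only. If your prompt recording uses a different sample rate, resample it before encoding; a minimal sketch using `torchaudio` (an extra dependency, not used in the examples above):

```python
import soundfile as sf
import torch
import torchaudio

# Load an arbitrary mono prompt recording and resample it
# to the 16 kHz the codec expects before encoding.
prompt_wav, sr = sf.read("prompt.wav")
prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)
if sr != 16000:
    prompt_wav = torchaudio.functional.resample(prompt_wav, orig_freq=sr, new_freq=16000)
```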