---
license: mit
license_name: neuralaudioai-license
license_link: LICENSE
language:
- en
- es
- ko
- zh
- pl
- pt
- uk
- de
- fr
- el
- ru
pipeline_tag: text-to-speech
library_name: transformers
tags:
- tts
---

[![Open Demo](https://img.shields.io/badge/🤗-Open%20Demo-blue.svg)](https://neuralaudioai-na-base.hf.space)

## Model Information

**NA_base** is a **state-of-the-art** open-source Text-to-Speech (TTS) model designed for **high-quality, real-time speech synthesis**. Built as a causal language model over discrete speech codec tokens, **NA_base** is optimized for **speed, efficiency, and multilingual support**, making it a strong choice for developers, businesses, and researchers.

**Key Features**:
- Supports **15 languages**
- **Fast real-time inference**
- Natural-sounding, **human-like speech**
- Designed for **deployment in cloud, edge, and offline environments**

## How It Works

**NA_base** synthesizes speech from raw text in two stages: the language model predicts discrete speech tokens from the input text, and the **xcodec2** neural codec decodes those tokens into a 16 kHz waveform. The model is **lightweight**, efficient, and trained on high-quality datasets for robust generalization.

## Usage

Install the required dependencies (the examples below use `transformers`, `torch`, and `soundfile` in addition to `xcodec2`):

```bash
pip install transformers torch soundfile xcodec2
```
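Both examples below place the models on a CUDA GPU. If you are unsure whether one is available, a minimal check (illustrative only; the `device` variable is not part of the original examples) is:

```python
import torch

# The snippets below hard-code "cuda"; substitute this device string
# everywhere if you need a CPU fallback (much slower, but functional).
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device}")
```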
### Synthesizing Speech from Text

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf

from xcodec2.modeling_xcodec2 import XCodec2Model

# Load the language model
model_name = "NeuralAudioAI/NA_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

# Load the codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()

# Input text for synthesis
input_text = "Dealing with family secrets is never easy. Yet, sometimes, omission is a form of protection, intending to safeguard some from the harsh truths. One day, I hope you understand the reasons behind my actions. Until then, please, bear with me."

def extract_speech_ids(speech_tokens_str):
    """Extract speech token IDs from token strings like <|s_12345|>."""
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS generation
with torch.no_grad():
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the input
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
    ]
    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")
    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate speech tokens
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Adjusts the diversity of generated content
        temperature=0.8,  # Controls randomness in output
    )

    # Extract the generated speech tokens, dropping the prompt and the end-of-speech token
    generated_ids = outputs[0][input_ids.shape[1]:-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

# Save generated audio (16 kHz mono)
sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
```
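The script writes a 16 kHz mono file, `gen.wav`. A quick way to sanity-check the output, reusing the `soundfile` dependency already installed:

```python
import soundfile as sf

# Load the generated file and report its basic properties.
wav, sr = sf.read("gen.wav")
print(f"Sample rate: {sr} Hz, duration: {len(wav) / sr:.2f} s")
```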
### Synthesizing Speech with a Voice Prompt

To synthesize speech in a specific voice, provide a short reference recording together with its transcript (`prompt_text`). The prompt's codec tokens are fed to the model as a prefix, so the generated continuation preserves the prompt speaker's voice.

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf

from xcodec2.modeling_xcodec2 import XCodec2Model

# Load the language model
model_name = "NeuralAudioAI/NA_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

# Load the codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()

# Only 16 kHz speech is supported!
prompt_wav, sr = sf.read("prompt.wav")  # Use an appropriate prompt speech file
prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)

# The input text is the prompt transcript followed by the text to synthesize
prompt_text = "This is a sample prompt speech input."
target_text = "This is the generated speech continuation."
input_text = prompt_text + target_text

def ids_to_speech_tokens(speech_ids):
    """Convert speech IDs into token strings like <|s_12345|>."""
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """Extract speech token IDs from token strings like <|s_12345|>."""
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS generation with a speech prompt
with torch.no_grad():
    # Encode the prompt wav into codec codes
    vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
    print("Prompt VQ code shape:", vq_code_prompt.shape)
    vq_code_prompt = vq_code_prompt[0, 0, :]

    # Convert each ID, e.g. 12345, to its token string <|s_12345|>
    speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the text and the speech prefix
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
    ]
    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")
    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate the speech tokens autoregressively
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Adjusts the diversity of generated content
        temperature=0.8,  # Controls randomness in output
    )

    # Extract the speech tokens, keeping the prompt's speech prefix so the codec
    # decodes one seamless waveform
    generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

    # To keep only the generated continuation, trim the prompt portion:
    # gen_wav = gen_wav[:, :, prompt_wav.shape[1]:]

# Save generated audio (16 kHz mono)
sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
```
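The codec operates on 16 kHz audio only. If your prompt recording uses a different sample rate, resample it before encoding; a minimal sketch using `torchaudio` (an extra dependency, not used in the examples above):

```python
import soundfile as sf
import torch
import torchaudio

# Load an arbitrary mono prompt recording and resample it
# to the 16 kHz the codec expects before encoding.
prompt_wav, sr = sf.read("prompt.wav")
prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)
if sr != 16000:
    prompt_wav = torchaudio.functional.resample(prompt_wav, orig_freq=sr, new_freq=16000)
```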