
Model Information

NA_base is a state-of-the-art open-source Text-to-Speech (TTS) model designed for high-quality, real-time speech synthesis. Built on cutting-edge neural architectures, NA_base is optimized for speed, efficiency, and multilingual support, making it well suited for developers, businesses, and researchers.

Key Features:

  • Supports 15 languages
  • Fast real-time inference
  • Natural-sounding, human-like speech
  • Designed for deployment in cloud, edge, and offline environments

How It Works

NA_base uses a two-stage neural TTS pipeline: a causal language model first converts raw text into a sequence of discrete speech tokens, and the XCodec2 codec then decodes those tokens into a 16 kHz waveform. The model is lightweight, efficient, and trained on high-quality datasets for robust generalization.
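Concretely, the speech tokens are plain strings of the form <|s_N|>, where N is an integer codebook index. A minimal sketch of the round-trip between IDs and token strings (the IDs below are illustrative; the same helpers reappear in the full examples under Usage):

def ids_to_speech_tokens(speech_ids):
    """ Convert integer speech IDs into token strings like <|s_4051|> """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """ Parse token strings back into integer speech IDs """
    return [int(t[4:-2]) for t in speech_tokens_str if t.startswith('<|s_') and t.endswith('|>')]

# Round-trip: [4051, 812] -> ["<|s_4051|>", "<|s_812|>"] -> [4051, 812]
assert extract_speech_ids(ids_to_speech_tokens([4051, 812])) == [4051, 812]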

Usage

Install the required dependencies (the examples below also use transformers, torch, and soundfile):

pip install xcodec2 transformers torch soundfile

Synthesizing Speech from Text

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf

# Define the model
model_name = "NeuralAudioAI/NA_base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

from xcodec2.modeling_xcodec2 import XCodec2Model

# Load the Codec model
codec_model_path = "NeuralAudioAI/xcodec2"  
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()   

# Input text for synthesis
input_text = "Dealing with family secrets is never easy. Yet, sometimes, omission is a form of protection, intending to safeguard some from the harsh truths. One day, I hope you understand the reasons behind my actions. Until then, please, bear with me."

def ids_to_speech_tokens(speech_ids):
    """ Convert speech IDs into token strings """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """ Extract speech token IDs from the token strings """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS Generation
with torch.no_grad():
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the input
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
    ]

    input_ids = tokenizer.apply_chat_template(
        chat, 
        tokenize=True, 
        return_tensors='pt', 
        continue_final_message=True
    ).to("cuda")

    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate speech tokens
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # 1 = no nucleus filtering; lower values constrain sampling
        temperature=0.8,  # Controls randomness in the output
    )

    # Extract the generated speech tokens
    generated_ids = outputs[0][input_ids.shape[1]:-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)  
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

# Save generated audio
sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
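For repeated synthesis it can be convenient to wrap the steps above in a helper. A minimal sketch (the function name synthesize is ours, not part of the model API) that reuses the model, tokenizer, Codec_model, and extract_speech_ids objects defined above:

def synthesize(text):
    """ Run text -> speech tokens -> waveform; return 16 kHz audio as a numpy array """
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{text}<|TEXT_UNDERSTANDING_END|>"
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
    ]
    input_ids = tokenizer.apply_chat_template(
        chat, tokenize=True, return_tensors='pt', continue_final_message=True
    ).to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=2048,
            eos_token_id=tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>'),
            do_sample=True,
            top_p=1,
            temperature=0.8,
        )
        speech_ids = extract_speech_ids(
            tokenizer.batch_decode(outputs[0][input_ids.shape[1]:-1], skip_special_tokens=True)
        )
        gen_wav = Codec_model.decode_code(
            torch.tensor(speech_ids).cuda().unsqueeze(0).unsqueeze(0)
        )
    return gen_wav[0, 0, :].cpu().numpy()

# Example: synthesize several utterances in one session
for i, sentence in enumerate(["Hello there.", "How are you today?"]):
    sf.write(f"gen_{i}.wav", synthesize(sentence), 16000)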

Synthesizing Speech with a Speech Prompt (Voice Cloning)

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf

# Define the model
model_name = "NeuralAudioAI/NA_base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")

from xcodec2.modeling_xcodec2 import XCodec2Model

# Load the Codec model
codec_model_path = "NeuralAudioAI/xcodec2"  
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()   

# The codec only supports 16 kHz audio (see the resampling note after this example)
prompt_wav, sr = sf.read("prompt.wav")  # a mono 16 kHz prompt speech file
prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)  # shape: (1, samples)

# prompt_text should be the transcript of prompt.wav; target_text is the new
# content to synthesize in the prompt speaker's voice
prompt_text = "This is a sample prompt speech input."
target_text = "This is the generated speech continuation."
input_text = prompt_text + target_text

def ids_to_speech_tokens(speech_ids):
    """ Convert speech IDs into token strings """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """ Extract speech token IDs from the token strings """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

# TTS Generation with Speech Prompt
with torch.no_grad():
    # Encode the prompt wav
    vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
    print("Prompt VQ Code Shape:", vq_code_prompt.shape)   

    vq_code_prompt = vq_code_prompt[0, 0, :]
    # Convert int 12345 to token <|s_12345|>
    speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the text and speech prefix
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
    ]

    input_ids = tokenizer.apply_chat_template(
        chat, 
        tokenize=True, 
        return_tensors='pt', 
        continue_final_message=True
    ).to("cuda")

    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate the speech autoregressively
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # 1 = no nucleus filtering; lower values constrain sampling
        temperature=0.8,  # Controls randomness in the output
    )

    # Extract the speech tokens
    generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)  
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

    # If you only need the newly generated portion, trim off the prompt samples:
    # gen_wav = gen_wav[:, :, prompt_wav.shape[1]:]

# Save generated audio
sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
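The codec only accepts 16 kHz audio, so a prompt recorded at another sample rate must be resampled before encoding. A minimal sketch using torchaudio (an extra dependency, not required by the model itself):

import torchaudio

# torchaudio returns a (channels, samples) float tensor plus the sample rate
prompt_wav, sr = torchaudio.load("prompt.wav")
prompt_wav = prompt_wav.mean(dim=0, keepdim=True)  # downmix stereo to mono
if sr != 16000:
    # Resample to the 16 kHz rate the codec expects
    prompt_wav = torchaudio.functional.resample(prompt_wav, orig_freq=sr, new_freq=16000)
# prompt_wav now has shape (1, samples) and can replace the sf.read-based loading above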
Model size: 938M parameters (F32, Safetensors)