Model Information
NA_base is a state-of-the-art open-source Text-to-Speech (TTS) model designed for high-quality, real-time speech synthesis. Built on modern neural architectures, NA_base is optimized for speed, efficiency, and multilingual support, making it a strong choice for developers, businesses, and researchers.
Key Features:
- Supports 15 languages
- Fast real-time inference
- Natural-sounding, human-like speech
- Designed for deployment in cloud, edge, and offline environments
How It Works
NA_base leverages deep learning-based neural TTS techniques to synthesize speech from raw text. It frames TTS as language modeling: a causal language model generates discrete speech tokens from the input text, and the xcodec2 codec decodes those tokens into a 16 kHz waveform. The model is lightweight, efficient, and trained on high-quality datasets for robust generalization.
Usage
Install the required dependencies:
pip install xcodec2
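The examples below also import torch, transformers, and soundfile; if they are not already present, install them as well:
pip install torch transformers soundfile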
Synthesizing Speech from Text
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf
# Define the model
model_name = "NeuralAudioAI/NA_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")
from xcodec2.modeling_xcodec2 import XCodec2Model
# Load the Codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()
# Input text for synthesis
input_text = "Dealing with family secrets is never easy. Yet, sometimes, omission is a form of protection, intending to safeguard some from the harsh truths. One day, I hope you understand the reasons behind my actions. Until then, please, bear with me."
def ids_to_speech_tokens(speech_ids):
    """Convert speech IDs into token strings."""
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """Extract speech token IDs from the token strings."""
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids
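# Example round-trip (hypothetical values): ids_to_speech_tokens([123, 42]) returns
# ['<|s_123|>', '<|s_42|>'], and extract_speech_ids(['<|s_123|>', '<|s_42|>']) recovers [123, 42].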
# TTS Generation
with torch.no_grad():
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the input
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
    ]
    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")

    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate speech tokens
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Adjusts the diversity of generated content
        temperature=0.8,  # Controls randomness in output
    )

    # Extract the generated speech tokens
    generated_ids = outputs[0][input_ids.shape[1]:-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

# Save generated audio
sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)
Synthesizing Speech with a Speech Prompt
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf
# Define the model
model_name = "NeuralAudioAI/NA_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval().to("cuda")
from xcodec2.modeling_xcodec2 import XCodec2Model
# Load the Codec model
codec_model_path = "NeuralAudioAI/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().cuda()
# Only 16 kHz speech is supported!
prompt_wav, sr = sf.read("prompt.wav") # Use an appropriate prompt speech file
prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)
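# Note: xcodec2 expects 16 kHz audio. If prompt.wav uses a different sample rate,
# resample it before encoding. A minimal sketch, assuming torchaudio is installed:
#   import torchaudio
#   prompt_wav = torchaudio.functional.resample(prompt_wav, orig_freq=sr, new_freq=16000)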
# Define input text (prompt_text should match the spoken content of prompt.wav)
prompt_text = "This is a sample prompt speech input."
target_text = "This is the generated speech continuation."
input_text = prompt_text + target_text
def ids_to_speech_tokens(speech_ids):
    """Convert speech IDs into token strings."""
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]

def extract_speech_ids(speech_tokens_str):
    """Extract speech token IDs from the token strings."""
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            speech_ids.append(int(num_str))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids
# TTS Generation with Speech Prompt
with torch.no_grad():
    # Encode the prompt wav
    vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
    print("Prompt VQ Code Shape:", vq_code_prompt.shape)

    vq_code_prompt = vq_code_prompt[0, 0, :]

    # Convert int 12345 to token <|s_12345|>
    speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

    # Tokenize the text and speech prefix
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
    ]
    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True
    ).to("cuda")

    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate the speech autoregressively
    outputs = model.generate(
        input_ids,
        max_length=2048,  # Trained with a max length of 2048
        eos_token_id=speech_end_id,
        do_sample=True,
        top_p=1,          # Adjusts the diversity of generated content
        temperature=0.8,
    )

    # Extract the speech tokens (including the prompt prefix, which can be trimmed later)
    generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Convert to tensor for decoding
    speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)

    # Decode to waveform
    gen_wav = Codec_model.decode_code(speech_tokens)

    # If only the newly generated part is needed:
    # gen_wav = gen_wav[:, :, prompt_wav.shape[1]:]

# Save generated audio
sf.write("gen.wav", gen_wav[0, 0, :].cpu().numpy(), 16000)