# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Based on https://github.com/NVIDIA/flowtron/blob/master/data.py
# Original license text:
###############################################################################
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
"""adapted from https://github.com/keithito/tacotron"""
import re
from string import punctuation
from functools import reduce
import torch
import torch.utils.data

#########
# REGEX #
#########

# Regular expression matching text enclosed in curly braces for encoding:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")

# Regular expression separating plain words and curly-brace groups for cleaning:
_arpa_re = re.compile(r"{[^}]+}|\S+")
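
# Illustrative examples of the two regexes above:
#
#   _curly_re.match("слово {S L O V O} ще").groups()
#       -> ("слово ", "S L O V O", " ще")
#   _arpa_re.findall("слово {S L O V O} ще")
#       -> ["слово", "{S L O V O}", "ще"]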


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def remove_space_before_punctuation(text):
    # re.escape keeps the regex metacharacters in string.punctuation
    # literal inside the character class.
    return re.sub(r"\s([{}](?:\s|$))".format(re.escape(punctuation)), r"\1", text)


class Cleaner:
    def __init__(self, cleaner_names, phonemedict):
        self.cleaner_names = cleaner_names
        self.phonemedict = phonemedict

    def __call__(self, text):
        for cleaner_name in self.cleaner_names:
            sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
            for fn in sequence_fns:
                text = fn(text)

            # Apply word-level cleaners to every token except curly-brace
            # (phoneme) groups, which pass through unchanged:
            text = [
                reduce(lambda x, y: y(x), word_fns, split) if split[0] != "{" else split
                for split in _arpa_re.findall(text)
            ]
            text = " ".join(text)
        text = remove_space_before_punctuation(text)
        return text

    def get_cleaner_fns(self, cleaner_name):
        # This adapted version applies the same basic cleaners regardless of
        # cleaner_name and defines no word-level cleaners.
        sequence_fns = [lowercase, collapse_whitespace]
        word_fns = []
        return sequence_fns, word_fns
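
# A minimal sketch of Cleaner in use (the cleaner name is arbitrary here,
# since get_cleaner_fns ignores it):
#
#   cleaner = Cleaner(["basic_cleaners"], phonemedict={})
#   cleaner("Привіт ,   Світ !")  # -> "привіт, світ!"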


def get_symbols():
    _punctuation = "'.,?! "
    _special = "-+"
    _letters = "абвгґдежзийклмнопрстуфхцчшщьюяєії"
    symbols = list(_punctuation + _special + _letters)
    return symbols
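
# The inventory holds 6 punctuation marks, 2 special symbols, and the
# 33 letters of the Ukrainian alphabet:
#   len(get_symbols())  # -> 41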


class TextProcessing:
    def __init__(
        self,
        symbol_set,
        cleaner_name,
        heteronyms_path,
        phoneme_dict_path,
        p_phoneme,
        handle_phoneme,
        handle_phoneme_ambiguous,
        prepend_space_to_text=False,
        append_space_to_text=False,
        add_bos_eos_to_text=False,
        encoding="latin-1",
    ):
        self.phonemedict = {}
        self.p_phoneme = p_phoneme
        self.handle_phoneme = handle_phoneme
        self.handle_phoneme_ambiguous = handle_phoneme_ambiguous

        self.symbols = get_symbols()
        self.cleaner_names = cleaner_name
        self.cleaner = Cleaner(cleaner_name, self.phonemedict)

        self.prepend_space_to_text = prepend_space_to_text
        self.append_space_to_text = append_space_to_text
        self.add_bos_eos_to_text = add_bos_eos_to_text
        if add_bos_eos_to_text:
            self.symbols.append("<bos>")
            self.symbols.append("<eos>")

        # Mappings from symbol to numeric ID and vice versa:
        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text):
        sequence = []

        # Check for curly braces and treat their contents as phonemes:
        while len(text):
            m = _curly_re.match(text)
            if not m:
                sequence += self.symbols_to_sequence(text)
                break
            sequence += self.symbols_to_sequence(m.group(1))
            sequence += self.phoneme_to_sequence(m.group(2))
            text = m.group(3)

        return sequence

    def phoneme_to_sequence(self, phonemes):
        # Follows the "@"-prefix convention of the upstream tacotron/flowtron
        # code; with the grapheme-only symbol set above, every "@"-prefixed
        # token is unknown and is therefore dropped.
        return self.symbols_to_sequence(["@" + s for s in phonemes.split()])

    def sequence_to_text(self, sequence):
        result = ""
        for symbol_id in sequence:
            if symbol_id in self.id_to_symbol:
                s = self.id_to_symbol[symbol_id]
                # Enclose phonemes back in curly braces:
                if len(s) > 1 and s[0] == "@":
                    s = "{%s}" % s[1:]
                result += s
        return result.replace("}{", " ")

    def clean_text(self, text):
        text = self.cleaner(text)
        return text

    def symbols_to_sequence(self, symbols):
        # Symbols missing from the inventory are silently dropped.
        return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id]

    def encode_text(self, text, return_all=False):
        text_clean = self.clean_text(text)
        text = text_clean
        text_encoded = self.text_to_sequence(text)

        if self.prepend_space_to_text:
            text_encoded.insert(0, self.symbol_to_id[" "])
        if self.append_space_to_text:
            text_encoded.append(self.symbol_to_id[" "])
        if self.add_bos_eos_to_text:
            text_encoded.insert(0, self.symbol_to_id["<bos>"])
            text_encoded.append(self.symbol_to_id["<eos>"])

        if return_all:
            return text_encoded, text_clean
        return text_encoded
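
# A minimal usage sketch; the argument values below are illustrative
# assumptions (several parameters are kept only for interface compatibility
# and go unused in this adapted version):
#
#   tp = TextProcessing(
#       symbol_set="ukrainian",
#       cleaner_name=["ukrainian_cleaners"],
#       heteronyms_path=None,
#       phoneme_dict_path=None,
#       p_phoneme=0.0,
#       handle_phoneme="word",
#       handle_phoneme_ambiguous="ignore",
#   )
#   tp.encode_text("Привіт!")          # -> [24, 25, 17, 10, 39, 27, 4]
#   tp.sequence_to_text([24, 25, 17])  # -> "при"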


class TextProcessor(torch.utils.data.Dataset):
    def __init__(
        self,
        datasets,
        filter_length,
        hop_length,
        win_length,
        sampling_rate,
        n_mel_channels,
        mel_fmin,
        mel_fmax,
        f0_min,
        f0_max,
        max_wav_value,
        use_f0,
        use_energy_avg,
        use_log_f0,
        use_scaled_energy,
        symbol_set,
        cleaner_names,
        heteronyms_path,
        phoneme_dict_path,
        p_phoneme,
        handle_phoneme="word",
        handle_phoneme_ambiguous="ignore",
        speaker_ids=None,
        include_speakers=None,
        n_frames=-1,
        use_attn_prior_masking=True,
        prepend_space_to_text=True,
        append_space_to_text=True,
        add_bos_eos_to_text=False,
        betabinom_cache_path="",
        betabinom_scaling_factor=0.05,
        lmdb_cache_path="",
        dur_min=None,
        dur_max=None,
        combine_speaker_and_emotion=False,
        **kwargs,
    ):
        self.tp = TextProcessing(
            symbol_set,
            cleaner_names,
            heteronyms_path,
            phoneme_dict_path,
            p_phoneme=p_phoneme,
            handle_phoneme=handle_phoneme,
            handle_phoneme_ambiguous=handle_phoneme_ambiguous,
            prepend_space_to_text=prepend_space_to_text,
            append_space_to_text=append_space_to_text,
            add_bos_eos_to_text=add_bos_eos_to_text,
        )