# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Based on https://github.com/NVIDIA/flowtron/blob/master/data.py
# Original license text:
###############################################################################
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
"""adapted from https://github.com/keithito/tacotron"""
import re
from string import punctuation
from functools import reduce
import torch
import torch.utils.data

#########
# REGEX #
#########

# Regular expression matching text enclosed in curly braces for encoding:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")

# Regular expression separating plain words and curly-brace groups for cleaning:
_arpa_re = re.compile(r"{[^}]+}|\S+")
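
# Illustrative examples of the two regexes above:
#
#   _curly_re.match("слово {S L O V O} ще").groups()
#       -> ("слово ", "S L O V O", " ще")
#   _arpa_re.findall("слово {S L O V O} ще")
#       -> ["слово", "{S L O V O}", "ще"]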


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def remove_space_before_punctuation(text):
    # re.escape keeps the regex metacharacters in string.punctuation
    # literal inside the character class.
    return re.sub(r"\s([{}](?:\s|$))".format(re.escape(punctuation)), r"\1", text)


class Cleaner:
    def __init__(self, cleaner_names, phonemedict):
        self.cleaner_names = cleaner_names
        self.phonemedict = phonemedict

    def __call__(self, text):
        for cleaner_name in self.cleaner_names:
            sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
            for fn in sequence_fns:
                text = fn(text)

            # Apply word-level cleaners to every token except curly-brace
            # (phoneme) groups, which pass through unchanged:
            text = [
                reduce(lambda x, y: y(x), word_fns, split) if split[0] != "{" else split
                for split in _arpa_re.findall(text)
            ]
            text = " ".join(text)
        text = remove_space_before_punctuation(text)
        return text

    def get_cleaner_fns(self, cleaner_name):
        # This adapted version applies the same basic cleaners regardless of
        # cleaner_name and defines no word-level cleaners.
        sequence_fns = [lowercase, collapse_whitespace]
        word_fns = []
        return sequence_fns, word_fns
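
# A minimal sketch of Cleaner in use (the cleaner name is arbitrary here,
# since get_cleaner_fns ignores it):
#
#   cleaner = Cleaner(["basic_cleaners"], phonemedict={})
#   cleaner("Привіт ,   Світ !")  # -> "привіт, світ!"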


def get_symbols():
    _punctuation = "'.,?! "
    _special = "-+"
    _letters = "абвгґдежзийклмнопрстуфхцчшщьюяєії"
    symbols = list(_punctuation + _special + _letters)
    return symbols
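
# The inventory holds 6 punctuation marks, 2 special symbols, and the
# 33 letters of the Ukrainian alphabet:
#   len(get_symbols())  # -> 41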


class TextProcessing:
    def __init__(
        self,
        symbol_set,
        cleaner_name,
        heteronyms_path,
        phoneme_dict_path,
        p_phoneme,
        handle_phoneme,
        handle_phoneme_ambiguous,
        prepend_space_to_text=False,
        append_space_to_text=False,
        add_bos_eos_to_text=False,
        encoding="latin-1",
    ):
        self.phonemedict = {}
        self.p_phoneme = p_phoneme
        self.handle_phoneme = handle_phoneme
        self.handle_phoneme_ambiguous = handle_phoneme_ambiguous

        self.symbols = get_symbols()
        self.cleaner_names = cleaner_name
        self.cleaner = Cleaner(cleaner_name, self.phonemedict)

        self.prepend_space_to_text = prepend_space_to_text
        self.append_space_to_text = append_space_to_text
        self.add_bos_eos_to_text = add_bos_eos_to_text
        if add_bos_eos_to_text:
            self.symbols.append("<bos>")
            self.symbols.append("<eos>")

        # Mappings from symbol to numeric ID and vice versa:
        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text):
        sequence = []

        # Check for curly braces and treat their contents as phonemes:
        while len(text):
            m = _curly_re.match(text)
            if not m:
                sequence += self.symbols_to_sequence(text)
                break
            sequence += self.symbols_to_sequence(m.group(1))
            sequence += self.phoneme_to_sequence(m.group(2))
            text = m.group(3)

        return sequence

    def phoneme_to_sequence(self, phonemes):
        # Follows the "@"-prefix convention of the upstream tacotron/flowtron
        # code; with the grapheme-only symbol set above, every "@"-prefixed
        # token is unknown and is therefore dropped.
        return self.symbols_to_sequence(["@" + s for s in phonemes.split()])

    def sequence_to_text(self, sequence):
        result = ""
        for symbol_id in sequence:
            if symbol_id in self.id_to_symbol:
                s = self.id_to_symbol[symbol_id]
                # Enclose phonemes back in curly braces:
                if len(s) > 1 and s[0] == "@":
                    s = "{%s}" % s[1:]
                result += s
        return result.replace("}{", " ")

    def clean_text(self, text):
        text = self.cleaner(text)
        return text

    def symbols_to_sequence(self, symbols):
        # Symbols missing from the inventory are silently dropped.
        return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id]

    def encode_text(self, text, return_all=False):
        text_clean = self.clean_text(text)
        text = text_clean
        text_encoded = self.text_to_sequence(text)

        if self.prepend_space_to_text:
            text_encoded.insert(0, self.symbol_to_id[" "])
        if self.append_space_to_text:
            text_encoded.append(self.symbol_to_id[" "])
        if self.add_bos_eos_to_text:
            text_encoded.insert(0, self.symbol_to_id["<bos>"])
            text_encoded.append(self.symbol_to_id["<eos>"])

        if return_all:
            return text_encoded, text_clean
        return text_encoded
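
# A minimal usage sketch; the argument values below are illustrative
# assumptions (several parameters are kept only for interface compatibility
# and go unused in this adapted version):
#
#   tp = TextProcessing(
#       symbol_set="ukrainian",
#       cleaner_name=["ukrainian_cleaners"],
#       heteronyms_path=None,
#       phoneme_dict_path=None,
#       p_phoneme=0.0,
#       handle_phoneme="word",
#       handle_phoneme_ambiguous="ignore",
#   )
#   tp.encode_text("Привіт!")          # -> [24, 25, 17, 10, 39, 27, 4]
#   tp.sequence_to_text([24, 25, 17])  # -> "при"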


class TextProcessor(torch.utils.data.Dataset):
    def __init__(
        self,
        datasets,
        filter_length,
        hop_length,
        win_length,
        sampling_rate,
        n_mel_channels,
        mel_fmin,
        mel_fmax,
        f0_min,
        f0_max,
        max_wav_value,
        use_f0,
        use_energy_avg,
        use_log_f0,
        use_scaled_energy,
        symbol_set,
        cleaner_names,
        heteronyms_path,
        phoneme_dict_path,
        p_phoneme,
        handle_phoneme="word",
        handle_phoneme_ambiguous="ignore",
        speaker_ids=None,
        include_speakers=None,
        n_frames=-1,
        use_attn_prior_masking=True,
        prepend_space_to_text=True,
        append_space_to_text=True,
        add_bos_eos_to_text=False,
        betabinom_cache_path="",
        betabinom_scaling_factor=0.05,
        lmdb_cache_path="",
        dur_min=None,
        dur_max=None,
        combine_speaker_and_emotion=False,
        **kwargs,
    ):
        self.tp = TextProcessing(
            symbol_set,
            cleaner_names,
            heteronyms_path,
            phoneme_dict_path,
            p_phoneme=p_phoneme,
            handle_phoneme=handle_phoneme,
            handle_phoneme_ambiguous=handle_phoneme_ambiguous,
            prepend_space_to_text=prepend_space_to_text,
            append_space_to_text=append_space_to_text,
            add_bos_eos_to_text=add_bos_eos_to_text,
        )