Fixes to the codebase
- app.py +23 -49
- attribute_prediction_model.py +2 -0
- audio_processing.py +38 -37
- autoregressive_flow.py +2 -6
- common.py +32 -41
- data.py +10 -153
- export_weights.py +14 -0
- loss.py +0 -228
- partialconv1d.py +1 -2
- radam.py +0 -114
- radtts.py +18 -24
- requirements.txt +0 -3
- torch_env.py +19 -0
- tts_text_processing/abbreviations.py +0 -57
- tts_text_processing/acronyms.py +0 -69
- tts_text_processing/cleaners.py +4 -75
- tts_text_processing/cmudict.py +0 -140
- tts_text_processing/datestime.py +0 -24
- tts_text_processing/grapheme_dictionary.py +0 -37
- tts_text_processing/heteronyms +0 -413
- tts_text_processing/letters_and_numbers.py +0 -96
- tts_text_processing/numerical.py +0 -175
- tts_text_processing/symbols.py +0 -144
- tts_text_processing/text_processing.py +11 -13
app.py
CHANGED
@@ -6,38 +6,32 @@ import time
 from importlib.metadata import version
 from enum import Enum
 
-use_zerogpu = False
-
-import spaces  # it's for ZeroGPU
-
+import torch
+import torchaudio
+
+from huggingface_hub import hf_hub_download
+
+# RAD-TTS code
+from radtts import RADTTS
+from data import Data
+from common import update_params
+from torch_env import device
 
 import gradio as gr
 
-import torch
-import torchaudio
 
 # Vocos
 from vocos import Vocos
 
-from radtts import RADTTS
-from data import Data
-from common import update_params
-
-print("
-
-device = "cpu"
+use_zerogpu = False
+
+try:
+    import spaces  # it's for ZeroGPU
+
+    use_zerogpu = True
+    print("ZeroGPU is available, changing inference call.")
+except ImportError:
+    print("ZeroGPU is not available, skipping...")
 
 
 def download_file_from_repo(
@@ -65,15 +59,13 @@ def download_file_from_repo(
 
 download_file_from_repo(
     "Yehor/radtts-uk",
-    "radtts-pp-dap-model/model_dap_84000.pt",
+    "radtts-pp-dap-model/model_dap_84000_state.pt",
     "./models/",
 )
 
 # Init the model
-seed = 1234
-
 config = "configs/radtts-pp-dap-model.json"
-radtts_path = "models/radtts-pp-dap-model/model_dap_84000.pt"
+radtts_path = "models/radtts-pp-dap-model/model_dap_84000_state.pt"
 
 params = []
 
@@ -87,19 +79,11 @@ update_params(config, params)
 data_config = config["data_config"]
 model_config = config["model_config"]
 
-# Seed
-if use_cuda:
-    torch.cuda.manual_seed(seed)
-else:
-    torch.manual_seed(seed)
-
 # Load vocoder
 vocos = Vocos.from_pretrained("patriotyk/vocos-mel-hifigan-compat-44100khz").to(device)
 
 # Load RAD-TTS
-radtts = RADTTS(**model_config)
-if use_cuda:
-    radtts = radtts.cuda()
+radtts = RADTTS(**model_config).to(device)
 
 radtts.enable_inverse_cache()  # cache inverse matrix for 1x1 invertible convs
 
@@ -151,6 +135,7 @@ tech_env = f"""
 #### Environment
 
 - Python: {sys.version}
+- Torch device: {device}
 """.strip()
 
 tech_libraries = f"""
@@ -161,8 +146,6 @@ tech_libraries = f"""
 - scipy: {version("scipy")}
 - numba: {version("numba")}
 - librosa: {version("librosa")}
-- unidecode: {version("unidecode")}
-- inflect: {version("inflect")}
 """.strip()
 
 
@@ -218,25 +201,16 @@ def inference(text, voice):
     energy_mean = 0
     energy_std = 0
 
-    tensor_text = trainset.get_text(text)
+    tensor_text = trainset.get_text(text).to(device)
 
-    speaker_id = trainset.get_speaker_id(speaker)
+    speaker_id = trainset.get_speaker_id(speaker).to(device)
     speaker_id_text, speaker_id_attributes = speaker_id, speaker_id
 
     if speaker_text is not None:
-        speaker_id_text = trainset.get_speaker_id(speaker_text)
+        speaker_id_text = trainset.get_speaker_id(speaker_text).to(device)
 
     if speaker_attributes is not None:
-        speaker_id_attributes = trainset.get_speaker_id(speaker_attributes)
-
-    if use_cuda:
-        tensor_text = tensor_text.cuda()
-        speaker_id = speaker_id.cuda()
-
-        if speaker_id_text is not None:
-            speaker_id_text = speaker_id_text.cuda()
-        if speaker_id_attributes is not None:
-            speaker_id_attributes = speaker_id_attributes.cuda()
+        speaker_id_attributes = trainset.get_speaker_id(speaker_attributes).to(device)
 
     inference_start = time.time()
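Note that the app now detects ZeroGPU by attempting the `spaces` import rather than assuming it. How the `use_zerogpu` flag is consumed afterwards is not shown in this diff; the sketch below is an assumption based on the commit message ("changing inference call") and the public `spaces.GPU` decorator, not code from this commit:

    # Hypothetical follow-up to the try/except above: wrap the Gradio handler
    # with the ZeroGPU decorator only when the `spaces` package is present.
    if use_zerogpu:
        inference = spaces.GPU(inference)  # spaces.GPU requests a GPU slice per call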
attribute_prediction_model.py
CHANGED
@@ -18,8 +18,10 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
+
 import torch
 from torch import nn
+
 from common import ConvNorm, Invertible1x1Conv
 from common import AffineTransformationLayer, SplineTransformationLayer
 from common import ConvLSTMLinear
audio_processing.py
CHANGED
@@ -18,12 +18,50 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
+
+"""
+BSD 3-Clause License
+
+Copyright (c) 2017, Prem Seetharaman
+All rights reserved.
+
+* Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from this
+software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
 import torch
 import numpy as np
+
 from scipy.signal import get_window
 from librosa.filters import mel as librosa_mel_fn
 import librosa.util as librosa_util
 
+import torch.nn.functional as F
+from torch.autograd import Variable
+from librosa.util import pad_center, tiny
+
 
 def window_sumsquare(
     window,
@@ -174,43 +212,6 @@ class TacotronSTFT(torch.nn.Module):
         return mel_output
 
 
-"""
-BSD 3-Clause License
-
-Copyright (c) 2017, Prem Seetharaman
-All rights reserved.
-
-* Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice,
-this list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice, this
-list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from this
-software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""
-import torch.nn.functional as F
-from torch.autograd import Variable
-from scipy.signal import get_window
-from librosa.util import pad_center, tiny
-
-
 class STFT(torch.nn.Module):
     """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
 
autoregressive_flow.py
CHANGED
@@ -45,8 +45,7 @@ import torch
 from torch import nn
 
 from common import DenseLayer, SplineTransformationLayerAR
-
-use_cuda = torch.cuda.is_available()
+from torch_env import device
 
 
 class AR_Back_Step(torch.nn.Module):
@@ -229,10 +228,7 @@ class AR_Step(torch.nn.Module):
                 (1, residual.size(1), residual.size(2)), dtype=residual.dtype
             )
 
-            if use_cuda:
-                dummy = torch.tensor(data, device="cuda")
-            else:
-                dummy = torch.tensor(data)
+            dummy = torch.tensor(data, device=device)
 
             self.attr_lstm.flatten_parameters()
 
common.py
CHANGED
@@ -62,34 +62,7 @@ from splines import (
 )
 from partialconv1d import PartialConv1d as pconv1d
 from typing import Tuple
-
-use_cuda = torch.cuda.is_available()
-
-if use_cuda:
-    device = "cuda"
-else:
-    device = "cpu"
-
-
-def update_params(config, params):
-    for param in params:
-        print(param)
-        k, v = param.split("=")
-        try:
-            v = ast.literal_eval(v)
-        except:
-            pass
-
-        k_split = k.split(".")
-        if len(k_split) > 1:
-            parent_k = k_split[0]
-            cur_param = [".".join(k_split[1:]) + "=" + str(v)]
-            update_params(config[parent_k], cur_param)
-        elif k in config and len(k_split) == 1:
-            print(f"overriding {k} with {v}")
-            config[k] = v
-        else:
-            print("{}, {} params not updated".format(k, v))
+from torch_env import device
 
 
 def get_mask_from_lengths(lengths):
@@ -103,10 +76,7 @@ def get_mask_from_lengths(lengths):
 
     max_len = torch.max(lengths).item()
 
-    if use_cuda:
-        ids = torch.tensor(list(range(max_len)), dtype=torch.long, device="cuda")
-    else:
-        ids = torch.tensor(list(range(max_len)), dtype=torch.long, device="cpu")
+    ids = torch.tensor(list(range(max_len)), dtype=torch.long, device=device)
 
     mask = (ids < lengths.unsqueeze(1)).bool()
 
@@ -172,7 +142,7 @@ class ConvNorm(torch.nn.Module):
             self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)
         )
         if self.use_weight_norm:
-            self.conv = nn.utils.weight_norm(self.conv)
+            self.conv = torch.nn.utils.parametrizations.weight_norm(self.conv)
 
     def forward(self, signal, mask=None):
         if self.use_partial_padding:
@@ -263,7 +233,7 @@ class ConvLSTMLinear(nn.Module):
                 dilation=1,
                 w_init_gain="relu",
             )
-            conv_layer = torch.nn.utils.weight_norm(conv_layer.conv, name="weight")
+            conv_layer = torch.nn.utils.parametrizations.weight_norm(conv_layer.conv, name="weight")
             convolutions.append(conv_layer)
 
         self.convolutions = nn.ModuleList(convolutions)
@@ -281,7 +251,7 @@ class ConvLSTMLinear(nn.Module):
         self.bilstm = nn.LSTM(
             n_channels, lstm_channels, 1, batch_first=True, bidirectional=use_bilstm
         )
-        lstm_norm_fn_pntr = nn.utils.spectral_norm
+        lstm_norm_fn_pntr = torch.nn.utils.parametrizations.spectral_norm
         self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0")
         if self.lstm_type == "bilstm":
             self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0_reverse")
@@ -391,10 +361,10 @@ class Encoder(nn.Module):
         if lstm_norm_fn is not None:
             if "spectral" in lstm_norm_fn:
                 print("Applying spectral norm to text encoder LSTM")
-                lstm_norm_fn_pntr = torch.nn.utils.spectral_norm
+                lstm_norm_fn_pntr = torch.nn.utils.parametrizations.spectral_norm
             elif "weight" in lstm_norm_fn:
                 print("Applying weight norm to text encoder LSTM")
-                lstm_norm_fn_pntr = torch.nn.utils.weight_norm
+                lstm_norm_fn_pntr = torch.nn.utils.parametrizations.weight_norm
             self.lstm = lstm_norm_fn_pntr(self.lstm, "weight_hh_l0")
             self.lstm = lstm_norm_fn_pntr(self.lstm, "weight_hh_l0_reverse")
 
@@ -450,7 +420,7 @@ class Invertible1x1ConvLUS(torch.nn.Module):
         # Ensure determinant is 1.0 not -1.0
         if torch.det(W) < 0:
             W[:, 0] = -1 * W[:, 0]
-        p, lower, upper = torch.lu_unpack(*torch.lu(W))
+        p, lower, upper = torch.lu_unpack(*torch.linalg.lu_factor(W))
 
         self.register_buffer("p", p)
         # diagonals of lower will always be 1s anyway
@@ -616,7 +586,7 @@ class WN(torch.nn.Module):
         self.in_layers = torch.nn.ModuleList()
         self.res_skip_layers = torch.nn.ModuleList()
         start = torch.nn.Conv1d(n_in_channels + n_context_dim, n_channels, 1)
-        start = torch.nn.utils.weight_norm(start, name="weight")
+        start = torch.nn.utils.parametrizations.weight_norm(start, name="weight")
         self.start = start
         self.softplus = torch.nn.Softplus()
         self.affine_activation = affine_activation
@@ -645,7 +615,7 @@ class WN(torch.nn.Module):
             # in_layer = nn.utils.weight_norm(in_layer)
             self.in_layers.append(in_layer)
             res_skip_layer = nn.Conv1d(n_channels, n_channels, 1)
-            res_skip_layer = nn.utils.weight_norm(res_skip_layer)
+            res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer)
             self.res_skip_layers.append(res_skip_layer)
 
     def forward(
@@ -823,7 +793,7 @@ class SplineTransformationLayer(torch.nn.Module):
         # output is unnormalized bin weights
 
     def forward(self, z, context, inverse=False, seq_lens=None):
-        b_s, c_s, t_s = z.size(0), z.size(1), z.size(2)
+        b_s, _, t_s = z.size(0), z.size(1), z.size(2)
 
         # condition on z_0, transform z_1
         n_half = self.half_mel_channels
@@ -1085,3 +1055,24 @@ class ConvAttention(torch.nn.Module):
 
         attn = self.softmax(attn)  # softmax along T2
         return attn, attn_logprob
+
+
+def update_params(config, params):
+    for param in params:
+        print(param)
+        k, v = param.split("=")
+        try:
+            v = ast.literal_eval(v)
+        except Exception as e:
+            print(e)
+
+        k_split = k.split(".")
+        if len(k_split) > 1:
+            parent_k = k_split[0]
+            cur_param = [".".join(k_split[1:]) + "=" + str(v)]
+            update_params(config[parent_k], cur_param)
+        elif k in config and len(k_split) == 1:
+            print(f"overriding {k} with {v}")
+            config[k] = v
+        else:
+            print("{}, {} params not updated".format(k, v))
data.py
CHANGED
@@ -39,21 +39,21 @@
 ###############################################################################
 
 import os
-import argparse
-import json
-import numpy as np
-import lmdb
 import pickle as pkl
+
+import lmdb
 import torch
 import torch.utils.data
+import numpy as np
+
+from librosa import pyin
 from scipy.io.wavfile import read
-from audio_processing import TacotronSTFT
-from tts_text_processing.text_processing import TextProcessing
 from scipy.stats import betabinom
-from librosa import pyin
-from common import update_params
 from scipy.ndimage import distance_transform_edt as distance_transform
 
+from audio_processing import TacotronSTFT
+from tts_text_processing.text_processing import TextProcessing
+
 
 def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=0.05):
     P = phoneme_count
@@ -401,7 +401,8 @@ class Data(torch.utils.data.Dataset):
         elif os.path.exists(f0_path):
             try:
                 dikt = torch.load(f0_path)
-            except:
+            except Exception as e:
+                print(e)
                 print(f"f0 loading from {f0_path} is broken, recomputing.")
 
         if dikt is not None:
@@ -460,147 +461,3 @@ class Data(torch.utils.data.Dataset):
 
     def __len__(self):
         return len(self.data)
-
-
-class DataCollate:
-    """Zero-pads model inputs and targets given number of steps"""
-
-    def __init__(self, n_frames_per_step=1):
-        self.n_frames_per_step = n_frames_per_step
-
-    def __call__(self, batch):
-        """Collate from normalized data"""
-        # Right zero-pad all one-hot text sequences to max input length
-        input_lengths, ids_sorted_decreasing = torch.sort(
-            torch.LongTensor([len(x["text_encoded"]) for x in batch]),
-            dim=0,
-            descending=True,
-        )
-
-        max_input_len = input_lengths[0]
-        text_padded = torch.LongTensor(len(batch), max_input_len)
-        text_padded.zero_()
-
-        for i in range(len(ids_sorted_decreasing)):
-            text = batch[ids_sorted_decreasing[i]]["text_encoded"]
-            text_padded[i, : text.size(0)] = text
-
-        # Right zero-pad mel-spec
-        num_mel_channels = batch[0]["mel"].size(0)
-        max_target_len = max([x["mel"].size(1) for x in batch])
-
-        # include mel padded, gate padded and speaker ids
-        mel_padded = torch.FloatTensor(len(batch), num_mel_channels, max_target_len)
-        mel_padded.zero_()
-        f0_padded = None
-        p_voiced_padded = None
-        voiced_mask_padded = None
-        energy_avg_padded = None
-        if batch[0]["f0"] is not None:
-            f0_padded = torch.FloatTensor(len(batch), max_target_len)
-            f0_padded.zero_()
-
-        if batch[0]["p_voiced"] is not None:
-            p_voiced_padded = torch.FloatTensor(len(batch), max_target_len)
-            p_voiced_padded.zero_()
-
-        if batch[0]["voiced_mask"] is not None:
-            voiced_mask_padded = torch.FloatTensor(len(batch), max_target_len)
-            voiced_mask_padded.zero_()
-
-        if batch[0]["energy_avg"] is not None:
-            energy_avg_padded = torch.FloatTensor(len(batch), max_target_len)
-            energy_avg_padded.zero_()
-
-        attn_prior_padded = torch.FloatTensor(len(batch), max_target_len, max_input_len)
-        attn_prior_padded.zero_()
-
-        output_lengths = torch.LongTensor(len(batch))
-        speaker_ids = torch.LongTensor(len(batch))
-        audiopaths = []
-        for i in range(len(ids_sorted_decreasing)):
-            mel = batch[ids_sorted_decreasing[i]]["mel"]
-            mel_padded[i, :, : mel.size(1)] = mel
-            if batch[ids_sorted_decreasing[i]]["f0"] is not None:
-                f0 = batch[ids_sorted_decreasing[i]]["f0"]
-                f0_padded[i, : len(f0)] = f0
-
-            if batch[ids_sorted_decreasing[i]]["voiced_mask"] is not None:
-                voiced_mask = batch[ids_sorted_decreasing[i]]["voiced_mask"]
-                voiced_mask_padded[i, : len(f0)] = voiced_mask
-
-            if batch[ids_sorted_decreasing[i]]["p_voiced"] is not None:
-                p_voiced = batch[ids_sorted_decreasing[i]]["p_voiced"]
-                p_voiced_padded[i, : len(f0)] = p_voiced
-
-            if batch[ids_sorted_decreasing[i]]["energy_avg"] is not None:
-                energy_avg = batch[ids_sorted_decreasing[i]]["energy_avg"]
-                energy_avg_padded[i, : len(energy_avg)] = energy_avg
-
-            output_lengths[i] = mel.size(1)
-            speaker_ids[i] = batch[ids_sorted_decreasing[i]]["speaker_id"]
-            audiopath = batch[ids_sorted_decreasing[i]]["audiopath"]
-            audiopaths.append(audiopath)
-            cur_attn_prior = batch[ids_sorted_decreasing[i]]["attn_prior"]
-            if cur_attn_prior is None:
-                attn_prior_padded = None
-            else:
-                attn_prior_padded[
-                    i, : cur_attn_prior.size(0), : cur_attn_prior.size(1)
-                ] = cur_attn_prior
-
-        return {
-            "mel": mel_padded,
-            "speaker_ids": speaker_ids,
-            "text": text_padded,
-            "input_lengths": input_lengths,
-            "output_lengths": output_lengths,
-            "audiopaths": audiopaths,
-            "attn_prior": attn_prior_padded,
-            "f0": f0_padded,
-            "p_voiced": p_voiced_padded,
-            "voiced_mask": voiced_mask_padded,
-            "energy_avg": energy_avg_padded,
-        }
-
-
-# ===================================================================
-# Takes directory of clean audio and makes directory of spectrograms
-# Useful for making test sets
-# ===================================================================
-if __name__ == "__main__":
-    # Get defaults so it can work with no Sacred
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-c", "--config", type=str, help="JSON file for configuration")
-    parser.add_argument("-p", "--params", nargs="+", default=[])
-    args = parser.parse_args()
-    args.rank = 0
-
-    # Parse configs. Globals nicer in this case
-    with open(args.config) as f:
-        data = f.read()
-
-    config = json.loads(data)
-    update_params(config, args.params)
-    print(config)
-
-    data_config = config["data_config"]
-
-    ignore_keys = ["training_files", "validation_files"]
-    trainset = Data(
-        data_config["training_files"],
-        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
-    )
-
-    valset = Data(
-        data_config["validation_files"],
-        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
-        speaker_ids=trainset.speaker_ids,
-    )
-
-    collate_fn = DataCollate()
-
-    for dataset in (trainset, valset):
-        for i, batch in enumerate(dataset):
-            out = batch
-            print("{}/{}".format(i, len(dataset)))
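The `except:` → `except Exception as e:` change here (and again in radtts.py) is more than style: a bare `except` also swallows `KeyboardInterrupt` and `SystemExit` and hides the original error. A self-contained illustration of the narrowed pattern:

    try:
        value = int("not a number")
    except Exception as e:  # Exception excludes KeyboardInterrupt / SystemExit
        print(e)            # the real cause is logged instead of silently dropped
        value = None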
export_weights.py
ADDED
@@ -0,0 +1,14 @@
+import torch
+
+radtts_path = "models/radtts-pp-dap-model/model_dap_84000.pt"
+radtts_path_state = "models/radtts-pp-dap-model/model_dap_84000_state.pt"
+
+checkpoint_dict = torch.load(radtts_path, map_location="cpu")
+
+del checkpoint_dict['iteration']
+del checkpoint_dict['optimizer']
+del checkpoint_dict['learning_rate']
+
+print(checkpoint_dict.keys())
+
+torch.save(checkpoint_dict, radtts_path_state)
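Dropping `iteration`, `optimizer`, and `learning_rate` leaves only the model weights, which shrinks the checkpoint (optimizer state is often as large as the weights themselves) and avoids shipping training-only state to the Space. A quick sanity check for the exported file; the surviving key name is an assumption to verify against the `print(checkpoint_dict.keys())` output above:

    import torch

    slim = torch.load(
        "models/radtts-pp-dap-model/model_dap_84000_state.pt", map_location="cpu"
    )
    print(slim.keys())  # expected to hold only model weights, e.g. "state_dict"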
loss.py
DELETED
@@ -1,228 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: MIT
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-from common import get_mask_from_lengths
-
-
-def compute_flow_loss(
-    z, log_det_W_list, log_s_list, n_elements, n_dims, mask, sigma=1.0
-):
-    log_det_W_total = 0.0
-    for i, log_s in enumerate(log_s_list):
-        if i == 0:
-            log_s_total = torch.sum(log_s * mask)
-            if len(log_det_W_list):
-                log_det_W_total = log_det_W_list[i]
-        else:
-            log_s_total = log_s_total + torch.sum(log_s * mask)
-            if len(log_det_W_list):
-                log_det_W_total += log_det_W_list[i]
-
-    if len(log_det_W_list):
-        log_det_W_total *= n_elements
-
-    z = z * mask
-    prior_NLL = torch.sum(z * z) / (2 * sigma * sigma)
-
-    loss = prior_NLL - log_s_total - log_det_W_total
-
-    denom = n_elements * n_dims
-    loss = loss / denom
-    loss_prior = prior_NLL / denom
-    return loss, loss_prior
-
-
-def compute_regression_loss(x_hat, x, mask, name=False):
-    x = x[:, None] if len(x.shape) == 2 else x  # add channel dim
-    mask = mask[:, None] if len(mask.shape) == 2 else mask  # add channel dim
-    assert len(x.shape) == len(mask.shape)
-
-    x = x * mask
-    x_hat = x_hat * mask
-
-    if name == "vpred":
-        loss = F.binary_cross_entropy_with_logits(x_hat, x, reduction="sum")
-    else:
-        loss = F.mse_loss(x_hat, x, reduction="sum")
-    loss = loss / mask.sum()
-
-    loss_dict = {"loss_{}".format(name): loss}
-
-    return loss_dict
-
-
-class AttributePredictionLoss(torch.nn.Module):
-    def __init__(self, name, model_config, loss_weight, sigma=1.0):
-        super(AttributePredictionLoss, self).__init__()
-        self.name = name
-        self.sigma = sigma
-        self.model_name = model_config["name"]
-        self.loss_weight = loss_weight
-        self.n_group_size = 1
-        if "n_group_size" in model_config["hparams"]:
-            self.n_group_size = model_config["hparams"]["n_group_size"]
-
-    def forward(self, model_output, lens):
-        mask = get_mask_from_lengths(lens // self.n_group_size)
-        mask = mask[:, None].float()
-        loss_dict = {}
-        if "z" in model_output:
-            n_elements = lens.sum() // self.n_group_size
-            n_dims = model_output["z"].size(1)
-
-            loss, loss_prior = compute_flow_loss(
-                model_output["z"],
-                model_output["log_det_W_list"],
-                model_output["log_s_list"],
-                n_elements,
-                n_dims,
-                mask,
-                self.sigma,
-            )
-            loss_dict = {
-                "loss_{}".format(self.name): (loss, self.loss_weight),
-                "loss_prior_{}".format(self.name): (loss_prior, 0.0),
-            }
-        elif "x_hat" in model_output:
-            loss_dict = compute_regression_loss(
-                model_output["x_hat"], model_output["x"], mask, self.name
-            )
-            for k, v in loss_dict.items():
-                loss_dict[k] = (v, self.loss_weight)
-
-        if len(loss_dict) == 0:
-            raise Exception("loss not supported")
-
-        return loss_dict
-
-
-class AttentionCTCLoss(torch.nn.Module):
-    def __init__(self, blank_logprob=-1):
-        super(AttentionCTCLoss, self).__init__()
-        self.log_softmax = torch.nn.LogSoftmax(dim=3)
-        self.blank_logprob = blank_logprob
-        self.CTCLoss = nn.CTCLoss(zero_infinity=True)
-
-    def forward(self, attn_logprob, in_lens, out_lens):
-        key_lens = in_lens
-        query_lens = out_lens
-        attn_logprob_padded = F.pad(
-            input=attn_logprob, pad=(1, 0, 0, 0, 0, 0, 0, 0), value=self.blank_logprob
-        )
-        cost_total = 0.0
-        for bid in range(attn_logprob.shape[0]):
-            target_seq = torch.arange(1, key_lens[bid] + 1).unsqueeze(0)
-            curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2)[
-                : query_lens[bid], :, : key_lens[bid] + 1
-            ]
-            curr_logprob = self.log_softmax(curr_logprob[None])[0]
-            ctc_cost = self.CTCLoss(
-                curr_logprob,
-                target_seq,
-                input_lengths=query_lens[bid : bid + 1],
-                target_lengths=key_lens[bid : bid + 1],
-            )
-            cost_total += ctc_cost
-        cost = cost_total / attn_logprob.shape[0]
-        return cost
-
-
-class AttentionBinarizationLoss(torch.nn.Module):
-    def __init__(self):
-        super(AttentionBinarizationLoss, self).__init__()
-
-    def forward(self, hard_attention, soft_attention):
-        log_sum = torch.log(soft_attention[hard_attention == 1]).sum()
-        return -log_sum / hard_attention.sum()
-
-
-class RADTTSLoss(torch.nn.Module):
-    def __init__(
-        self,
-        sigma=1.0,
-        n_group_size=1,
-        dur_model_config=None,
-        f0_model_config=None,
-        energy_model_config=None,
-        vpred_model_config=None,
-        loss_weights=None,
-    ):
-        super(RADTTSLoss, self).__init__()
-        self.sigma = sigma
-        self.n_group_size = n_group_size
-        self.loss_weights = loss_weights
-        self.attn_ctc_loss = AttentionCTCLoss(
-            blank_logprob=loss_weights.get("blank_logprob", -1)
-        )
-        self.loss_fns = {}
-        if dur_model_config is not None:
-            self.loss_fns["duration_model_outputs"] = AttributePredictionLoss(
-                "duration", dur_model_config, loss_weights["dur_loss_weight"]
-            )
-
-        if f0_model_config is not None:
-            self.loss_fns["f0_model_outputs"] = AttributePredictionLoss(
-                "f0", f0_model_config, loss_weights["f0_loss_weight"], sigma=1.0
-            )
-
-        if energy_model_config is not None:
-            self.loss_fns["energy_model_outputs"] = AttributePredictionLoss(
-                "energy", energy_model_config, loss_weights["energy_loss_weight"]
-            )
-
-        if vpred_model_config is not None:
-            self.loss_fns["vpred_model_outputs"] = AttributePredictionLoss(
-                "vpred", vpred_model_config, loss_weights["vpred_loss_weight"]
-            )
-
-    def forward(self, model_output, in_lens, out_lens):
-        loss_dict = {}
-        if len(model_output["z_mel"]):
-            n_elements = out_lens.sum() // self.n_group_size
-            mask = get_mask_from_lengths(out_lens // self.n_group_size)
-            mask = mask[:, None].float()
-            n_dims = model_output["z_mel"].size(1)
-            loss_mel, loss_prior_mel = compute_flow_loss(
-                model_output["z_mel"],
-                model_output["log_det_W_list"],
-                model_output["log_s_list"],
-                n_elements,
-                n_dims,
-                mask,
-                self.sigma,
-            )
-            loss_dict["loss_mel"] = (loss_mel, 1.0)  # loss, weight
-            loss_dict["loss_prior_mel"] = (loss_prior_mel, 0.0)
-
-        ctc_cost = self.attn_ctc_loss(model_output["attn_logprob"], in_lens, out_lens)
-        loss_dict["loss_ctc"] = (ctc_cost, self.loss_weights["ctc_loss_weight"])
-
-        for k in model_output:
-            if k in self.loss_fns:
-                if model_output[k] is not None and len(model_output[k]) > 0:
-                    t_lens = in_lens if "dur" in k else out_lens
-                    mout = model_output[k]
-                    for loss_name, v in self.loss_fns[k](mout, t_lens).items():
-                        loss_dict[loss_name] = v
-
-        return loss_dict
partialconv1d.py
CHANGED
@@ -13,10 +13,9 @@
 
 import torch
 import torch.nn.functional as F
-from torch import nn
 
 
-class PartialConv1d(nn.Conv1d):
+class PartialConv1d(torch.nn.Conv1d):
    def __init__(self, *args, **kwargs):
        self.multi_channel = False
        self.return_mask = False
radam.py
DELETED
@@ -1,114 +0,0 @@
-# Original source taken from https://github.com/LiyuanLucasLiu/RAdam
-#
-# Copyright 2019 Liyuan Liu
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-import torch
-
-# pylint: disable=no-name-in-module
-from torch.optim.optimizer import Optimizer
-
-
-class RAdam(Optimizer):
-    """RAdam optimizer"""
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
-        """
-        Init
-
-        :param params: parameters to optimize
-        :param lr: learning rate
-        :param betas: beta
-        :param eps: numerical precision
-        :param weight_decay: weight decay weight
-        """
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-        self.buffer = [[None, None, None] for _ in range(10)]
-        super().__init__(params, defaults)
-
-    def step(self, closure=None):
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data.float()
-                if grad.is_sparse:
-                    raise RuntimeError("RAdam does not support sparse gradients")
-
-                p_data_fp32 = p.data.float()
-
-                state = self.state[p]
-
-                if len(state) == 0:
-                    state["step"] = 0
-                    state["exp_avg"] = torch.zeros_like(p_data_fp32)
-                    state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
-                else:
-                    state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
-                    state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)
-
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-                beta1, beta2 = group["betas"]
-
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-
-                state["step"] += 1
-                buffered = self.buffer[int(state["step"] % 10)]
-                if state["step"] == buffered[0]:
-                    N_sma, step_size = buffered[1], buffered[2]
-                else:
-                    buffered[0] = state["step"]
-                    beta2_t = beta2 ** state["step"]
-                    N_sma_max = 2 / (1 - beta2) - 1
-                    N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t)
-                    buffered[1] = N_sma
-
-                    # more conservative since it's an approximated value
-                    if N_sma >= 5:
-                        step_size = (
-                            group["lr"]
-                            * math.sqrt(
-                                (1 - beta2_t)
-                                * (N_sma - 4)
-                                / (N_sma_max - 4)
-                                * (N_sma - 2)
-                                / N_sma
-                                * N_sma_max
-                                / (N_sma_max - 2)
-                            )
-                            / (1 - beta1 ** state["step"])
-                        )
-                    else:
-                        step_size = group["lr"] / (1 - beta1 ** state["step"])
-                    buffered[2] = step_size
-
-                if group["weight_decay"] != 0:
-                    p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32)
-
-                # more conservative since it's an approximated value
-                if N_sma >= 5:
-                    denom = exp_avg_sq.sqrt().add_(group["eps"])
-                    p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
-                else:
-                    p_data_fp32.add_(-step_size, exp_avg)
-
-                p.data.copy_(p_data_fp32)
-
-        return loss
radtts.py
CHANGED
@@ -28,8 +28,7 @@ from common import AffineTransformationLayer, LinearNorm, ExponentialClass
 from common import get_mask_from_lengths
 from attribute_prediction_model import get_attribute_prediction_model
 from alignment import mas_width1 as mas
-
-use_cuda = torch.cuda.is_available()
+from torch_env import device
 
 
 class FlowStep(nn.Module):
@@ -202,10 +201,10 @@ class RADTTS(torch.nn.Module):
        if context_lstm_norm is not None:
            if "spectral" in context_lstm_norm:
                print("Applying spectral norm to context encoder LSTM")
-               lstm_norm_fn_pntr = torch.nn.utils.spectral_norm
+               lstm_norm_fn_pntr = torch.nn.utils.parametrizations.spectral_norm
            elif "weight" in context_lstm_norm:
                print("Applying weight norm to context encoder LSTM")
-               lstm_norm_fn_pntr = torch.nn.utils.weight_norm
+               lstm_norm_fn_pntr = torch.nn.utils.parametrizations.weight_norm
 
            self.context_lstm = lstm_norm_fn_pntr(
                self.context_lstm, "weight_hh_l0"
@@ -688,11 +687,10 @@ class RADTTS(torch.nn.Module):
 
        if dur is None:
            # get token durations
-           z_dur = torch.cuda.FloatTensor(batch_size, 1, n_tokens)
-           z_dur = z_dur.normal_() * sigma_dur
+           z_dur = (
+               torch.randn(batch_size, 1, n_tokens, dtype=torch.float32, device=device)
+               * sigma_dur
+           )
 
            dur = self.dur_pred_layer.infer(z_dur, txt_enc, spk_vec_text)
            if dur.shape[-1] < txt_enc.shape[-1]:
@@ -752,9 +750,7 @@ class RADTTS(torch.nn.Module):
                    dtype=torch.float32,
                )
                * sigma_f0
-           )
-           if use_cuda:
-               z_f0 = z_f0.cuda()
+           ).to(device)
 
            f0 = self.infer_f0(
                z_f0,
@@ -780,13 +776,11 @@ class RADTTS(torch.nn.Module):
                    n_energy_feature_channels,
                    max_n_frames,
                    dtype=torch.float32,
+                   device=device,
                )
                * sigma_energy
            )
 
-           if use_cuda:
-               z_energy_avg = z_energy_avg.cuda()
-
            energy_avg = self.infer_energy(
                z_energy_avg, ap_txt_enc_time_expanded, spk_vec, out_lens
            )[:, 0]
@@ -829,9 +823,7 @@ class RADTTS(torch.nn.Module):
                80 * self.n_group_size,
                max_n_frames // self.n_group_size,
                dtype=torch.float32,
-           )
-           if use_cuda:
-               residual = residual.cuda()
+           ).to(device)
 
            residual = residual * sigma
 
@@ -921,15 +913,17 @@ class RADTTS(torch.nn.Module):
            try:
                nn.utils.remove_spectral_norm(module, name="weight_hh_l0")
                print("Removed spectral norm from {}".format(name))
-           except:
-               pass
+           except Exception as e:
+               print(e)
+
            try:
                nn.utils.remove_spectral_norm(module, name="weight_hh_l0_reverse")
                print("Removed spectral norm from {}".format(name))
-           except:
-               pass
+           except Exception as e:
+               print(e)
+
            try:
                nn.utils.remove_weight_norm(module)
                print("Removed wnorm from {}".format(name))
-           except:
-               pass
+           except Exception as e:
+               print(e)
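The noise-sampling changes above all follow one pattern: instead of allocating latents on CPU and conditionally calling `.cuda()`, tensors are created with (or moved once to) the shared `device`. Allocating directly on the device also skips a host-to-device copy; a minimal comparison, assuming only a generic `device` string:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Old pattern: CPU allocation followed by a conditional transfer.
    z = torch.randn(1, 80, 100)
    if device == "cuda":
        z = z.cuda()

    # New pattern: one allocation on the target device, no branch, no copy.
    z = torch.randn(1, 80, 100, device=device)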
requirements.txt
CHANGED
@@ -9,7 +9,4 @@ numba
 lmdb
 librosa
 
-unidecode
-inflect
-
 git+https://github.com/langtech-bsc/vocos.git@matcha
torch_env.py
ADDED
@@ -0,0 +1,19 @@
+import torch
+
+seed = 1234
+
+# use_mps = torch.mps.is_available()
+use_mps = False
+use_cuda = torch.cuda.is_available()
+
+if use_mps:
+    device = "mps"
+    torch.mps.manual_seed(seed)
+elif use_cuda:
+    device = "cuda"
+    torch.cuda.manual_seed(seed)
+else:
+    device = "cpu"
+    torch.manual_seed(seed)
+
+print(f"Inference device: {device}")
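This new module centralizes device selection and seeding so every file imports one `device` value instead of re-deriving `use_cuda` locally. Typical consumption, mirroring the imports this commit adds to app.py, common.py, radtts.py, and autoregressive_flow.py:

    from torch_env import device

    import torch

    x = torch.zeros(4, 80, device=device)  # lands on mps/cuda/cpu uniformly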
tts_text_processing/abbreviations.py
DELETED
@@ -1,57 +0,0 @@
-import re
-
-_no_period_re = re.compile(r"(No[.])(?=[ ]?[0-9])")
-_percent_re = re.compile(r"([ ]?[%])")
-_half_re = re.compile("([0-9]½)|(½)")
-
-
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [
-    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
-    for x in [
-        ("mrs", "misess"),
-        ("ms", "miss"),
-        ("mr", "mister"),
-        ("dr", "doctor"),
-        ("st", "saint"),
-        ("co", "company"),
-        ("jr", "junior"),
-        ("maj", "major"),
-        ("gen", "general"),
-        ("drs", "doctors"),
-        ("rev", "reverend"),
-        ("lt", "lieutenant"),
-        ("hon", "honorable"),
-        ("sgt", "sergeant"),
-        ("capt", "captain"),
-        ("esq", "esquire"),
-        ("ltd", "limited"),
-        ("col", "colonel"),
-        ("ft", "fort"),
-    ]
-]
-
-
-def _expand_no_period(m):
-    word = m.group(0)
-    if word[0] == "N":
-        return "Number"
-    return "number"
-
-
-def _expand_percent(m):
-    return " percent"
-
-
-def _expand_half(m):
-    word = m.group(1)
-    if word is None:
-        return "half"
-    return word[0] + " and a half"
-
-
-def normalize_abbreviations(text):
-    text = re.sub(_no_period_re, _expand_no_period, text)
-    text = re.sub(_percent_re, _expand_percent, text)
-    text = re.sub(_half_re, _expand_half, text)
-    return text
tts_text_processing/acronyms.py
DELETED
@@ -1,69 +0,0 @@
-import re
-
-_letter_to_arpabet = {
-    "A": "EY1",
-    "B": "B IY1",
-    "C": "S IY1",
-    "D": "D IY1",
-    "E": "IY1",
-    "F": "EH1 F",
-    "G": "JH IY1",
-    "H": "EY1 CH",
-    "I": "AY1",
-    "J": "JH EY1",
-    "K": "K EY1",
-    "L": "EH1 L",
-    "M": "EH1 M",
-    "N": "EH1 N",
-    "O": "OW1",
-    "P": "P IY1",
-    "Q": "K Y UW1",
-    "R": "AA1 R",
-    "S": "EH1 S",
-    "T": "T IY1",
-    "U": "Y UW1",
-    "V": "V IY1",
-    "X": "EH1 K S",
-    "Y": "W AY1",
-    "W": "D AH1 B AH0 L Y UW0",
-    "Z": "Z IY1",
-    "s": "Z",
-}
-
-# must ignore roman numerals
-# _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)')
-_acronym_re = re.compile(r"([A-Z][A-Z]+)s?")
-
-
-class AcronymNormalizer(object):
-    def __init__(self, phoneme_dict):
-        self.phoneme_dict = phoneme_dict
-
-    def normalize_acronyms(self, text):
-        def _expand_acronyms(m, add_spaces=True):
-            acronym = m.group(0)
-            # remove dots if they exist
-            acronym = re.sub("\.", "", acronym)
-
-            acronym = "".join(acronym.split())
-            arpabet = self.phoneme_dict.lookup(acronym)
-
-            if arpabet is None:
-                acronym = list(acronym)
-                arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym]
-                # temporary fix
-                if arpabet[-1] == "{Z}" and len(arpabet) > 1:
-                    arpabet[-2] = arpabet[-2][:-1] + " " + arpabet[-1][1:]
-                    del arpabet[-1]
-                arpabet = " ".join(arpabet)
-            elif len(arpabet) == 1:
-                arpabet = "{" + arpabet[0] + "}"
-            else:
-                arpabet = acronym
-            return arpabet
-
-        text = re.sub(_acronym_re, _expand_acronyms, text)
-        return text
-
-    def __call__(self, text):
-        return self.normalize_acronyms(text)
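To illustrate the fallback path in normalize_acronyms: when the phoneme dictionary has no entry, the acronym is spelled out letter by letter via _letter_to_arpabet. A runnable sketch (EmptyDict is a stand-in, not part of the repo):

    class EmptyDict:
        def lookup(self, word):
            return None  # force the letter-by-letter fallback

    normalizer = AcronymNormalizer(EmptyDict())
    print(normalizer("DNA"))  # expected: "{D IY1} {EH1 N} {EY1}"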
tts_text_processing/cleaners.py
CHANGED
@@ -1,26 +1,8 @@
 """adapted from https://github.com/keithito/tacotron"""
 
-"""
-Cleaners are transformations that run over the input text at both training and eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-     the symbols in symbols.py to match your data).
-"""
-
 import re
 from string import punctuation
 from functools import reduce
-from unidecode import unidecode
-from .numerical import normalize_numbers, normalize_currency
-from .acronyms import AcronymNormalizer
-from .datestime import normalize_datestime
-from .letters_and_numbers import normalize_letters_and_numbers
-from .abbreviations import normalize_abbreviations
 
 
 # Regular expression matching whitespace:
@@ -30,26 +12,6 @@ _whitespace_re = re.compile(r"\s+")
 _arpa_re = re.compile(r"{[^}]+}|\S+")
 
 
-def expand_abbreviations(text):
-    return normalize_abbreviations(text)
-
-
-def expand_numbers(text):
-    return normalize_numbers(text)
-
-
-def expand_currency(text):
-    return normalize_currency(text)
-
-
-def expand_datestime(text):
-    return normalize_datestime(text)
-
-
-def expand_letters_and_numbers(text):
-    return normalize_letters_and_numbers(text)
-
-
 def lowercase(text):
     return text.lower()
 
@@ -58,21 +20,6 @@ def collapse_whitespace(text):
     return re.sub(_whitespace_re, " ", text)
 
 
-def separate_acronyms(text):
-    text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text)
-    text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text)
-    return text
-
-
-def convert_to_ascii(text):
-    return unidecode(text)
-
-
-def dehyphenize_compound_words(text):
-    text = re.sub(r"(?<=[a-zA-Z0-9])-(?=[a-zA-Z])", " ", text)
-    return text
-
-
 def remove_space_before_punctuation(text):
     return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r"\1", text)
 
@@ -81,7 +28,6 @@ class Cleaner(object):
     def __init__(self, cleaner_names, phonemedict):
         self.cleaner_names = cleaner_names
         self.phonemedict = phonemedict
-        self.acronym_normalizer = AcronymNormalizer(self.phonemedict)
 
     def __call__(self, text):
         for cleaner_name in self.cleaner_names:
@@ -94,30 +40,13 @@
                 for split in _arpa_re.findall(text)
             ]
             text = " ".join(text)
+
            text = remove_space_before_punctuation(text)
+
         return text
 
     def get_cleaner_fns(self, cleaner_name):
-        if cleaner_name == "basic_cleaners":
-            sequence_fns = [lowercase, collapse_whitespace]
-            word_fns = []
-        elif cleaner_name == "english_cleaners":
-            sequence_fns = [collapse_whitespace, convert_to_ascii, lowercase]
-            word_fns = [expand_numbers, expand_abbreviations]
-        elif cleaner_name == "radtts_cleaners":
-            sequence_fns = [
-                collapse_whitespace,
-                expand_currency,
-                expand_datestime,
-                expand_letters_and_numbers,
-            ]
-            word_fns = [expand_numbers, expand_abbreviations]
-        elif cleaner_name == "ukrainian_cleaners":
-            sequence_fns = [lowercase, collapse_whitespace]
-            word_fns = []
-        elif cleaner_name == "transliteration_cleaners":
-            sequence_fns = [convert_to_ascii, lowercase, collapse_whitespace]
-        else:
-            raise Exception("{} cleaner not supported".format(cleaner_name))
+        sequence_fns = [lowercase, collapse_whitespace]
+        word_fns = []
 
         return sequence_fns, word_fns
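Net effect of this change: whichever cleaner name is configured, the pipeline now only lowercases, collapses whitespace, and tidies spacing before punctuation; all English-specific normalization is gone. A minimal usage sketch (the sample sentence is illustrative):

    from tts_text_processing.cleaners import Cleaner

    cleaner = Cleaner(["ukrainian_cleaners"], {})
    print(cleaner("  Привіт ,  світе !"))  # expected: "привіт, світе!"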
tts_text_processing/cmudict.py
DELETED
@@ -1,140 +0,0 @@
-"""adapted from https://github.com/keithito/tacotron"""
-
-import re
-
-
-valid_symbols = [
-    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1",
-    "AH2", "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0",
-    "AY1", "AY2", "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0",
-    "ER1", "ER2", "EY", "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0",
-    "IH1", "IH2", "IY", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG",
-    "OW", "OW0", "OW1", "OW2", "OY", "OY0", "OY1", "OY2", "P", "R", "S", "SH",
-    "T", "TH", "UH", "UH0", "UH1", "UH2", "UW", "UW0", "UW1", "UW2", "V", "W",
-    "Y", "Z", "ZH",
-]
-
-_valid_symbol_set = set(valid_symbols)
-
-
-class CMUDict:
-    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
-
-    def __init__(self, file_or_path, keep_ambiguous=True):
-        if isinstance(file_or_path, str):
-            with open(file_or_path, encoding="latin-1") as f:
-                entries = _parse_cmudict(f)
-        else:
-            entries = _parse_cmudict(file_or_path)
-        if not keep_ambiguous:
-            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
-        self._entries = entries
-
-    def __len__(self):
-        return len(self._entries)
-
-    def lookup(self, word):
-        """Returns list of ARPAbet pronunciations of the given word."""
-        return self._entries.get(word.upper())
-
-
-_alt_re = re.compile(r"\([0-9]+\)")
-
-
-def _parse_cmudict(file):
-    cmudict = {}
-    for line in file:
-        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
-            parts = line.split("  ")
-            word = re.sub(_alt_re, "", parts[0])
-            pronunciation = _get_pronunciation(parts[1])
-            if pronunciation:
-                if word in cmudict:
-                    cmudict[word].append(pronunciation)
-                else:
-                    cmudict[word] = [pronunciation]
-    return cmudict
-
-
-def _get_pronunciation(s):
-    parts = s.strip().split(" ")
-    for part in parts:
-        if part not in _valid_symbol_set:
-            return None
-    return " ".join(parts)
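For reference, the deleted wrapper parsed dictionary entries of the form "WORD  PRONUNCIATION", with alternative pronunciations marked by (1), (2), and so on. A sketch of how it was used (the two entries are illustrative):

    import io

    entries = io.StringIO("READ  R EH1 D\nREAD(1)  R IY1 D\n")
    cmudict = CMUDict(entries)
    print(cmudict.lookup("read"))  # expected: ["R EH1 D", "R IY1 D"]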
tts_text_processing/datestime.py
DELETED
@@ -1,24 +0,0 @@
-"""adapted from https://github.com/keithito/tacotron"""
-
-import re
-
-_ampm_re = re.compile(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):?([0-5][0-9])?\s*([AaPp][Mm]\b)")
-
-
-def _expand_ampm(m):
-    matches = list(m.groups(0))
-    txt = matches[0]
-    txt = txt if int(matches[1]) == 0 else txt + " " + matches[1]
-
-    if matches[2][0].lower() == "a":
-        txt += " a.m."
-    elif matches[2][0].lower() == "p":
-        txt += " p.m."
-
-    return txt
-
-
-def normalize_datestime(text):
-    text = re.sub(_ampm_re, _expand_ampm, text)
-    # text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])?", r"\1 \2", text)
-    return text
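Roughly what this module handled before its removal (a sketch):

    # normalize_datestime("It leaves at 9:30 pm") -> "It leaves at 9 30 p.m."
    # normalize_datestime("10 AM") -> "10 a.m."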
tts_text_processing/grapheme_dictionary.py
DELETED
@@ -1,37 +0,0 @@
-"""adapted from https://github.com/keithito/tacotron"""
-
-import re
-
-_alt_re = re.compile(r"\([0-9]+\)")
-
-
-class Grapheme2PhonemeDictionary:
-    """Thin wrapper around g2p data."""
-
-    def __init__(self, file_or_path, keep_ambiguous=True, encoding="latin-1"):
-        with open(file_or_path, encoding=encoding) as f:
-            entries = _parse_g2p(f)
-        if not keep_ambiguous:
-            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
-        self._entries = entries
-
-    def __len__(self):
-        return len(self._entries)
-
-    def lookup(self, word):
-        """Returns list of pronunciations of the given word."""
-        return self._entries.get(word.upper())
-
-
-def _parse_g2p(file):
-    g2p = {}
-    for line in file:
-        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
-            parts = line.split("  ")
-            word = re.sub(_alt_re, "", parts[0])
-            pronunciation = parts[1].strip()
-            if word in g2p:
-                g2p[word].append(pronunciation)
-            else:
-                g2p[word] = [pronunciation]
-    return g2p
tts_text_processing/heteronyms
DELETED
@@ -1,413 +0,0 @@
The deleted file listed 413 English heteronyms, one per line:

abject, abrogate, absent, abstract, abuse, ache, acre, acuminate, addict, address,
adduct, adele, advocate, affect, affiliate, agape, aged, agglomerate, aggregate, agonic,
agora, allied, ally, alternate, alum, am, analyses, andrea, animate, apply,
appropriate, approximate, ares, arithmetic, arsenic, articulate, associate, attribute, august, axes,
ay, aye, bases, bass, bathed, bested, bifurcate, blessed, blotto, bow,
bowed, bowman, brassy, buffet, bustier, carbonate, celtic, choral, chumash, close,
closer, coax, coincidence, color coordinate, colour coordinate, comber, combine, combs, committee, commune,
compact, complex, compound, compress, concert, conduct, confine, confines, conflict, conglomerate,
conscript, conserve, consist, console, consort, construct, consult, consummate, content, contest,
contract, contracts, contrast, converse, convert, convict, coop, coordinate, covey, crooked,
curate, cussed, decollate, decrease, defect, defense, delegate, deliberate, denier, desert,
detail, deviate, diagnoses, diffuse, digest, discard, discharge, discount, do, document,
does, dogged, domesticate, dominican, dove, dr, drawer, duplicate, egress, ejaculate,
eject, elaborate, ellipses, email, emu, entrace, entrance, escort, estimate, eta,
etna, evening, excise, excuse, exploit, export, extract, fine, flower, forbear,
four-legged, frequent, furrier, gallant, gel, geminate, gillie, glower, gotham, graduate,
haggis, heavy, hinder, house, housewife, impact, imped, implant, implement, import,
impress, incense, incline, increase, infix, insert, instar, insult, integral, intercept,
interchange, interflow, interleaf, intermediate, intern, interspace, intimate, intrigue, invalid, invert,
invite, irony, jagged, jesses, julies, kite, laminate, laos, lather, lead,
learned, leasing, lech, legitimate, lied, lima, lipread, live, lower, lunged,
maas, magdalen, manes, mare, marked, merchandise, merlion, minute, misconduct, misled,
misprint, mobile, moderate, mong, moped, moth, mouth, mow, mpg, multiply,
mush, nana, nice, nice, number, numerate, nun, object, opiate, ornament,
outbox, outcry, outpour, outreach, outride, outright, outside, outwork, overall, overbid,
overcall, overcast, overfall, overflow, overhaul, overhead, overlap, overlay, overuse, overweight,
overwork, pace, palled, palling, para, pasty, pate, pauline, pedal, peer,
perfect, periodic, permit, pervert, pinta, placer, platy, polish, polish, poll,
pontificate, postulate, pram, prayer, precipitate, predate, predicate, prefix, preposition, present,
pretest, primer, proceeds, produce, progress, project, proportionate, prospect, protest, pussy,
putter, putting, quite, ragged, raven, re, read, reading, reading, real,
rebel, recall, recap, recitative, recollect, record, recreate, recreation, redress, refill,
refund, refuse, reject, relay, remake, repaint, reprint, reread, rerun, resent,
reside, resign, respray, resume, retard, retest, retread, rewrite, root, routed,
routing, row, rugged, rummy, sais, sake, sambuca, saucier, second, secrete,
secreted, secreting, segment, separate, sewer, shirk, shower, sin, skied, slaver,
slough, sow, spoof, squid, stingy, subject, subordinate, subvert, supply, supposed,
survey, suspect, syringes, tabulate, tales, tarrier, tarry, taxes, taxis, tear,
theron, thou, three-legged, tier, tinged, torment, transfer, transform, transplant, transport,
transpose, tush, two-legged, unionised, unionized, update, uplift, upset, use, used,
vale, violist, viva, ware, whinged, whoop, wicked, wind, windy, wino,
won, worsted, wound
tts_text_processing/letters_and_numbers.py
DELETED
@@ -1,96 +0,0 @@
-"""adapted from https://github.com/keithito/tacotron"""
-
-import re
-
-_letters_and_numbers_re = re.compile(
-    r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE
-)
-
-_hardware_re = re.compile(
-    "([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)", re.IGNORECASE
-)
-_hardware_key = {
-    "tb": "terabyte",
-    "gb": "gigabyte",
-    "mb": "megabyte",
-    "kb": "kilobyte",
-    "ghz": "gigahertz",
-    "mhz": "megahertz",
-    "khz": "kilohertz",
-    "hz": "hertz",
-    "mm": "millimeter",
-    "cm": "centimeter",
-    "km": "kilometer",
-}
-
-_dimension_re = re.compile(
-    r"\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b"
-)
-_dimension_key = {"m": "meter", "in": "inch", "inch": "inch"}
-
-
-def _expand_letters_and_numbers(m):
-    text = re.split(r"(\d+)", m.group(0))
-
-    # remove trailing space
-    if text[-1] == "":
-        text = text[:-1]
-    elif text[0] == "":
-        text = text[1:]
-
-    # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc...
-    if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit():
-        text[-2] = text[-2] + text[-1]
-        text = text[:-1]
-
-    # for combining digits 2 by 2
-    new_text = []
-    for i in range(len(text)):
-        string = text[i]
-        if string.isdigit() and len(string) < 5:
-            # heuristics
-            if len(string) > 2 and string[-2] == "0":
-                if string[-1] == "0":
-                    string = [string]
-                else:
-                    string = [string[:-3], string[-2], string[-1]]
-            elif len(string) % 2 == 0:
-                string = [string[i : i + 2] for i in range(0, len(string), 2)]
-            elif len(string) > 2:
-                string = [string[0]] + [
-                    string[i : i + 2] for i in range(1, len(string), 2)
-                ]
-            new_text.extend(string)
-        else:
-            new_text.append(string)
-
-    text = new_text
-    text = " ".join(text)
-    return text
-
-
-def _expand_hardware(m):
-    quantity, measure = m.groups(0)
-    measure = _hardware_key[measure.lower()]
-    if measure[-1] != "z" and float(quantity.replace(",", "")) > 1:
-        return "{} {}s".format(quantity, measure)
-    return "{} {}".format(quantity, measure)
-
-
-def _expand_dimension(m):
-    text = "".join([x for x in m.groups(0) if x != 0])
-    text = text.replace(" x ", " by ")
-    text = text.replace("x", " by ")
-    if text.endswith(tuple(_dimension_key.keys())):
-        if text[-2].isdigit():
-            text = "{} {}".format(text[:-1], _dimension_key[text[-1:]])
-        elif text[-3].isdigit():
-            text = "{} {}".format(text[:-2], _dimension_key[text[-2:]])
-    return text
-
-
-def normalize_letters_and_numbers(text):
-    text = re.sub(_hardware_re, _expand_hardware, text)
-    text = re.sub(_dimension_re, _expand_dimension, text)
-    text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text)
-    return text
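Examples of the behavior being dropped here (a sketch):

    # normalize_letters_and_numbers("a 500GB drive") -> "a 500 gigabytes drive"
    # normalize_letters_and_numbers("AK47") -> "AK 47"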
tts_text_processing/numerical.py
DELETED
@@ -1,175 +0,0 @@
-"""adapted from https://github.com/keithito/tacotron"""
-
-import inflect
-import re
-
-_magnitudes = ["trillion", "billion", "million", "thousand", "hundred", "m", "b", "t"]
-_magnitudes_key = {"m": "million", "b": "billion", "t": "trillion"}
-_measurements = "(f|c|k|d|m)"
-_measurements_key = {"f": "fahrenheit", "c": "celsius", "k": "thousand", "m": "meters"}
-_currency_key = {"$": "dollar", "£": "pound", "€": "euro", "₩": "won"}
-_inflect = inflect.engine()
-_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
-_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
-_currency_re = re.compile(
-    r"([\$€£₩])([0-9\.\,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]))?".format(
-        "|".join(_magnitudes)
-    ),
-    re.IGNORECASE,
-)
-_measurement_re = re.compile(
-    r"([0-9\.\,]*[0-9]+(\s)?{}\b)".format(_measurements), re.IGNORECASE
-)
-_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
-# _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?')
-_roman_re = re.compile(
-    r"\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b"
-)  # avoid I
-_multiply_re = re.compile(r"(\b[0-9]+)(x)([0-9]+)")
-_number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+")
-
-
-def _remove_commas(m):
-    return m.group(1).replace(",", "")
-
-
-def _expand_decimal_point(m):
-    return m.group(1).replace(".", " point ")
-
-
-def _expand_currency(m):
-    currency = _currency_key[m.group(1)]
-    quantity = m.group(2)
-    magnitude = m.group(3)
-
-    # remove commas from quantity to be able to convert to numerical
-    quantity = quantity.replace(",", "")
-
-    # check for million, billion, etc...
-    if magnitude is not None and magnitude.lower() in _magnitudes:
-        if len(magnitude) == 1:
-            magnitude = _magnitudes_key[magnitude.lower()]
-        return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency + "s")
-
-    parts = quantity.split(".")
-    if len(parts) > 2:
-        return quantity + " " + currency + "s"  # Unexpected format
-
-    dollars = int(parts[0]) if parts[0] else 0
-
-    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
-    if dollars and cents:
-        dollar_unit = currency if dollars == 1 else currency + "s"
-        cent_unit = "cent" if cents == 1 else "cents"
-        return "{} {}, {} {}".format(
-            _expand_hundreds(dollars),
-            dollar_unit,
-            _inflect.number_to_words(cents),
-            cent_unit,
-        )
-    elif dollars:
-        dollar_unit = currency if dollars == 1 else currency + "s"
-        return "{} {}".format(_expand_hundreds(dollars), dollar_unit)
-    elif cents:
-        cent_unit = "cent" if cents == 1 else "cents"
-        return "{} {}".format(_inflect.number_to_words(cents), cent_unit)
-    else:
-        return "zero" + " " + currency + "s"
-
-
-def _expand_hundreds(text):
-    number = float(text)
-    if number > 1000 < 10000 and (number % 100 == 0) and (number % 1000 != 0):
-        return _inflect.number_to_words(int(number / 100)) + " hundred"
-    else:
-        return _inflect.number_to_words(text)
-
-
-def _expand_ordinal(m):
-    return _inflect.number_to_words(m.group(0))
-
-
-def _expand_measurement(m):
-    _, number, measurement = re.split("(\d+(?:\.\d+)?)", m.group(0))
-    number = _inflect.number_to_words(number)
-    measurement = "".join(measurement.split())
-    measurement = _measurements_key[measurement.lower()]
-    return "{} {}".format(number, measurement)
-
-
-def _expand_range(m):
-    return " to "
-
-
-def _expand_multiply(m):
-    left = m.group(1)
-    right = m.group(3)
-    return "{} by {}".format(left, right)
-
-
-def _expand_roman(m):
-    # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
-    roman_numerals = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
-    result = 0
-    num = m.group(0)
-    for i, c in enumerate(num):
-        if (i + 1) == len(num) or roman_numerals[c] >= roman_numerals[num[i + 1]]:
-            result += roman_numerals[c]
-        else:
-            result -= roman_numerals[c]
-    return str(result)
-
-
-def _expand_number(m):
-    _, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0))
-    number = int(number)
-    if (
-        number > 1000
-        and number < 10000
-        and (number % 100 == 0)
-        and (number % 1000 != 0)
-    ):
-        text = _inflect.number_to_words(number // 100) + " hundred"
-    elif number > 1000 and number < 3000:
-        if number == 2000:
-            text = "two thousand"
-        elif number > 2000 and number < 2010:
-            text = "two thousand " + _inflect.number_to_words(number % 100)
-        elif number % 100 == 0:
-            text = _inflect.number_to_words(number // 100) + " hundred"
-        else:
-            number = _inflect.number_to_words(
-                number, andword="", zero="oh", group=2
-            ).replace(", ", " ")
-            number = re.sub(r"-", " ", number)
-            text = number
-    else:
-        number = _inflect.number_to_words(number, andword="and")
-        number = re.sub(r"-", " ", number)
-        number = re.sub(r",", "", number)
-        text = number
-
-    if suffix in ("'s", "s"):
-        if text[-1] == "y":
-            text = text[:-1] + "ies"
-        else:
-            text = text + suffix
-
-    return text
-
-
-def normalize_currency(text):
-    return re.sub(_currency_re, _expand_currency, text)
-
-
-def normalize_numbers(text):
-    text = re.sub(_comma_number_re, _remove_commas, text)
-    text = re.sub(_currency_re, _expand_currency, text)
-    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
-    text = re.sub(_ordinal_re, _expand_ordinal, text)
-    # text = re.sub(_range_re, _expand_range, text)
-    # text = re.sub(_measurement_re, _expand_measurement, text)
-    text = re.sub(_roman_re, _expand_roman, text)
-    text = re.sub(_multiply_re, _expand_multiply, text)
-    text = re.sub(_number_re, _expand_number, text)
-    return text
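Roughly what this English-specific normalizer did (a sketch; it required the inflect dependency, which this commit also drops from requirements.txt):

    # normalize_numbers("$3,000") -> "three thousand dollars"
    # normalize_numbers("Chapter XIV, 2nd edition") -> "Chapter fourteen, second edition"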
tts_text_processing/symbols.py
DELETED
@@ -1,144 +0,0 @@
-"""adapted from https://github.com/keithito/tacotron"""
-
-"""
-Defines the set of symbols used in text input to the model.
-
-The default is a set of ASCII characters that works well for English or text
-that has been run through Unidecode. For other data, you can modify
-_characters."""
-
-
-arpabet = [
-    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1",
-    "AH2", "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0",
-    "AY1", "AY2", "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0",
-    "ER1", "ER2", "EY", "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0",
-    "IH1", "IH2", "IY", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG",
-    "OW", "OW0", "OW1", "OW2", "OY", "OY0", "OY1", "OY2", "P", "R", "S", "SH",
-    "T", "TH", "UH", "UH0", "UH1", "UH2", "UW", "UW0", "UW1", "UW2", "V", "W",
-    "Y", "Z", "ZH",
-]
-
-
-def get_symbols(symbol_set):
-    if symbol_set == "english_basic":
-        _pad = "_"
-        _punctuation = "!'\"(),.:;? "
-        _special = "-"
-        _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
-        _arpabet = ["@" + s for s in arpabet]
-        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
-    elif symbol_set == "english_basic_lowercase":
-        _pad = "_"
-        _punctuation = "!'\"(),.:;? "
-        _special = "-"
-        _letters = "abcdefghijklmnopqrstuvwxyz"
-        _arpabet = ["@" + s for s in arpabet]
-        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
-    elif symbol_set == "english_expanded":
-        _punctuation = "!'\",.:;? "
-        _math = "#%&*+-/[]()"
-        _special = "_@©°½—₩€$"
-        _accented = "áçéêëñöøćž"
-        _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
-        _arpabet = ["@" + s for s in arpabet]
-        symbols = (
-            list(_punctuation + _math + _special + _accented + _letters) + _arpabet
-        )
-    elif symbol_set == "ukrainian":
-        _punctuation = "'.,?! "
-        _special = "-+"
-        _letters = "абвгґдежзийклмнопрстуфхцчшщьюяєії"
-        symbols = list(_punctuation + _special + _letters)
-    elif symbol_set == "radtts":
-        _punctuation = "!'\",.:;? "
-        _math = "#%&*+-/[]()"
-        _special = "_@©°½—₩€$"
-        _accented = "áçéêëñöøćž"
-        _numbers = "0123456789"
-        _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
-        _arpabet = ["@" + s for s in arpabet]
-        symbols = (
-            list(_punctuation + _math + _special + _accented + _numbers + _letters)
-            + _arpabet
-        )
-    else:
-        raise Exception("{} symbol set does not exist".format(symbol_set))
-
-    return symbols
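Of the branches above, only the "ukrainian" inventory survives this commit: it is exactly what the new in-module get_symbols() in text_processing.py below hard-codes. For reference (a sketch; the count is ours, not from the repo):

    # get_symbols("ukrainian") -> 41 single-character symbols:
    # list("'.,?! " + "-+" + "абвгґдежзийклмнопрстуфхцчшщьюяєії")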
tts_text_processing/text_processing.py
CHANGED
@@ -2,9 +2,8 @@
 
 import re
 import numpy as np
+
 from .cleaners import Cleaner
-from .symbols import get_symbols
-from .grapheme_dictionary import Grapheme2PhonemeDictionary
 
 
 #########
@@ -20,11 +19,14 @@ _words_re = re.compile(
 )
 
 
-def lines_to_list(filename):
-    with open(filename, encoding="utf-8") as f:
-        lines = f.readlines()
-    lines = [line.rstrip() for line in lines]
-    return lines
+def get_symbols():
+    _punctuation = "'.,?! "
+    _special = "-+"
+    _letters = "абвгґдежзийклмнопрстуфхцчшщьюяєії"
+
+    symbols = list(_punctuation + _special + _letters)
+
+    return symbols
 
 
 class TextProcessing(object):
@@ -42,18 +44,14 @@ class TextProcessing(object):
         add_bos_eos_to_text=False,
         encoding="latin-1",
     ):
-        if heteronyms_path is not None and heteronyms_path != "":
-            self.heteronyms = set(lines_to_list(heteronyms_path))
-        else:
-            self.heteronyms = []
-        # phoneme dict
+        self.heteronyms = []
         self.phonemedict = {}
 
         self.p_phoneme = p_phoneme
         self.handle_phoneme = handle_phoneme
        self.handle_phoneme_ambiguous = handle_phoneme_ambiguous
 
-        self.symbols = get_symbols(symbol_set)
+        self.symbols = get_symbols()
         self.cleaner_names = cleaner_name
         self.cleaner = Cleaner(cleaner_name, self.phonemedict)
 
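A short sketch of the new fixed symbol table in use; symbol_to_id mirrors the kind of lookup TextProcessing builds internally (an assumption, since the mapping code is not shown in this diff):

    symbols = get_symbols()
    symbol_to_id = {s: i for i, s in enumerate(symbols)}

    text = "привіт, світе!"
    ids = [symbol_to_id[ch] for ch in text if ch in symbol_to_id]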