- .dockerignore +3 -0
- .gitignore +9 -0
- RADTTS-LICENSE +19 -0
- README.md +24 -7
- alignment.py +54 -0
- app.py +356 -0
- attribute_prediction_model.py +402 -0
- audio_processing.py +328 -0
- autoregressive_flow.py +259 -0
- common.py +1083 -0
- configs/radtts-pp-dap-model.json +218 -0
- data.py +606 -0
- distributed.py +161 -0
- filelists/3speakers_ukrainian_train_filelist.txt +0 -0
- filelists/3speakers_ukrainian_train_filelist_dc.txt +0 -0
- filelists/3speakers_ukrainian_val_filelist.txt +85 -0
- filelists/3speakers_ukrainian_val_filelist_dc.txt +85 -0
- loss.py +228 -0
- partialconv1d.py +77 -0
- radam.py +114 -0
- radtts.py +936 -0
- requirements-dev.txt +1 -0
- requirements.txt +15 -0
- splines.py +326 -0
- transformer.py +219 -0
- tts_text_processing/LICENSE +19 -0
- tts_text_processing/abbreviations.py +57 -0
- tts_text_processing/acronyms.py +69 -0
- tts_text_processing/cleaners.py +123 -0
- tts_text_processing/cmudict.py +140 -0
- tts_text_processing/datestime.py +24 -0
- tts_text_processing/grapheme_dictionary.py +37 -0
- tts_text_processing/heteronyms +413 -0
- tts_text_processing/letters_and_numbers.py +96 -0
- tts_text_processing/numerical.py +175 -0
- tts_text_processing/symbols.py +144 -0
- tts_text_processing/text_processing.py +201 -0
.dockerignore
ADDED
@@ -0,0 +1,3 @@
+.ruff_cache/
+.venv/
+models/
.gitignore
ADDED
@@ -0,0 +1,9 @@
+.idea/
+.venv/
+.ruff_cache/
+__pycache__/
+
+flagged/
+models/
+
+audio.wav
RADTTS-LICENSE
ADDED
@@ -0,0 +1,19 @@
+Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,29 @@
 ---
-
-
-colorFrom: indigo
-colorTo: indigo
+license: apache-2.0
+title: RAD-TTS++ Ukrainian (Vocos)
 sdk: gradio
+emoji: 🎧
+colorFrom: blue
+colorTo: gray
+short_description: Use RAD-TTS++ model to synthesize text in Ukrainian
 sdk_version: 5.19.0
-app_file: app.py
-pinned: false
 ---
 
-
+## Install
+
+```shell
+uv venv --python 3.10
+
+source .venv/bin/activate
+
+uv pip install -r requirements.txt
+
+# in development mode
+uv pip install -r requirements-dev.txt
+```
+
+## Run
+
+```shell
+python app.py
+```
alignment.py
ADDED
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+import numpy as np
+from numba import jit
+
+
+@jit(nopython=True)
+def mas_width1(attn_map):
+    """mas with hardcoded width=1"""
+    # assumes mel x text
+    opt = np.zeros_like(attn_map)
+    attn_map = np.log(attn_map)
+    attn_map[0, 1:] = -np.inf
+    log_p = np.zeros_like(attn_map)
+    log_p[0, :] = attn_map[0, :]
+    prev_ind = np.zeros_like(attn_map, dtype=np.int64)
+    for i in range(1, attn_map.shape[0]):
+        for j in range(attn_map.shape[1]):  # for each text dim
+            prev_log = log_p[i - 1, j]
+            prev_j = j
+
+            if j - 1 >= 0 and log_p[i - 1, j - 1] >= log_p[i - 1, j]:
+                prev_log = log_p[i - 1, j - 1]
+                prev_j = j - 1
+
+            log_p[i, j] = attn_map[i, j] + prev_log
+            prev_ind[i, j] = prev_j
+
+    # now backtrack
+    curr_text_idx = attn_map.shape[1] - 1
+    for i in range(attn_map.shape[0] - 1, -1, -1):
+        opt[i, curr_text_idx] = 1
+        curr_text_idx = prev_ind[i, curr_text_idx]
+    opt[0, curr_text_idx] = 1
+    return opt
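A quick sanity-check sketch for the file above (not part of the commit; the random attention map is an assumption for illustration): `mas_width1` takes a mel-by-text soft attention map and returns a hard monotonic alignment of the same shape, with exactly one active text token per mel frame.

```python
# Toy illustration of mas_width1 on a row-normalized random attention map.
import numpy as np

from alignment import mas_width1

rng = np.random.default_rng(seed=0)
attn_soft = rng.random((12, 5))  # 12 mel frames x 5 text tokens
attn_soft /= attn_soft.sum(axis=1, keepdims=True)  # row-normalize like a softmax

attn_hard = mas_width1(attn_soft)

assert attn_hard.shape == attn_soft.shape
assert (attn_hard.sum(axis=1) == 1).all()  # exactly one token per mel frame
assert (np.diff(attn_hard.argmax(axis=1)) >= 0).all()  # path never moves backwards
```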
app.py
ADDED
@@ -0,0 +1,356 @@
+import os
+import sys
+import json
+import time
+
+from importlib.metadata import version
+from enum import Enum
+
+from huggingface_hub import hf_hub_download
+
+use_zerogpu = False
+
+try:
+    import spaces  # it's for ZeroGPU
+    use_zerogpu = True
+    print("ZeroGPU is available, changing inference call.")
+except ImportError:
+    print("ZeroGPU is not available, skipping...")
+
+import gradio as gr
+
+import torch
+import torchaudio
+
+# Vocos
+from vocos import Vocos
+
+# RAD-TTS code
+from radtts import RADTTS
+from data import Data
+from common import update_params
+
+use_cuda = torch.cuda.is_available()
+
+if use_cuda:
+    print("CUDA is available, setting correct inference_device variable.")
+    device = "cuda"
+else:
+    device = "cpu"
+
+
+def download_file_from_repo(
+    repo_id: str,
+    filename: str,
+    local_dir: str = ".",
+    repo_type: str = "model",
+) -> str:
+    try:
+        os.makedirs(local_dir, exist_ok=True)
+
+        file_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_dir=local_dir,
+            cache_dir=None,
+            force_download=False,
+            repo_type=repo_type,
+        )
+
+        return file_path
+    except Exception as e:
+        raise Exception(f"An error occurred during download: {e}") from e
+
+
+download_file_from_repo(
+    "Yehor/radtts-uk",
+    "radtts-pp-dap-model/model_dap_84000.pt",
+    "./models/",
+)
+
+# Init the model
+seed = 1234
+
+config = "configs/radtts-pp-dap-model.json"
+radtts_path = "models/radtts-pp-dap-model/model_dap_84000.pt"
+
+params = []
+
+# Load the config
+with open(config) as f:
+    data = f.read()
+
+config = json.loads(data)
+update_params(config, params)
+
+data_config = config["data_config"]
+model_config = config["model_config"]
+
+# Seed
+torch.manual_seed(seed)
+torch.cuda.manual_seed(seed)
+
+# Load vocoder
+vocos = Vocos.from_pretrained("patriotyk/vocos-mel-hifigan-compat-44100khz").to(device)
+
+# Load RAD-TTS
+if use_cuda:
+    radtts = RADTTS(**model_config).cuda()
+else:
+    radtts = RADTTS(**model_config)
+
+radtts.enable_inverse_cache()  # cache inverse matrix for 1x1 invertible convs
+
+checkpoint_dict = torch.load(radtts_path, map_location="cpu")  # todo: CPU?
+radtts.load_state_dict(checkpoint_dict["state_dict"], strict=False)
+radtts.eval()
+
+print(f"Loaded checkpoint '{radtts_path}')")
+
+ignore_keys = ["training_files", "validation_files"]
+trainset = Data(
+    data_config["training_files"],
+    **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
+)
+
+# Config
+concurrency_limit = 5
+
+title = "RAD-TTS++ Ukrainian"
+
+# https://www.tablesgenerator.com/markdown_tables
+authors_table = """
+## Authors
+
+Follow them on social networks and **contact** if you need any help or have any questions:
+
+| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+|-------------------------------------------------------------------------------------------------|
+| https://t.me/smlkw in Telegram |
+| https://x.com/yehor_smoliakov at X |
+| https://github.com/egorsmkv at GitHub |
+| https://huggingface.co/Yehor at Hugging Face |
+| or use [email protected] |
+""".strip()
+
+description_head = f"""
+# {title}
+
+## Overview
+
+Type your text in Ukrainian and select a voice to synthesize speech using [the RAD-TTS++ model](https://huggingface.co/Yehor/radtts-uk) and [Vocos](https://huggingface.co/patriotyk/vocos-mel-hifigan-compat-44100khz) with 44100 Hz.
+""".strip()
+
+description_foot = f"""
+{authors_table}
+""".strip()
+
+tech_env = f"""
+#### Environment
+
+- Python: {sys.version}
+""".strip()
+
+tech_libraries = f"""
+#### Libraries
+
+- gradio: {version("gradio")}
+- torch: {version("torch")}
+- scipy: {version("scipy")}
+- numba: {version("numba")}
+- librosa: {version("librosa")}
+- unidecode: {version("unidecode")}
+- inflect: {version("inflect")}
+""".strip()
+
+
+class VoiceOption(Enum):
+    Tetiana = "Tetiana (female) 👩"
+    Mykyta = "Mykyta (male) 👨"
+    Lada = "Lada (female) 👩"
+
+
+voice_mapping = {
+    VoiceOption.Tetiana.value: "tetiana",
+    VoiceOption.Mykyta.value: "mykyta",
+    VoiceOption.Lada.value: "lada",
+}
+
+
+examples = [
+    [
+        "Прокинувся ґазда вранці. Пішов, вичистив з-під коня, вичистив з-під бика, вичистив з-під овечок, вибрав молодняк, відніс його набік.",
+        VoiceOption.Mykyta.value,
+    ],
+    [
+        "Пішов взяв сіна, дав корові. Пішов взяв сіна, дав бикові. Ячміню коняці насипав. Зайшов почистив корову, зайшов почистив бика, зайшов почистив коня, за яйця його мацнув.",
+        VoiceOption.Lada.value,
+    ],
+    [
+        "Кінь ногою здригнув, на хазяїна ласкавим оком подивився. Тоді дядько пішов відкрив курей, гусей, качок, повиносив їм зерна, огірків нарізаних, нагодував. Коли чує – з хати дружина кличе. Зайшов. Дітки повмивані, сидять за столом, всі чекають тата. Взяв він ложку, перехрестив дітей, перехрестив лоба, почали снідати. Поснідали, він дістав пряників, роздав дітям. Діти зібралися, пішли в школу. Дядько вийшов, сів на призьбі, взяв сапку, почав мантачити. Мантачив-мантачив, коли – жінка виходить. Він їй ту сапку дає, ласкаво за сраку вщипнув, жінка до нього лагідно всміхнулася, пішла на город – сапати. Коли – йде пастух і товар кличе в череду. Повідмикав дядько овечок, коровку, бика, коня, все відпустив. Сів попри хати, дістав табАку, відірвав шмат газети, насипав, наслинив собі гарну таку цигарку. Благодать божа – і сонечко вже здійнялося над деревами. Дядько встромив цигарку в рота, дістав сірники, тільки чиркати – коли раптом з хати: Доброе утро! Московское время – шесть часов утра! Витяг дядько цигарку с рота, сплюнув набік, і сам собі каже: Ана маєш. Прокинулись, бляді!",
+        VoiceOption.Tetiana.value,
+    ],
+]
+
+
+def inference(text, voice):
+    if not text:
+        raise gr.Error("Please paste your text.")
+
+    gr.Info("Starting...", duration=0.5)
+
+    speaker = voice_mapping[voice]
+    speaker = speaker_text = speaker_attributes = speaker
+
+    n_takes = 1
+
+    sigma = 0.8  # sampling sigma for decoder
+    sigma_tkndur = 0.666  # sampling sigma for duration
+    sigma_f0 = 1.0  # sampling sigma for f0
+    sigma_energy = 1.0  # sampling sigma for energy avg
+
+    token_dur_scaling = 1.0
+
+    f0_mean = 0
+    f0_std = 0
+    energy_mean = 0
+    energy_std = 0
+
+    if use_cuda:
+        speaker_id = trainset.get_speaker_id(speaker).cuda()
+        speaker_id_text, speaker_id_attributes = speaker_id, speaker_id
+
+        if speaker_text is not None:
+            speaker_id_text = trainset.get_speaker_id(speaker_text).cuda()
+
+        if speaker_attributes is not None:
+            speaker_id_attributes = trainset.get_speaker_id(speaker_attributes).cuda()
+
+        tensor_text = trainset.get_text(text).cuda()[None]
+    else:
+        speaker_id = trainset.get_speaker_id(speaker)
+        speaker_id_text, speaker_id_attributes = speaker_id, speaker_id
+
+        if speaker_text is not None:
+            speaker_id_text = trainset.get_speaker_id(speaker_text)
+
+        if speaker_attributes is not None:
+            speaker_id_attributes = trainset.get_speaker_id(speaker_attributes)
+
+        tensor_text = trainset.get_text(text)[None]
+
+    inference_start = time.time()
+
+    for take in range(n_takes):
+        with torch.autocast(device, enabled=False):
+            with torch.inference_mode():
+                outputs = radtts.infer(
+                    speaker_id,
+                    tensor_text,
+                    sigma,
+                    sigma_tkndur,
+                    sigma_f0,
+                    sigma_energy,
+                    token_dur_scaling,
+                    token_duration_max=100,
+                    speaker_id_text=speaker_id_text,
+                    speaker_id_attributes=speaker_id_attributes,
+                    f0_mean=f0_mean,
+                    f0_std=f0_std,
+                    energy_mean=energy_mean,
+                    energy_std=energy_std,
+                    use_cuda=use_cuda,
+                )
+
+                mel = outputs["mel"]
+
+                gr.Info(
+                    "Synthesized MEL spectrogram, converting to WAVE.", duration=0.5
+                )
+
+                wav_gen = vocos.decode(mel)
+                wav_gen_float = wav_gen.cpu()
+
+                torchaudio.save("audio.wav", wav_gen_float, 44_100, encoding="PCM_S")
+
+    duration = len(wav_gen_float[0]) / 44_100
+
+    elapsed_time = time.time() - inference_start
+    rtf = elapsed_time / duration
+
+    speed_ratio = duration / elapsed_time
+    speech_rate = len(text.split(" ")) / duration
+
+    rtf_value = f"Real-Time Factor: {round(rtf, 4)}, time: {round(elapsed_time, 4)} seconds, audio duration: {round(duration, 4)} seconds. Speed ratio: {round(speed_ratio, 2)}x. Speech rate: {round(speech_rate, 4)} words-per-second."
+
+    gr.Success("Finished!", duration=0.5)
+
+    return [gr.Audio("audio.wav"), rtf_value]
+
+
+try:
+    @spaces.GPU
+    def inference_zerogpu(text, voice):
+        return inference(text, voice)
+except NameError:
+    print("ZeroGPU is not available, skipping...")
+
+
+def inference_cpu(text, voice):
+    return inference(text, voice)
+
+
+demo = gr.Blocks(
+    title=title,
+    analytics_enabled=False,
+    theme=gr.themes.Base(),
+)
+
+with demo:
+    gr.Markdown(description_head)
+
+    gr.Markdown("## Usage")
+
+    with gr.Row():
+        with gr.Column():
+            audio = gr.Audio(label="Synthesized audio")
+            rtf = gr.Markdown(
+                label="Real-Time Factor",
+                value="Here you will see how fast the model and the speaker is.",
+            )
+
+    with gr.Row():
+        with gr.Column():
+            text = gr.Text(
+                label="Text",
+                value="Сл+ава Укра+їні! — українське вітання, національне гасло.",
+            )
+            voice = gr.Radio(
+                label="Voice",
+                choices=[option.value for option in VoiceOption],
+                value=VoiceOption.Tetiana.value,
+            )
+
+    gr.Button("Run").click(
+        inference_zerogpu if use_zerogpu else inference_cpu,
+        concurrency_limit=concurrency_limit,
+        inputs=[text, voice],
+        outputs=[audio, rtf],
+    )
+
+    with gr.Row():
+        gr.Examples(
+            label="Choose an example",
+            inputs=[text, voice],
+            examples=examples,
+        )
+
+    gr.Markdown(description_foot)
+
+    gr.Markdown("### Gradio app uses:")
+    gr.Markdown(tech_env)
+    gr.Markdown(tech_libraries)
+
+if __name__ == "__main__":
+    demo.queue()
+    demo.launch()
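For completeness, a hedged sketch of driving the deployed app remotely with `gradio_client`; the Space id and `api_name` below are assumptions, not values from the commit — check the Space's "Use via API" page for the real ones.

```python
# Hypothetical remote call; the Space id and api_name are assumptions.
from gradio_client import Client

client = Client("Yehor/radtts-uk")  # assumed Space id
audio_path, rtf_report = client.predict(
    "Сл+ава Укра+їні! — українське вітання, національне гасло.",  # text
    "Tetiana (female) 👩",  # voice; must match a VoiceOption value
    api_name="/inference_cpu",  # assumed endpoint name
)
print(audio_path, rtf_report)
```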
attribute_prediction_model.py
ADDED
@@ -0,0 +1,402 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+import torch
+from torch import nn
+from common import ConvNorm, Invertible1x1Conv
+from common import AffineTransformationLayer, SplineTransformationLayer
+from common import ConvLSTMLinear
+from transformer import FFTransformer
+from autoregressive_flow import AR_Step, AR_Back_Step
+
+
+def get_attribute_prediction_model(config):
+    name = config["name"]
+    hparams = config["hparams"]
+    if name == "dap":
+        model = DAP(**hparams)
+    elif name == "bgap":
+        model = BGAP(**hparams)
+    elif name == "agap":
+        model = AGAP(**hparams)
+    else:
+        raise Exception("{} model is not supported".format(name))
+
+    return model
+
+
+class AttributeProcessing:
+    def __init__(self, take_log_of_input=False):
+        super(AttributeProcessing).__init__()
+        self.take_log_of_input = take_log_of_input
+
+    def normalize(self, x):
+        if self.take_log_of_input:
+            x = torch.log(x + 1)
+        return x
+
+    def denormalize(self, x):
+        if self.take_log_of_input:
+            x = torch.exp(x) - 1
+        return x
+
+
+class BottleneckLayerLayer(nn.Module):
+    def __init__(
+        self,
+        in_dim,
+        reduction_factor,
+        norm="weightnorm",
+        non_linearity="relu",
+        kernel_size=3,
+        use_partial_padding=False,
+    ):
+        super(BottleneckLayerLayer, self).__init__()
+
+        self.reduction_factor = reduction_factor
+        reduced_dim = int(in_dim / reduction_factor)
+        self.out_dim = reduced_dim
+        if self.reduction_factor > 1:
+            fn = ConvNorm(
+                in_dim,
+                reduced_dim,
+                kernel_size=kernel_size,
+                use_weight_norm=(norm == "weightnorm"),
+            )
+            if norm == "instancenorm":
+                fn = nn.Sequential(fn, nn.InstanceNorm1d(reduced_dim, affine=True))
+
+            self.projection_fn = fn
+            self.non_linearity = nn.ReLU()
+            if non_linearity == "leakyrelu":
+                self.non_linearity = nn.LeakyReLU()
+
+    def forward(self, x):
+        if self.reduction_factor > 1:
+            x = self.projection_fn(x)
+            x = self.non_linearity(x)
+        return x
+
+
+class DAP(nn.Module):
+    def __init__(
+        self,
+        n_speaker_dim,
+        bottleneck_hparams,
+        take_log_of_input,
+        arch_hparams,
+        use_transformer=False,
+    ):
+        super(DAP, self).__init__()
+        self.attribute_processing = AttributeProcessing(take_log_of_input)
+        self.bottleneck_layer = BottleneckLayerLayer(**bottleneck_hparams)
+
+        arch_hparams["in_dim"] = self.bottleneck_layer.out_dim + n_speaker_dim
+        if use_transformer:
+            self.feat_pred_fn = FFTransformer(**arch_hparams)
+        else:
+            self.feat_pred_fn = ConvLSTMLinear(**arch_hparams)
+
+    def forward(self, txt_enc, spk_emb, x, lens):
+        if x is not None:
+            x = self.attribute_processing.normalize(x)
+
+        txt_enc = self.bottleneck_layer(txt_enc)
+        spk_emb_expanded = spk_emb[..., None].expand(-1, -1, txt_enc.shape[2])
+        context = torch.cat((txt_enc, spk_emb_expanded), 1)
+
+        x_hat = self.feat_pred_fn(context, lens)
+
+        outputs = {"x_hat": x_hat, "x": x}
+        return outputs
+
+    def infer(self, z, txt_enc, spk_emb, lens=None):
+        x_hat = self.forward(txt_enc, spk_emb, x=None, lens=lens)["x_hat"]
+        x_hat = self.attribute_processing.denormalize(x_hat)
+        return x_hat
+
+
+class BGAP(torch.nn.Module):
+    def __init__(
+        self,
+        n_in_dim,
+        n_speaker_dim,
+        bottleneck_hparams,
+        n_flows,
+        n_group_size,
+        n_layers,
+        with_dilation,
+        kernel_size,
+        scaling_fn,
+        take_log_of_input=False,
+        n_channels=1024,
+        use_quadratic=False,
+        n_bins=8,
+        n_spline_steps=2,
+    ):
+        super(BGAP, self).__init__()
+        # assert(n_group_size % 2 == 0)
+        self.n_flows = n_flows
+        self.n_group_size = n_group_size
+        self.transforms = torch.nn.ModuleList()
+        self.convinv = torch.nn.ModuleList()
+        self.n_speaker_dim = n_speaker_dim
+        self.scaling_fn = scaling_fn
+        self.attribute_processing = AttributeProcessing(take_log_of_input)
+        self.n_spline_steps = n_spline_steps
+        self.bottleneck_layer = BottleneckLayerLayer(**bottleneck_hparams)
+        n_txt_reduced_dim = self.bottleneck_layer.out_dim
+        context_dim = n_txt_reduced_dim * n_group_size + n_speaker_dim
+
+        if self.n_group_size > 1:
+            self.unfold_params = {
+                "kernel_size": (n_group_size, 1),
+                "stride": n_group_size,
+                "padding": 0,
+                "dilation": 1,
+            }
+            self.unfold = nn.Unfold(**self.unfold_params)
+
+        for k in range(n_flows):
+            self.convinv.append(Invertible1x1Conv(n_in_dim * n_group_size))
+            if k >= n_flows - self.n_spline_steps:
+                left = -3
+                right = 3
+                top = 3
+                bottom = -3
+                self.transforms.append(
+                    SplineTransformationLayer(
+                        n_in_dim * n_group_size,
+                        context_dim,
+                        n_layers,
+                        with_dilation=with_dilation,
+                        kernel_size=kernel_size,
+                        scaling_fn=scaling_fn,
+                        n_channels=n_channels,
+                        top=top,
+                        bottom=bottom,
+                        left=left,
+                        right=right,
+                        use_quadratic=use_quadratic,
+                        n_bins=n_bins,
+                    )
+                )
+            else:
+                self.transforms.append(
+                    AffineTransformationLayer(
+                        n_in_dim * n_group_size,
+                        context_dim,
+                        n_layers,
+                        with_dilation=with_dilation,
+                        kernel_size=kernel_size,
+                        scaling_fn=scaling_fn,
+                        affine_model="simple_conv",
+                        n_channels=n_channels,
+                    )
+                )
+
+    def fold(self, data):
+        """Inverse of the self.unfold(data.unsqueeze(-1)) operation used for
+        the grouping or "squeeze" operation on input
+
+        Args:
+            data: B x C x T tensor of temporal data
+        """
+        output_size = (data.shape[2] * self.n_group_size, 1)
+        data = nn.functional.fold(
+            data, output_size=output_size, **self.unfold_params
+        ).squeeze(-1)
+        return data
+
+    def preprocess_context(self, txt_emb, speaker_vecs, std_scale=None):
+        if self.n_group_size > 1:
+            txt_emb = self.unfold(txt_emb[..., None])
+        speaker_vecs = speaker_vecs[..., None].expand(-1, -1, txt_emb.shape[2])
+        context = torch.cat((txt_emb, speaker_vecs), 1)
+        return context
+
+    def forward(self, txt_enc, spk_emb, x, lens):
+        """x<tensor>: duration or pitch or energy average"""
+        assert txt_enc.size(2) >= x.size(1)
+        if len(x.shape) == 2:
+            # add channel dimension
+            x = x[:, None]
+        txt_enc = self.bottleneck_layer(txt_enc)
+
+        # lens including padded values
+        lens_grouped = (lens // self.n_group_size).long()
+        context = self.preprocess_context(txt_enc, spk_emb)
+        x = self.unfold(x[..., None])
+        log_s_list, log_det_W_list = [], []
+        for k in range(self.n_flows):
+            x, log_s = self.transforms[k](x, context, seq_lens=lens_grouped)
+            x, log_det_W = self.convinv[k](x)
+            log_det_W_list.append(log_det_W)
+            log_s_list.append(log_s)
+        # prepare outputs
+        outputs = {"z": x, "log_det_W_list": log_det_W_list, "log_s_list": log_s_list}
+
+        return outputs
+
+    def infer(self, z, txt_enc, spk_emb, seq_lens):
+        txt_enc = self.bottleneck_layer(txt_enc)
+        context = self.preprocess_context(txt_enc, spk_emb)
+        lens_grouped = (seq_lens // self.n_group_size).long()
+        z = self.unfold(z[..., None])
+        for k in reversed(range(self.n_flows)):
+            z = self.convinv[k](z, inverse=True)
+            z = self.transforms[k].forward(
+                z, context, inverse=True, seq_lens=lens_grouped
+            )
+        # z mapped to input domain
+        x_hat = self.fold(z)
+        # pad on the way out
+        return x_hat
+
+
+class AGAP(torch.nn.Module):
+    def __init__(
+        self,
+        n_in_dim,
+        n_speaker_dim,
+        n_flows,
+        n_hidden,
+        n_lstm_layers,
+        bottleneck_hparams,
+        scaling_fn="exp",
+        take_log_of_input=False,
+        p_dropout=0.0,
+        setup="",
+        spline_flow_params=None,
+        n_group_size=1,
+    ):
+        super(AGAP, self).__init__()
+        self.flows = torch.nn.ModuleList()
+        self.n_group_size = n_group_size
+        self.n_speaker_dim = n_speaker_dim
+        self.attribute_processing = AttributeProcessing(take_log_of_input)
+        self.n_in_dim = n_in_dim
+        self.bottleneck_layer = BottleneckLayerLayer(**bottleneck_hparams)
+        n_txt_reduced_dim = self.bottleneck_layer.out_dim
+
+        if self.n_group_size > 1:
+            self.unfold_params = {
+                "kernel_size": (n_group_size, 1),
+                "stride": n_group_size,
+                "padding": 0,
+                "dilation": 1,
+            }
+            self.unfold = nn.Unfold(**self.unfold_params)
+
+        if spline_flow_params is not None:
+            spline_flow_params["n_in_channels"] *= self.n_group_size
+
+        for i in range(n_flows):
+            if i % 2 == 0:
+                self.flows.append(
+                    AR_Step(
+                        n_in_dim * n_group_size,
+                        n_speaker_dim,
+                        n_txt_reduced_dim * n_group_size,
+                        n_hidden,
+                        n_lstm_layers,
+                        scaling_fn,
+                        spline_flow_params,
+                    )
+                )
+            else:
+                self.flows.append(
+                    AR_Back_Step(
+                        n_in_dim * n_group_size,
+                        n_speaker_dim,
+                        n_txt_reduced_dim * n_group_size,
+                        n_hidden,
+                        n_lstm_layers,
+                        scaling_fn,
+                        spline_flow_params,
+                    )
+                )
+
+    def fold(self, data):
+        """Inverse of the self.unfold(data.unsqueeze(-1)) operation used for
+        the grouping or "squeeze" operation on input
+
+        Args:
+            data: B x C x T tensor of temporal data
+        """
+        output_size = (data.shape[2] * self.n_group_size, 1)
+        data = nn.functional.fold(
+            data, output_size=output_size, **self.unfold_params
+        ).squeeze(-1)
+        return data
+
+    def preprocess_context(self, txt_emb, speaker_vecs):
+        if self.n_group_size > 1:
+            txt_emb = self.unfold(txt_emb[..., None])
+        speaker_vecs = speaker_vecs[..., None].expand(-1, -1, txt_emb.shape[2])
+        context = torch.cat((txt_emb, speaker_vecs), 1)
+        return context
+
+    def forward(self, txt_emb, spk_emb, x, lens):
+        """x<tensor>: duration or pitch or energy average"""
+
+        x = x[:, None] if len(x.shape) == 2 else x  # add channel dimension
+        if self.n_group_size > 1:
+            x = self.unfold(x[..., None])
+        x = x.permute(2, 0, 1)  # permute to time, batch, dims
+        x = self.attribute_processing.normalize(x)
+
+        txt_emb = self.bottleneck_layer(txt_emb)
+        context = self.preprocess_context(txt_emb, spk_emb)
+        context = context.permute(2, 0, 1)  # permute to time, batch, dims
+
+        lens_groupped = (lens / self.n_group_size).long()
+        log_s_list = []
+        for i, flow in enumerate(self.flows):
+            x, log_s = flow(x, context, lens_groupped)
+            log_s_list.append(log_s)
+
+        x = x.permute(1, 2, 0)  # x mapped to z
+        log_s_list = [log_s_elt.permute(1, 2, 0) for log_s_elt in log_s_list]
+        outputs = {"z": x, "log_s_list": log_s_list, "log_det_W_list": []}
+        return outputs
+
+    def infer(self, z, txt_emb, spk_emb, seq_lens=None):
+        if self.n_group_size > 1:
+            n_frames = z.shape[2]
+            z = self.unfold(z[..., None])
+        z = z.permute(2, 0, 1)  # permute to time, batch, dims
+
+        txt_emb = self.bottleneck_layer(txt_emb)
+        context = self.preprocess_context(txt_emb, spk_emb)
+        context = context.permute(2, 0, 1)  # permute to time, batch, dims
+
+        for i, flow in enumerate(reversed(self.flows)):
+            z = flow.infer(z, context)
+
+        x_hat = z.permute(1, 2, 0)
+        if self.n_group_size > 1:
+            x_hat = self.fold(x_hat)
+            if n_frames > x_hat.shape[2]:
+                m = nn.ReflectionPad1d((0, n_frames - x_hat.shape[2]))
+                x_hat = m(x_hat)
+
+        x_hat = self.attribute_processing.denormalize(x_hat)
+        return x_hat
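The BGAP and AGAP flows above rely on the same `unfold`/`fold` pair to group time steps into the channel axis (the "squeeze" operation) and back. A torch-only sketch with assumed shapes shows the two are exact inverses when the sequence length is divisible by the group size:

```python
# Sketch of the grouping trick; shapes are illustrative assumptions.
import torch
from torch import nn

n_group_size = 2
unfold_params = {
    "kernel_size": (n_group_size, 1),
    "stride": n_group_size,
    "padding": 0,
    "dilation": 1,
}
unfold = nn.Unfold(**unfold_params)

x = torch.randn(4, 8, 10)       # B x C x T, T divisible by n_group_size
grouped = unfold(x[..., None])  # B x (C * n_group_size) x (T // n_group_size)
assert grouped.shape == (4, 16, 5)

restored = nn.functional.fold(
    grouped, output_size=(10, 1), **unfold_params
).squeeze(-1)
assert torch.equal(restored, x)  # non-overlapping fold inverts unfold exactly
```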
audio_processing.py
ADDED
@@ -0,0 +1,328 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+import torch
+import numpy as np
+from scipy.signal import get_window
+from librosa.filters import mel as librosa_mel_fn
+import librosa.util as librosa_util
+
+
+def window_sumsquare(
+    window,
+    n_frames,
+    hop_length=200,
+    win_length=800,
+    n_fft=800,
+    dtype=np.float32,
+    norm=None,
+):
+    """
+    # from librosa 0.6
+    Compute the sum-square envelope of a window function at a given hop length.
+
+    This is used to estimate modulation effects induced by windowing
+    observations in short-time fourier transforms.
+
+    Parameters
+    ----------
+    window : string, tuple, number, callable, or list-like
+        Window specification, as in `get_window`
+
+    n_frames : int > 0
+        The number of analysis frames
+
+    hop_length : int > 0
+        The number of samples to advance between frames
+
+    win_length : [optional]
+        The length of the window function. By default, this matches `n_fft`.
+
+    n_fft : int > 0
+        The length of each analysis frame.
+
+    dtype : np.dtype
+        The data type of the output
+
+    Returns
+    -------
+    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
+        The sum-squared envelope of the window function
+    """
+    if win_length is None:
+        win_length = n_fft
+
+    n = n_fft + hop_length * (n_frames - 1)
+    x = np.zeros(n, dtype=dtype)
+
+    # Compute the squared window at the desired length
+    win_sq = get_window(window, win_length, fftbins=True)
+    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
+    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
+
+    # Fill the envelope
+    for i in range(n_frames):
+        sample = i * hop_length
+        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
+    return x
+
+
+def griffin_lim(magnitudes, stft_fn, n_iters=30):
+    """
+    PARAMS
+    ------
+    magnitudes: spectrogram magnitudes
+    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
+    """
+
+    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
+    angles = angles.astype(np.float32)
+    angles = torch.autograd.Variable(torch.from_numpy(angles))
+    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+
+    for i in range(n_iters):
+        _, angles = stft_fn.transform(signal)
+        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+    return signal
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    """
+    PARAMS
+    ------
+    C: compression factor
+    """
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+    """
+    PARAMS
+    ------
+    C: compression factor used to compress
+    """
+    return torch.exp(x) / C
+
+
+class TacotronSTFT(torch.nn.Module):
+    def __init__(
+        self,
+        filter_length=1024,
+        hop_length=256,
+        win_length=1024,
+        n_mel_channels=80,
+        sampling_rate=22050,
+        mel_fmin=0.0,
+        mel_fmax=None,
+    ):
+        super(TacotronSTFT, self).__init__()
+        self.n_mel_channels = n_mel_channels
+        self.sampling_rate = sampling_rate
+        self.stft_fn = STFT(filter_length, hop_length, win_length)
+        mel_basis = librosa_mel_fn(
+            sr=sampling_rate,
+            n_fft=filter_length,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax,
+        )
+        mel_basis = torch.from_numpy(mel_basis).float()
+        self.register_buffer("mel_basis", mel_basis)
+
+    def spectral_normalize(self, magnitudes):
+        output = dynamic_range_compression(magnitudes)
+        return output
+
+    def spectral_de_normalize(self, magnitudes):
+        output = dynamic_range_decompression(magnitudes)
+        return output
+
+    def mel_spectrogram(self, y):
+        """Computes mel-spectrograms from a batch of waves
+        PARAMS
+        ------
+        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
+
+        RETURNS
+        -------
+        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
+        """
+        assert torch.min(y.data) >= -1
+        assert torch.max(y.data) <= 1
+
+        magnitudes, phases = self.stft_fn.transform(y)
+        magnitudes = magnitudes.data
+        mel_output = torch.matmul(self.mel_basis, magnitudes)
+        mel_output = self.spectral_normalize(mel_output)
+        return mel_output
+
+
+"""
+BSD 3-Clause License
+
+Copyright (c) 2017, Prem Seetharaman
+All rights reserved.
+
+* Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from this
+software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+import torch.nn.functional as F
+from torch.autograd import Variable
+from scipy.signal import get_window
+from librosa.util import pad_center, tiny
+
+
+class STFT(torch.nn.Module):
+    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+
+    def __init__(
+        self, filter_length=800, hop_length=200, win_length=800, window="hann"
+    ):
+        super(STFT, self).__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.window = window
+        self.forward_transform = None
+        scale = self.filter_length / self.hop_length
+        fourier_basis = np.fft.fft(np.eye(self.filter_length))
+
+        cutoff = int((self.filter_length / 2 + 1))
+        fourier_basis = np.vstack(
+            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
+        )
+
+        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+        inverse_basis = torch.FloatTensor(
+            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
+        )
+
+        if window is not None:
+            assert win_length >= filter_length
+            # get window and zero center pad it to filter_length
+            fft_window = get_window(window, win_length, fftbins=True)
+            fft_window = pad_center(fft_window, size=filter_length)
+            fft_window = torch.from_numpy(fft_window).float()
+
+            # window the bases
+            forward_basis *= fft_window
+            inverse_basis *= fft_window
+
+        self.register_buffer("forward_basis", forward_basis.float())
+        self.register_buffer("inverse_basis", inverse_basis.float())
+
+    def transform(self, input_data):
+        num_batches = input_data.size(0)
+        num_samples = input_data.size(1)
+
+        self.num_samples = num_samples
+
+        # similar to librosa, reflect-pad the input
+        input_data = input_data.view(num_batches, 1, num_samples)
+        input_data = F.pad(
+            input_data.unsqueeze(1),
+            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
+            mode="reflect",
+        )
+        input_data = input_data.squeeze(1)
+
+        forward_transform = F.conv1d(
+            input_data,
+            Variable(self.forward_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0,
+        )
+
+        cutoff = int((self.filter_length / 2) + 1)
+        real_part = forward_transform[:, :cutoff, :]
+        imag_part = forward_transform[:, cutoff:, :]
+
+        magnitude = torch.sqrt(real_part**2 + imag_part**2)
+        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
+
+        return magnitude, phase
+
+    def inverse(self, magnitude, phase):
+        recombine_magnitude_phase = torch.cat(
+            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
+        )
+
+        inverse_transform = F.conv_transpose1d(
+            recombine_magnitude_phase,
+            Variable(self.inverse_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0,
+        )
+
+        if self.window is not None:
+            window_sum = window_sumsquare(
+                self.window,
+                magnitude.size(-1),
+                hop_length=self.hop_length,
+                win_length=self.win_length,
+                n_fft=self.filter_length,
+                dtype=np.float32,
+            )
+            # remove modulation effects
+            approx_nonzero_indices = torch.from_numpy(
+                np.where(window_sum > tiny(window_sum))[0]
+            )
+            window_sum = torch.autograd.Variable(
+                torch.from_numpy(window_sum), requires_grad=False
+            )
+            window_sum = window_sum.to(magnitude.device)
+            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
+                approx_nonzero_indices
+            ]
+
+            # scale by hop ratio
+            inverse_transform *= float(self.filter_length) / self.hop_length
+
+        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
+        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
+
+        return inverse_transform
+
+    def forward(self, input_data):
+        self.magnitude, self.phase = self.transform(input_data)
+        reconstruction = self.inverse(self.magnitude, self.phase)
+        return reconstruction
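A minimal usage sketch for the module above (parameter values are illustrative assumptions, not the Space's shipped config): `mel_spectrogram` expects a batch of waveforms in [-1, 1] and returns a log-compressed mel spectrogram.

```python
# Compute a mel spectrogram from one second of synthetic audio; all
# hyperparameters here are assumed defaults for illustration only.
import torch
from audio_processing import TacotronSTFT

stft = TacotronSTFT(
    filter_length=1024,
    hop_length=256,
    win_length=1024,
    n_mel_channels=80,
    sampling_rate=22050,
    mel_fmin=0.0,
    mel_fmax=8000.0,
)

audio = torch.rand(1, 22050) * 2 - 1  # B x T waveform in [-1, 1]
mel = stft.mel_spectrogram(audio)     # B x n_mel_channels x n_frames
print(mel.shape)                      # roughly torch.Size([1, 80, 87])
```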
autoregressive_flow.py
ADDED
@@ -0,0 +1,259 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# AR_Back_Step and AR_Step based on implementation from
+# https://github.com/NVIDIA/flowtron/blob/master/flowtron.py
+# Original license text:
+###############################################################################
+#
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+# Original Author and Contact: Rafael Valle
+# Modification by Rafael Valle
+
+import torch
+from torch import nn
+from common import DenseLayer, SplineTransformationLayerAR
+
+
+class AR_Back_Step(torch.nn.Module):
+    def __init__(
+        self,
+        n_attr_channels,
+        n_speaker_dim,
+        n_text_dim,
+        n_hidden,
+        n_lstm_layers,
+        scaling_fn,
+        spline_flow_params=None,
+    ):
+        super(AR_Back_Step, self).__init__()
+        self.ar_step = AR_Step(
+            n_attr_channels,
+            n_speaker_dim,
+            n_text_dim,
+            n_hidden,
+            n_lstm_layers,
+            scaling_fn,
+            spline_flow_params,
+        )
+
+    def forward(self, mel, context, lens):
+        mel = torch.flip(mel, (0,))
+        context = torch.flip(context, (0,))
+        # backwards flow, send padded zeros back to end
+        for k in range(mel.size(1)):
+            mel[:, k] = mel[:, k].roll(lens[k].item(), dims=0)
+            context[:, k] = context[:, k].roll(lens[k].item(), dims=0)
+
+        mel, log_s = self.ar_step(mel, context, lens)
+
+        # move padded zeros back to beginning
+        for k in range(mel.size(1)):
+            mel[:, k] = mel[:, k].roll(-lens[k].item(), dims=0)
+
+        return torch.flip(mel, (0,)), log_s
+
+    def infer(self, residual, context):
+        residual = self.ar_step.infer(
+            torch.flip(residual, (0,)), torch.flip(context, (0,))
+        )
+        residual = torch.flip(residual, (0,))
+        return residual
+
+
+class AR_Step(torch.nn.Module):
+    def __init__(
+        self,
+        n_attr_channels,
+        n_speaker_dim,
+        n_text_channels,
+        n_hidden,
+        n_lstm_layers,
+        scaling_fn,
+        spline_flow_params=None,
+    ):
+        super(AR_Step, self).__init__()
+        if spline_flow_params is not None:
+            self.spline_flow = SplineTransformationLayerAR(**spline_flow_params)
+        else:
+            self.n_out_dims = n_attr_channels
+            self.conv = torch.nn.Conv1d(n_hidden, 2 * n_attr_channels, 1)
+            self.conv.weight.data = 0.0 * self.conv.weight.data
+            self.conv.bias.data = 0.0 * self.conv.bias.data
+
+        self.attr_lstm = torch.nn.LSTM(n_attr_channels, n_hidden)
+        self.lstm = torch.nn.LSTM(
+            n_hidden + n_text_channels + n_speaker_dim, n_hidden, n_lstm_layers
+        )
+
+        if spline_flow_params is None:
+            self.dense_layer = DenseLayer(in_dim=n_hidden, sizes=[n_hidden, n_hidden])
+        self.scaling_fn = scaling_fn
+
+    def run_padded_sequence(
+        self, sorted_idx, unsort_idx, lens, padded_data, recurrent_model
+    ):
+        """Sorts input data by provided ordering (and un-ordering) and runs the
+        packed data through the recurrent model
+
+        Args:
+            sorted_idx (torch.tensor): 1D sorting index
+            unsort_idx (torch.tensor): 1D unsorting index (inverse sorted_idx)
+            lens: lengths of input data (sorted in descending order)
+            padded_data (torch.tensor): input sequences (padded)
+            recurrent_model (nn.Module): recurrent model to run data through
+        Returns:
+            hidden_vectors (torch.tensor): outputs of the RNN, in the original,
+            unsorted, ordering
+        """
+
+        # sort the data by decreasing length using provided index
+        # we assume batch index is in dim=1
+        padded_data = padded_data[:, sorted_idx]
+        padded_data = nn.utils.rnn.pack_padded_sequence(padded_data, lens.cpu())
+        hidden_vectors = recurrent_model(padded_data)[0]
+        hidden_vectors, _ = nn.utils.rnn.pad_packed_sequence(hidden_vectors)
+        # unsort the results at dim=1 and return
+        hidden_vectors = hidden_vectors[:, unsort_idx]
+        return hidden_vectors
+
+    def get_scaling_and_logs(self, scale_unconstrained):
+        if self.scaling_fn == "translate":
+            s = torch.exp(scale_unconstrained * 0)
+            log_s = scale_unconstrained * 0
+        elif self.scaling_fn == "exp":
+            s = torch.exp(scale_unconstrained)
+            log_s = scale_unconstrained  # log(exp
+        elif self.scaling_fn == "tanh":
+            s = torch.tanh(scale_unconstrained) + 1 + 1e-6
+            log_s = torch.log(s)
+        elif self.scaling_fn == "sigmoid":
+            s = torch.sigmoid(scale_unconstrained + 10) + 1e-6
+            log_s = torch.log(s)
+        else:
+            raise Exception("Scaling fn {} not supp.".format(self.scaling_fn))
+
+        return s, log_s
+
+    def forward(self, mel, context, lens):
+        dummy = torch.FloatTensor(1, mel.size(1), mel.size(2)).zero_()
+        dummy = dummy.type(mel.type())
+        # seq_len x batch x dim
+        mel0 = torch.cat([dummy, mel[:-1]], 0)
+
+        self.lstm.flatten_parameters()
+        self.attr_lstm.flatten_parameters()
+        if lens is not None:
+            # collect decreasing length indices
+            lens, ids = torch.sort(lens, descending=True)
+            original_ids = [0] * lens.size(0)
+            for i, ids_i in enumerate(ids):
+                original_ids[ids_i] = i
+            # mel_seq_len x batch x hidden_dim
+            mel_hidden = self.run_padded_sequence(
+                ids, original_ids, lens, mel0, self.attr_lstm
+            )
+        else:
+            mel_hidden = self.attr_lstm(mel0)[0]
+
+        decoder_input = torch.cat((mel_hidden, context), -1)
+
|
192 |
+
if lens is not None:
|
193 |
+
# reorder, run padded sequence and undo reordering
|
194 |
+
lstm_hidden = self.run_padded_sequence(
|
195 |
+
ids, original_ids, lens, decoder_input, self.lstm
|
196 |
+
)
|
197 |
+
else:
|
198 |
+
lstm_hidden = self.lstm(decoder_input)[0]
|
199 |
+
|
200 |
+
if hasattr(self, "spline_flow"):
|
201 |
+
# spline flow fn expects inputs to be batch, channel, time
|
202 |
+
lstm_hidden = lstm_hidden.permute(1, 2, 0)
|
203 |
+
mel = mel.permute(1, 2, 0)
|
204 |
+
mel, log_s = self.spline_flow(mel, lstm_hidden, inverse=False)
|
205 |
+
mel = mel.permute(2, 0, 1)
|
206 |
+
log_s = log_s.permute(2, 0, 1)
|
207 |
+
else:
|
208 |
+
lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0)
|
209 |
+
decoder_output = self.conv(lstm_hidden).permute(2, 0, 1)
|
210 |
+
|
211 |
+
scale, log_s = self.get_scaling_and_logs(
|
212 |
+
decoder_output[:, :, : self.n_out_dims]
|
213 |
+
)
|
214 |
+
bias = decoder_output[:, :, self.n_out_dims :]
|
215 |
+
|
216 |
+
mel = scale * mel + bias
|
217 |
+
|
218 |
+
return mel, log_s
|
219 |
+
|
220 |
+
def infer(self, residual, context):
|
221 |
+
total_output = [] # seems 10FPS faster than pre-allocation
|
222 |
+
|
223 |
+
output = None
|
224 |
+
dummy = torch.cuda.FloatTensor(1, residual.size(1), residual.size(2)).zero_()
|
225 |
+
self.attr_lstm.flatten_parameters()
|
226 |
+
|
227 |
+
for i in range(0, residual.size(0)):
|
228 |
+
if i == 0:
|
229 |
+
output = dummy
|
230 |
+
mel_hidden, (h, c) = self.attr_lstm(output)
|
231 |
+
else:
|
232 |
+
mel_hidden, (h, c) = self.attr_lstm(output, (h, c))
|
233 |
+
|
234 |
+
decoder_input = torch.cat((mel_hidden, context[i][None]), -1)
|
235 |
+
|
236 |
+
if i == 0:
|
237 |
+
lstm_hidden, (h1, c1) = self.lstm(decoder_input)
|
238 |
+
else:
|
239 |
+
lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1))
|
240 |
+
|
241 |
+
if hasattr(self, "spline_flow"):
|
242 |
+
# expects inputs to be batch, channel, time
|
243 |
+
lstm_hidden = lstm_hidden.permute(1, 2, 0)
|
244 |
+
output = residual[i : i + 1].permute(1, 2, 0)
|
245 |
+
output = self.spline_flow(output, lstm_hidden, inverse=True)
|
246 |
+
output = output.permute(2, 0, 1)
|
247 |
+
else:
|
248 |
+
lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0)
|
249 |
+
decoder_output = self.conv(lstm_hidden).permute(2, 0, 1)
|
250 |
+
|
251 |
+
s, log_s = self.get_scaling_and_logs(
|
252 |
+
decoder_output[:, :, : decoder_output.size(2) // 2]
|
253 |
+
)
|
254 |
+
b = decoder_output[:, :, decoder_output.size(2) // 2 :]
|
255 |
+
output = (residual[i : i + 1] - b) / s
|
256 |
+
total_output.append(output)
|
257 |
+
|
258 |
+
total_output = torch.cat(total_output, 0)
|
259 |
+
return total_output
|
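The affine branch of AR_Step is invertible by construction: the forward (training) direction applies z' = s * z + b per frame, and inference solves back z = (z' - b) / s. A minimal standalone sketch of that identity, using plain tensors rather than the repo's modules:

import torch

torch.manual_seed(0)
z = torch.randn(4, 80)       # stand-in for mel frames
s = torch.rand(4, 80) + 0.5  # positive scales, as the "exp"/"tanh" scaling fns guarantee
b = torch.randn(4, 80)       # biases

z_fwd = s * z + b            # what AR_Step.forward does in the affine branch
z_inv = (z_fwd - b) / s      # what AR_Step.infer does in the affine branch
assert torch.allclose(z, z_inv, atol=1e-6)

# The flow's log-determinant contribution is sum(log s), which is why
# get_scaling_and_logs returns log_s alongside s.
print(torch.log(s).sum())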
common.py
ADDED
@@ -0,0 +1,1083 @@
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# 1x1InvertibleConv and WN based on implementation from WaveGlow https://github.com/NVIDIA/waveglow/blob/master/glow.py
# Original license:
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

import torch
from torch import nn
from torch.nn import functional as F

import numpy as np
import ast

from splines import (
    piecewise_linear_transform,
    piecewise_linear_inverse_transform,
    unbounded_piecewise_quadratic_transform,
)
from partialconv1d import PartialConv1d as pconv1d
from typing import Tuple

use_cuda = torch.cuda.is_available()

if use_cuda:
    device = "cuda"
else:
    device = "cpu"


def update_params(config, params):
    for param in params:
        print(param)
        k, v = param.split("=")
        try:
            v = ast.literal_eval(v)
        except:
            pass

        k_split = k.split(".")
        if len(k_split) > 1:
            parent_k = k_split[0]
            cur_param = [".".join(k_split[1:]) + "=" + str(v)]
            update_params(config[parent_k], cur_param)
        elif k in config and len(k_split) == 1:
            print(f"overriding {k} with {v}")
            config[k] = v
        else:
            print("{}, {} params not updated".format(k, v))


def get_mask_from_lengths(lengths):
    """Constructs binary mask from a 1D torch tensor of input lengths

    Args:
        lengths (torch.tensor): 1D tensor
    Returns:
        mask (torch.tensor): num_sequences x max_length binary tensor
    """
    max_len = torch.max(lengths).item()
    if torch.cuda.is_available():
        ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
    else:
        ids = torch.arange(0, max_len, out=torch.LongTensor(max_len))
    mask = (ids < lengths.unsqueeze(1)).bool()
    return mask

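A minimal usage sketch for get_mask_from_lengths, assuming a CPU-only environment (on a CUDA machine the function builds its index on the GPU, so the lengths tensor would need to live there too):

import torch
from common import get_mask_from_lengths

mask = get_mask_from_lengths(torch.tensor([3, 1, 2]))
print(mask)
# tensor([[ True,  True,  True],
#         [ True, False, False],
#         [ True,  True, False]])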
class ExponentialClass(torch.nn.Module):
    def __init__(self):
        super(ExponentialClass, self).__init__()

    def forward(self, x):
        return torch.exp(x)


class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain="linear"):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain)
        )

    def forward(self, x):
        return self.linear_layer(x)


class ConvNorm(torch.nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=None,
        dilation=1,
        bias=True,
        w_init_gain="linear",
        use_partial_padding=False,
        use_weight_norm=False,
    ):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.use_partial_padding = use_partial_padding
        self.use_weight_norm = use_weight_norm
        conv_fn = torch.nn.Conv1d
        if self.use_partial_padding:
            conv_fn = pconv1d
        self.conv = conv_fn(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)
        )
        if self.use_weight_norm:
            self.conv = nn.utils.weight_norm(self.conv)

    def forward(self, signal, mask=None):
        if self.use_partial_padding:
            conv_signal = self.conv(signal, mask)
        else:
            conv_signal = self.conv(signal)
        if mask is not None:
            # always re-zero output if mask is
            # available to match zero-padding
            conv_signal = conv_signal * mask
        return conv_signal


class DenseLayer(nn.Module):
    def __init__(self, in_dim=1024, sizes=[1024, 1024]):
        super(DenseLayer, self).__init__()
        in_sizes = [in_dim] + sizes[:-1]
        self.layers = nn.ModuleList(
            [
                LinearNorm(in_size, out_size, bias=True)
                for (in_size, out_size) in zip(in_sizes, sizes)
            ]
        )

    def forward(self, x):
        for linear in self.layers:
            x = torch.tanh(linear(x))
        return x


class LengthRegulator(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, dur):
        output = []
        for x_i, dur_i in zip(x, dur):
            expanded = self.expand(x_i, dur_i)
            output.append(expanded)
        output = self.pad(output)
        return output

    def expand(self, x, dur):
        output = []
        for i, frame in enumerate(x):
            expanded_len = int(dur[i] + 0.5)
            expanded = frame.expand(expanded_len, -1)
            output.append(expanded)
        output = torch.cat(output, 0)
        return output

    def pad(self, x):
        output = []
        max_len = max([x[i].size(0) for i in range(len(x))])
        for i, seq in enumerate(x):
            padded = F.pad(seq, [0, 0, 0, max_len - seq.size(0)], "constant", 0.0)
            output.append(padded)
        output = torch.stack(output)
        return output

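A minimal sketch of what LengthRegulator does, assuming common.py is importable: each input frame is repeated round(duration) times along the time axis, then sequences in the batch are padded to a common length.

import torch
from common import LengthRegulator

lr = LengthRegulator()
x = torch.arange(6, dtype=torch.float).reshape(1, 3, 2)  # B=1, T=3, C=2
dur = torch.tensor([[1.0, 2.0, 0.6]])                    # per-frame durations
out = lr(x, dur)
print(out.shape)  # torch.Size([1, 4, 2]): int(1.5) + int(2.5) + int(1.1) = 4 frames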
class ConvLSTMLinear(nn.Module):
    def __init__(
        self,
        in_dim,
        out_dim,
        n_layers=2,
        n_channels=256,
        kernel_size=3,
        p_dropout=0.1,
        lstm_type="bilstm",
        use_linear=True,
    ):
        super(ConvLSTMLinear, self).__init__()
        self.out_dim = out_dim
        self.lstm_type = lstm_type
        self.use_linear = use_linear
        self.dropout = nn.Dropout(p=p_dropout)

        convolutions = []
        for i in range(n_layers):
            conv_layer = ConvNorm(
                in_dim if i == 0 else n_channels,
                n_channels,
                kernel_size=kernel_size,
                stride=1,
                padding=int((kernel_size - 1) / 2),
                dilation=1,
                w_init_gain="relu",
            )
            conv_layer = torch.nn.utils.weight_norm(conv_layer.conv, name="weight")
            convolutions.append(conv_layer)

        self.convolutions = nn.ModuleList(convolutions)

        if not self.use_linear:
            n_channels = out_dim

        if self.lstm_type != "":
            use_bilstm = False
            lstm_channels = n_channels
            if self.lstm_type == "bilstm":
                use_bilstm = True
                lstm_channels = int(n_channels // 2)

            self.bilstm = nn.LSTM(
                n_channels, lstm_channels, 1, batch_first=True, bidirectional=use_bilstm
            )
            lstm_norm_fn_pntr = nn.utils.spectral_norm
            self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0")
            if self.lstm_type == "bilstm":
                self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0_reverse")

        if self.use_linear:
            self.dense = nn.Linear(n_channels, out_dim)

    def run_padded_sequence(self, context, lens):
        context_embedded = []
        for b_ind in range(context.size()[0]):  # TODO: speed up
            curr_context = context[b_ind : b_ind + 1, :, : lens[b_ind]].clone()
            for conv in self.convolutions:
                curr_context = self.dropout(F.relu(conv(curr_context)))
            context_embedded.append(curr_context[0].transpose(0, 1))
        context = torch.nn.utils.rnn.pad_sequence(context_embedded, batch_first=True)
        return context

    def run_unsorted_inputs(self, fn, context, lens):
        lens_sorted, ids_sorted = torch.sort(lens, descending=True)
        unsort_ids = [0] * lens.size(0)
        for i in range(len(ids_sorted)):
            unsort_ids[ids_sorted[i]] = i
        lens_sorted = lens_sorted.long().cpu()

        context = context[ids_sorted]
        context = nn.utils.rnn.pack_padded_sequence(
            context, lens_sorted, batch_first=True
        )
        context = fn(context)[0]
        context = nn.utils.rnn.pad_packed_sequence(context, batch_first=True)[0]

        # map back to original indices
        context = context[unsort_ids]
        return context

    def forward(self, context, lens):
        if context.size()[0] > 1:
            context = self.run_padded_sequence(context, lens)
            # to B, D, T
            context = context.transpose(1, 2)
        else:
            for conv in self.convolutions:
                context = self.dropout(F.relu(conv(context)))

        if self.lstm_type != "":
            context = context.transpose(1, 2)
            self.bilstm.flatten_parameters()
            if lens is not None:
                context = self.run_unsorted_inputs(self.bilstm, context, lens)
            else:
                context = self.bilstm(context)[0]
            context = context.transpose(1, 2)

        x_hat = context
        if self.use_linear:
            x_hat = self.dense(context.transpose(1, 2)).transpose(1, 2)

        return x_hat

    def infer(self, z, txt_enc, spk_emb):
        x_hat = self.forward(txt_enc, spk_emb)["x_hat"]
        x_hat = self.feature_processing.denormalize(x_hat)
        return x_hat


class Encoder(nn.Module):
    """Encoder module:
    - Three 1-d convolution banks
    - Bidirectional LSTM
    """

    def __init__(
        self,
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,
        encoder_kernel_size=5,
        norm_fn=nn.BatchNorm1d,
        lstm_norm_fn=None,
    ):
        super(Encoder, self).__init__()

        convolutions = []
        for _ in range(encoder_n_convolutions):
            conv_layer = nn.Sequential(
                ConvNorm(
                    encoder_embedding_dim,
                    encoder_embedding_dim,
                    kernel_size=encoder_kernel_size,
                    stride=1,
                    padding=int((encoder_kernel_size - 1) / 2),
                    dilation=1,
                    w_init_gain="relu",
                    use_partial_padding=True,
                ),
                norm_fn(encoder_embedding_dim, affine=True),
            )
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(
            encoder_embedding_dim,
            int(encoder_embedding_dim / 2),
            1,
            batch_first=True,
            bidirectional=True,
        )
        if lstm_norm_fn is not None:
            if "spectral" in lstm_norm_fn:
                print("Applying spectral norm to text encoder LSTM")
                lstm_norm_fn_pntr = torch.nn.utils.spectral_norm
            elif "weight" in lstm_norm_fn:
                print("Applying weight norm to text encoder LSTM")
                lstm_norm_fn_pntr = torch.nn.utils.weight_norm
            self.lstm = lstm_norm_fn_pntr(self.lstm, "weight_hh_l0")
            self.lstm = lstm_norm_fn_pntr(self.lstm, "weight_hh_l0_reverse")

    @torch.autocast(device, enabled=False)
    def forward(self, x, in_lens):
        """
        Args:
            x (torch.tensor): N x C x L padded input of text embeddings
            in_lens (torch.tensor): 1D tensor of sequence lengths
        """
        if x.size()[0] > 1:
            x_embedded = []
            for b_ind in range(x.size()[0]):  # TODO: improve speed
                curr_x = x[b_ind : b_ind + 1, :, : in_lens[b_ind]].clone()
                for conv in self.convolutions:
                    curr_x = F.dropout(F.relu(conv(curr_x)), 0.5, self.training)
                x_embedded.append(curr_x[0].transpose(0, 1))
            x = torch.nn.utils.rnn.pad_sequence(x_embedded, batch_first=True)
        else:
            for conv in self.convolutions:
                x = F.dropout(F.relu(conv(x)), 0.5, self.training)
            x = x.transpose(1, 2)

        # recent amp change -- change in_lens to int
        in_lens = in_lens.int().cpu()

        x = nn.utils.rnn.pack_padded_sequence(x, in_lens, batch_first=True)

        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)

        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

        return outputs

    @torch.autocast(device, enabled=False)
    def infer(self, x):
        for conv in self.convolutions:
            x = F.dropout(F.relu(conv(x)), 0.5, self.training)

        x = x.transpose(1, 2)
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)

        return outputs

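A minimal shape sketch for the text Encoder, assuming common.py is importable and lengths are already sorted in descending order (pack_padded_sequence is called with its default enforce_sorted=True):

import torch
from common import Encoder

enc = Encoder(encoder_n_convolutions=1, encoder_embedding_dim=8, encoder_kernel_size=3)
x = torch.randn(2, 8, 5)            # N x C x L padded text embeddings
out = enc(x, torch.tensor([5, 3]))  # lengths sorted descending
print(out.shape)                    # torch.Size([2, 5, 8])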
class Invertible1x1ConvLUS(torch.nn.Module):
    def __init__(self, c, cache_inverse=False):
        super(Invertible1x1ConvLUS, self).__init__()
        # Sample a random orthonormal matrix to initialize weights
        W = torch.linalg.qr(torch.FloatTensor(c, c).normal_())[0]
        # Ensure determinant is 1.0 not -1.0
        if torch.det(W) < 0:
            W[:, 0] = -1 * W[:, 0]
        p, lower, upper = torch.lu_unpack(*torch.lu(W))

        self.register_buffer("p", p)
        # diagonals of lower will always be 1s anyway
        lower = torch.tril(lower, -1)
        lower_diag = torch.diag(torch.eye(c, c))
        self.register_buffer("lower_diag", lower_diag)
        self.lower = nn.Parameter(lower)
        self.upper_diag = nn.Parameter(torch.diag(upper))
        self.upper = nn.Parameter(torch.triu(upper, 1))
        self.cache_inverse = cache_inverse

    @torch.autocast(device, enabled=False)
    def forward(self, z, inverse=False):
        U = torch.triu(self.upper, 1) + torch.diag(self.upper_diag)
        L = torch.tril(self.lower, -1) + torch.diag(self.lower_diag)
        W = torch.mm(self.p, torch.mm(L, U))
        if inverse:
            if not hasattr(self, "W_inverse"):
                # inverse computation
                W_inverse = W.float().inverse()
                if z.type() == "torch.cuda.HalfTensor":
                    W_inverse = W_inverse.half()

                self.W_inverse = W_inverse[..., None]
            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
            if not self.cache_inverse:
                delattr(self, "W_inverse")
            return z
        else:
            W = W[..., None]
            z = F.conv1d(z, W, bias=None, stride=1, padding=0)
            log_det_W = torch.sum(torch.log(torch.abs(self.upper_diag)))
            return z, log_det_W


class Invertible1x1Conv(torch.nn.Module):
    """
    The layer outputs both the convolution, and the log determinant
    of its weight matrix. If inverse=True it does convolution with
    inverse
    """

    def __init__(self, c, cache_inverse=False):
        super(Invertible1x1Conv, self).__init__()
        self.conv = torch.nn.Conv1d(
            c, c, kernel_size=1, stride=1, padding=0, bias=False
        )

        # Sample a random orthonormal matrix to initialize weights
        W = torch.qr(torch.FloatTensor(c, c).normal_())[0]

        # Ensure determinant is 1.0 not -1.0
        if torch.det(W) < 0:
            W[:, 0] = -1 * W[:, 0]
        W = W.view(c, c, 1)
        self.conv.weight.data = W
        self.cache_inverse = cache_inverse

    def forward(self, z, inverse=False):
        # DO NOT apply n_of_groups, as it doesn't account for padded sequences
        W = self.conv.weight.squeeze()

        if inverse:
            if not hasattr(self, "W_inverse"):
                # Inverse computation
                W_inverse = W.float().inverse()
                if z.type() == "torch.cuda.HalfTensor":
                    W_inverse = W_inverse.half()

                self.W_inverse = W_inverse[..., None]
            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
            if not self.cache_inverse:
                delattr(self, "W_inverse")
            return z
        else:
            # Forward computation
            log_det_W = torch.logdet(W).clone()
            z = self.conv(z)
            return z, log_det_W

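A minimal invertibility check for the LU-decomposed 1x1 convolution, assuming common.py is importable and a PyTorch version where the torch.lu call in __init__ is still available:

import torch
from common import Invertible1x1ConvLUS

conv = Invertible1x1ConvLUS(4)
z = torch.randn(2, 4, 7)        # batch x channels x time
y, log_det_W = conv(z)          # forward pass also yields log|det W|
z_rec = conv(y, inverse=True)   # applies W^-1 as a 1x1 convolution
print(torch.allclose(z, z_rec, atol=1e-5))  # True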
class SimpleConvNet(torch.nn.Module):
    def __init__(
        self,
        n_mel_channels,
        n_context_dim,
        final_out_channels,
        n_layers=2,
        kernel_size=5,
        with_dilation=True,
        max_channels=1024,
        zero_init=True,
        use_partial_padding=True,
    ):
        super(SimpleConvNet, self).__init__()
        self.layers = torch.nn.ModuleList()
        self.n_layers = n_layers
        in_channels = n_mel_channels + n_context_dim
        out_channels = -1
        self.use_partial_padding = use_partial_padding
        for i in range(n_layers):
            dilation = 2**i if with_dilation else 1
            padding = int((kernel_size * dilation - dilation) / 2)
            out_channels = min(max_channels, in_channels * 2)
            self.layers.append(
                ConvNorm(
                    in_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    stride=1,
                    padding=padding,
                    dilation=dilation,
                    bias=True,
                    w_init_gain="relu",
                    use_partial_padding=use_partial_padding,
                )
            )
            in_channels = out_channels

        self.last_layer = torch.nn.Conv1d(
            out_channels, final_out_channels, kernel_size=1
        )

        if zero_init:
            self.last_layer.weight.data *= 0
            self.last_layer.bias.data *= 0

    def forward(self, z_w_context, seq_lens: torch.Tensor = None):
        # seq_lens: tensor of sequence lengths
        # output should be b x n_mel_channels x z_w_context.shape(2)
        mask = None
        if seq_lens is not None:
            mask = get_mask_from_lengths(seq_lens).unsqueeze(1).float()

        for i in range(self.n_layers):
            z_w_context = self.layers[i](z_w_context, mask)
            z_w_context = torch.relu(z_w_context)

        z_w_context = self.last_layer(z_w_context)
        return z_w_context


class WN(torch.nn.Module):
    """
    Adapted from WN() module in WaveGlow with modifications to variable names
    """

    def __init__(
        self,
        n_in_channels,
        n_context_dim,
        n_layers,
        n_channels,
        kernel_size=5,
        affine_activation="softplus",
        use_partial_padding=True,
    ):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        assert n_channels % 2 == 0
        self.n_layers = n_layers
        self.n_channels = n_channels
        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        start = torch.nn.Conv1d(n_in_channels + n_context_dim, n_channels, 1)
        start = torch.nn.utils.weight_norm(start, name="weight")
        self.start = start
        self.softplus = torch.nn.Softplus()
        self.affine_activation = affine_activation
        self.use_partial_padding = use_partial_padding
        # Initializing last layer to 0 makes the affine coupling layers
        # do nothing at first. This helps with training stability
        end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1)
        end.weight.data.zero_()
        end.bias.data.zero_()
        self.end = end

        for i in range(n_layers):
            dilation = 2**i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = ConvNorm(
                n_channels,
                n_channels,
                kernel_size=kernel_size,
                dilation=dilation,
                padding=padding,
                use_partial_padding=use_partial_padding,
                use_weight_norm=True,
            )
            # in_layer = nn.Conv1d(n_channels, n_channels, kernel_size,
            #                      dilation=dilation, padding=padding)
            # in_layer = nn.utils.weight_norm(in_layer)
            self.in_layers.append(in_layer)
            res_skip_layer = nn.Conv1d(n_channels, n_channels, 1)
            res_skip_layer = nn.utils.weight_norm(res_skip_layer)
            self.res_skip_layers.append(res_skip_layer)

    def forward(
        self,
        forward_input: Tuple[torch.Tensor, torch.Tensor],
        seq_lens: torch.Tensor = None,
    ):
        z, context = forward_input
        z = torch.cat((z, context), 1)  # append context to z as well
        z = self.start(z)
        output = torch.zeros_like(z)
        mask = None
        if seq_lens is not None:
            mask = get_mask_from_lengths(seq_lens).unsqueeze(1).float()
        non_linearity = torch.relu
        if self.affine_activation == "softplus":
            non_linearity = self.softplus

        for i in range(self.n_layers):
            z = non_linearity(self.in_layers[i](z, mask))
            res_skip_acts = non_linearity(self.res_skip_layers[i](z))
            output = output + res_skip_acts

        output = self.end(output)  # [B, dim, seq_len]
        return output

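A minimal shape sketch for WN, assuming common.py and partialconv1d.py are importable; because the end layer is zero-initialized, the predicted coupling parameters are exactly zero at initialization:

import torch
from common import WN

wn = WN(n_in_channels=4, n_context_dim=3, n_layers=2, n_channels=8, kernel_size=3)
z = torch.randn(2, 4, 6)
context = torch.randn(2, 3, 6)
out = wn((z, context))  # no seq_lens -> no masking
print(out.shape)        # torch.Size([2, 8, 6]), all zeros at init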
# Affine Coupling Layers
class SplineTransformationLayerAR(torch.nn.Module):
    def __init__(
        self,
        n_in_channels,
        n_context_dim,
        n_layers,
        affine_model="simple_conv",
        kernel_size=1,
        scaling_fn="exp",
        affine_activation="softplus",
        n_channels=1024,
        n_bins=8,
        left=-6,
        right=6,
        bottom=-6,
        top=6,
        use_quadratic=False,
    ):
        super(SplineTransformationLayerAR, self).__init__()
        self.n_in_channels = n_in_channels  # input dimensions
        self.left = left
        self.right = right
        self.bottom = bottom
        self.top = top
        self.n_bins = n_bins
        self.spline_fn = piecewise_linear_transform
        self.inv_spline_fn = piecewise_linear_inverse_transform
        self.use_quadratic = use_quadratic

        if self.use_quadratic:
            self.spline_fn = unbounded_piecewise_quadratic_transform
            self.inv_spline_fn = unbounded_piecewise_quadratic_transform
            self.n_bins = 2 * self.n_bins + 1
        final_out_channels = self.n_in_channels * self.n_bins

        # autoregressive flow, kernel size 1 and no dilation
        self.param_predictor = SimpleConvNet(
            n_context_dim,
            0,
            final_out_channels,
            n_layers,
            with_dilation=False,
            kernel_size=1,
            zero_init=True,
            use_partial_padding=False,
        )

        # output is unnormalized bin weights

    def normalize(self, z, inverse):
        # normalize to [0, 1]
        if inverse:
            z = (z - self.bottom) / (self.top - self.bottom)
        else:
            z = (z - self.left) / (self.right - self.left)

        return z

    def denormalize(self, z, inverse):
        if inverse:
            z = z * (self.right - self.left) + self.left
        else:
            z = z * (self.top - self.bottom) + self.bottom

        return z

    def forward(self, z, context, inverse=False):
        b_s, c_s, t_s = z.size(0), z.size(1), z.size(2)

        z = self.normalize(z, inverse)

        if z.min() < 0.0 or z.max() > 1.0:
            print("spline z scaled beyond [0, 1]", z.min(), z.max())

        z_reshaped = z.permute(0, 2, 1).reshape(b_s * t_s, -1)
        affine_params = self.param_predictor(context)
        q_tilde = affine_params.permute(0, 2, 1).reshape(b_s * t_s, c_s, -1)
        with torch.autocast(device, enabled=False):
            if self.use_quadratic:
                w = q_tilde[:, :, : self.n_bins // 2]
                v = q_tilde[:, :, self.n_bins // 2 :]
                z_tformed, log_s = self.spline_fn(
                    z_reshaped.float(), w.float(), v.float(), inverse=inverse
                )
            else:
                z_tformed, log_s = self.spline_fn(z_reshaped.float(), q_tilde.float())

        z = z_tformed.reshape(b_s, t_s, -1).permute(0, 2, 1)
        z = self.denormalize(z, inverse)
        if inverse:
            return z

        log_s = log_s.reshape(b_s, t_s, -1)
        log_s = log_s.permute(0, 2, 1)
        log_s = log_s + c_s * (
            np.log(self.top - self.bottom) - np.log(self.right - self.left)
        )
        return z, log_s


class SplineTransformationLayer(torch.nn.Module):
    def __init__(
        self,
        n_mel_channels,
        n_context_dim,
        n_layers,
        with_dilation=True,
        kernel_size=5,
        scaling_fn="exp",
        affine_activation="softplus",
        n_channels=1024,
        n_bins=8,
        left=-4,
        right=4,
        bottom=-4,
        top=4,
        use_quadratic=False,
    ):
        super(SplineTransformationLayer, self).__init__()
        self.n_mel_channels = n_mel_channels  # input dimensions
        self.half_mel_channels = int(n_mel_channels / 2)  # half, because we split
        self.left = left
        self.right = right
        self.bottom = bottom
        self.top = top
        self.n_bins = n_bins
        self.spline_fn = piecewise_linear_transform
        self.inv_spline_fn = piecewise_linear_inverse_transform
        self.use_quadratic = use_quadratic

        if self.use_quadratic:
            self.spline_fn = unbounded_piecewise_quadratic_transform
            self.inv_spline_fn = unbounded_piecewise_quadratic_transform
            self.n_bins = 2 * self.n_bins + 1
        final_out_channels = self.half_mel_channels * self.n_bins

        self.param_predictor = SimpleConvNet(
            self.half_mel_channels,
            n_context_dim,
            final_out_channels,
            n_layers,
            with_dilation=with_dilation,
            kernel_size=kernel_size,
            zero_init=False,
        )

        # output is unnormalized bin weights

    def forward(self, z, context, inverse=False, seq_lens=None):
        b_s, c_s, t_s = z.size(0), z.size(1), z.size(2)

        # condition on z_0, transform z_1
        n_half = self.half_mel_channels
        z_0, z_1 = z[:, :n_half], z[:, n_half:]

        # normalize to [0,1]
        if inverse:
            z_1 = (z_1 - self.bottom) / (self.top - self.bottom)
        else:
            z_1 = (z_1 - self.left) / (self.right - self.left)

        z_w_context = torch.cat((z_0, context), 1)
        affine_params = self.param_predictor(z_w_context, seq_lens)
        z_1_reshaped = z_1.permute(0, 2, 1).reshape(b_s * t_s, -1)
        q_tilde = affine_params.permute(0, 2, 1).reshape(b_s * t_s, n_half, self.n_bins)

        with torch.autocast(device, enabled=False):
            if self.use_quadratic:
                w = q_tilde[:, :, : self.n_bins // 2]
                v = q_tilde[:, :, self.n_bins // 2 :]
                z_1_tformed, log_s = self.spline_fn(
                    z_1_reshaped.float(), w.float(), v.float(), inverse=inverse
                )
                if not inverse:
                    log_s = torch.sum(log_s, 1)
            else:
                if inverse:
                    z_1_tformed, _dc = self.inv_spline_fn(
                        z_1_reshaped.float(), q_tilde.float(), False
                    )
                else:
                    z_1_tformed, log_s = self.spline_fn(
                        z_1_reshaped.float(), q_tilde.float()
                    )

        z_1 = z_1_tformed.reshape(b_s, t_s, -1).permute(0, 2, 1)

        # undo [0, 1] normalization
        if inverse:
            z_1 = z_1 * (self.right - self.left) + self.left
            z = torch.cat((z_0, z_1), dim=1)
            return z
        else:  # training
            z_1 = z_1 * (self.top - self.bottom) + self.bottom
            z = torch.cat((z_0, z_1), dim=1)
            log_s = log_s.reshape(b_s, t_s).unsqueeze(1) + n_half * (
                np.log(self.top - self.bottom) - np.log(self.right - self.left)
            )
            return z, log_s

class AffineTransformationLayer(torch.nn.Module):
    def __init__(
        self,
        n_mel_channels,
        n_context_dim,
        n_layers,
        affine_model="simple_conv",
        with_dilation=True,
        kernel_size=5,
        scaling_fn="exp",
        affine_activation="softplus",
        n_channels=1024,
        use_partial_padding=False,
    ):
        super(AffineTransformationLayer, self).__init__()
        if affine_model not in ("wavenet", "simple_conv"):
            raise Exception("{} affine model not supported".format(affine_model))
        if isinstance(scaling_fn, list):
            if not all(
                [x in ("translate", "exp", "tanh", "sigmoid") for x in scaling_fn]
            ):
                raise Exception("{} scaling fn not supported".format(scaling_fn))
        else:
            if scaling_fn not in ("translate", "exp", "tanh", "sigmoid"):
                raise Exception("{} scaling fn not supported".format(scaling_fn))

        self.affine_model = affine_model
        self.scaling_fn = scaling_fn
        if affine_model == "wavenet":
            self.affine_param_predictor = WN(
                int(n_mel_channels / 2),
                n_context_dim,
                n_layers=n_layers,
                n_channels=n_channels,
                affine_activation=affine_activation,
                use_partial_padding=use_partial_padding,
            )
        elif affine_model == "simple_conv":
            self.affine_param_predictor = SimpleConvNet(
                int(n_mel_channels / 2),
                n_context_dim,
                n_mel_channels,
                n_layers,
                with_dilation=with_dilation,
                kernel_size=kernel_size,
                use_partial_padding=use_partial_padding,
            )
        self.n_mel_channels = n_mel_channels

    def get_scaling_and_logs(self, scale_unconstrained):
        if self.scaling_fn == "translate":
            s = torch.exp(scale_unconstrained * 0)
            log_s = scale_unconstrained * 0
        elif self.scaling_fn == "exp":
            s = torch.exp(scale_unconstrained)
            log_s = scale_unconstrained  # log(exp(x)) = x
        elif self.scaling_fn == "tanh":
            s = torch.tanh(scale_unconstrained) + 1 + 1e-6
            log_s = torch.log(s)
        elif self.scaling_fn == "sigmoid":
            s = torch.sigmoid(scale_unconstrained + 10) + 1e-6
            log_s = torch.log(s)
        elif isinstance(self.scaling_fn, list):
            s_list, log_s_list = [], []
            for i in range(scale_unconstrained.shape[1]):
                scaling_i = self.scaling_fn[i]
                if scaling_i == "translate":
                    s_i = torch.exp(scale_unconstrained[:, i] * 0)
                    log_s_i = scale_unconstrained[:, i] * 0
                elif scaling_i == "exp":
                    s_i = torch.exp(scale_unconstrained[:, i])
                    log_s_i = scale_unconstrained[:, i]
                elif scaling_i == "tanh":
                    s_i = torch.tanh(scale_unconstrained[:, i]) + 1 + 1e-6
                    log_s_i = torch.log(s_i)
                elif scaling_i == "sigmoid":
                    s_i = torch.sigmoid(scale_unconstrained[:, i]) + 1e-6
                    log_s_i = torch.log(s_i)
                s_list.append(s_i[:, None])
                log_s_list.append(log_s_i[:, None])
            s = torch.cat(s_list, dim=1)
            log_s = torch.cat(log_s_list, dim=1)
        return s, log_s

    def forward(self, z, context, inverse=False, seq_lens=None):
        n_half = int(self.n_mel_channels / 2)
        z_0, z_1 = z[:, :n_half], z[:, n_half:]
        if self.affine_model == "wavenet":
            affine_params = self.affine_param_predictor(
                (z_0, context), seq_lens=seq_lens
            )
        elif self.affine_model == "simple_conv":
            z_w_context = torch.cat((z_0, context), 1)
            affine_params = self.affine_param_predictor(z_w_context, seq_lens=seq_lens)

        scale_unconstrained = affine_params[:, :n_half, :]
        b = affine_params[:, n_half:, :]
        s, log_s = self.get_scaling_and_logs(scale_unconstrained)

        if inverse:
            z_1 = (z_1 - b) / s
            z = torch.cat((z_0, z_1), dim=1)
            return z
        else:
            z_1 = s * z_1 + b
            z = torch.cat((z_0, z_1), dim=1)
            return z, log_s

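A minimal round-trip sketch for the affine coupling layer, assuming common.py is importable: z_0 passes through unchanged, z_1 is scaled and shifted conditioned on z_0 and the context, and inverse=True undoes the transform.

import torch
from common import AffineTransformationLayer

layer = AffineTransformationLayer(
    n_mel_channels=8, n_context_dim=3, n_layers=1, scaling_fn="tanh"
)
z = torch.randn(2, 8, 5)
context = torch.randn(2, 3, 5)
z_fwd, log_s = layer(z, context)             # forward (training) direction
z_rec = layer(z_fwd, context, inverse=True)  # inference direction
print(torch.allclose(z, z_rec, atol=1e-5))   # True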
class ConvAttention(torch.nn.Module):
    def __init__(
        self, n_mel_channels=80, n_text_channels=512, n_att_channels=80, temperature=1.0
    ):
        super(ConvAttention, self).__init__()
        self.temperature = temperature
        self.softmax = torch.nn.Softmax(dim=3)
        self.log_softmax = torch.nn.LogSoftmax(dim=3)

        self.key_proj = nn.Sequential(
            ConvNorm(
                n_text_channels,
                n_text_channels * 2,
                kernel_size=3,
                bias=True,
                w_init_gain="relu",
            ),
            torch.nn.ReLU(),
            ConvNorm(n_text_channels * 2, n_att_channels, kernel_size=1, bias=True),
        )

        self.query_proj = nn.Sequential(
            ConvNorm(
                n_mel_channels,
                n_mel_channels * 2,
                kernel_size=3,
                bias=True,
                w_init_gain="relu",
            ),
            torch.nn.ReLU(),
            ConvNorm(n_mel_channels * 2, n_mel_channels, kernel_size=1, bias=True),
            torch.nn.ReLU(),
            ConvNorm(n_mel_channels, n_att_channels, kernel_size=1, bias=True),
        )

    def run_padded_sequence(
        self, sorted_idx, unsort_idx, lens, padded_data, recurrent_model
    ):
        """Sorts input data by provided ordering (and un-ordering) and runs the
        packed data through the recurrent model

        Args:
            sorted_idx (torch.tensor): 1D sorting index
            unsort_idx (torch.tensor): 1D unsorting index (inverse of sorted_idx)
            lens: lengths of input data (sorted in descending order)
            padded_data (torch.tensor): input sequences (padded)
            recurrent_model (nn.Module): recurrent model to run data through
        Returns:
            hidden_vectors (torch.tensor): outputs of the RNN, in the original,
            unsorted, ordering
        """

        # sort the data by decreasing length using provided index
        # we assume batch index is in dim=1
        padded_data = padded_data[:, sorted_idx]
        padded_data = nn.utils.rnn.pack_padded_sequence(padded_data, lens)
        hidden_vectors = recurrent_model(padded_data)[0]
        hidden_vectors, _ = nn.utils.rnn.pad_packed_sequence(hidden_vectors)
        # unsort the results at dim=1 and return
        hidden_vectors = hidden_vectors[:, unsort_idx]
        return hidden_vectors

    def forward(
        self, queries, keys, query_lens, mask=None, key_lens=None, attn_prior=None
    ):
        """Attention mechanism for radtts. Unlike in Flowtron, we have no
        restrictions such as causality etc, since we only need this during
        training.

        Args:
            queries (torch.tensor): B x C x T1 tensor (likely mel data)
            keys (torch.tensor): B x C2 x T2 tensor (text data)
            query_lens: lengths for sorting the queries in descending order
            mask (torch.tensor): uint8 binary mask for variable length entries
                (should be in the T2 domain)
        Output:
            attn (torch.tensor): B x 1 x T1 x T2 attention mask.
                Final dim T2 should sum to 1
        """
        temp = 0.0005
        keys_enc = self.key_proj(keys)  # B x n_attn_dims x T2
        # Beware can only do this since query_dim = attn_dim = n_mel_channels
        queries_enc = self.query_proj(queries)

        # Gaussian Isotropic Attention
        # B x n_attn_dims x T1 x T2
        attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2

        # compute log-likelihood from gaussian
        eps = 1e-8
        attn = -temp * attn.sum(1, keepdim=True)
        if attn_prior is not None:
            attn = self.log_softmax(attn) + torch.log(attn_prior[:, None] + eps)

        attn_logprob = attn.clone()

        if mask is not None:
            attn.data.masked_fill_(mask.permute(0, 2, 1).unsqueeze(2), -float("inf"))

        attn = self.softmax(attn)  # softmax along T2
        return attn, attn_logprob
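A minimal shape sketch for ConvAttention, assuming common.py is importable: the soft alignment is B x 1 x T1 x T2 and normalized over the text axis.

import torch
from common import ConvAttention

attn_fn = ConvAttention(n_mel_channels=80, n_text_channels=512, n_att_channels=80)
mel = torch.randn(2, 80, 12)   # queries: B x n_mel_channels x T1
text = torch.randn(2, 512, 7)  # keys:    B x n_text_channels x T2
attn, attn_logprob = attn_fn(mel, text, query_lens=None)
print(attn.shape)              # torch.Size([2, 1, 12, 7])
print(attn.sum(-1)[0, 0, 0])   # ~1.0 after the softmax over T2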
configs/radtts-pp-dap-model.json
ADDED
@@ -0,0 +1,218 @@
1 |
+
{
|
2 |
+
"train_config": {
|
3 |
+
"output_directory": "outdir_pp_model",
|
4 |
+
"epochs": 10000000,
|
5 |
+
"optim_algo": "RAdam",
|
6 |
+
"learning_rate": 0.001,
|
7 |
+
"weight_decay": 1e-06,
|
8 |
+
"sigma": 1.0,
|
9 |
+
"iters_per_checkpoint": 1000,
|
10 |
+
"batch_size": 16,
|
11 |
+
"seed": null,
|
12 |
+
"checkpoint_path": "",
|
13 |
+
"ignore_layers": [],
|
14 |
+
"ignore_layers_warmstart": [],
|
15 |
+
"finetune_layers": [],
|
16 |
+
"include_layers": [],
|
17 |
+
"vocoder_config_path": "models/hifigan_22khz_config.json",
|
18 |
+
"vocoder_checkpoint_path": "models/hifigan_ljs_generator_v1.pt",
|
19 |
+
"log_attribute_samples": true,
|
20 |
+
"log_decoder_samples": true,
|
21 |
+
"warmstart_checkpoint_path": "outdir_pp/model_100000",
|
22 |
+
"use_amp": true,
|
23 |
+
"grad_clip_val": 1.0,
|
24 |
+
"loss_weights": {
|
25 |
+
"blank_logprob": -1,
|
26 |
+
"ctc_loss_weight": 0.1,
|
27 |
+
"binarization_loss_weight": 1.0,
|
28 |
+
"dur_loss_weight": 1.0,
|
29 |
+
"f0_loss_weight": 1.0,
|
30 |
+
"energy_loss_weight": 1.0,
|
31 |
+
"vpred_loss_weight": 1.0
|
32 |
+
},
|
33 |
+
"binarization_start_iter": 0,
|
34 |
+
"kl_loss_start_iter": 0,
|
35 |
+
"unfreeze_modules": "all"
|
36 |
+
},
|
37 |
+
"data_config": {
|
38 |
+
"training_files": {
|
39 |
+
"LJS": {
|
40 |
+
"basedir": "filelists/",
|
41 |
+
"audiodir": "wavs",
|
42 |
+
"filelist": "3speakers_ukrainian_train_filelist_dc.txt",
|
43 |
+
"lmdbpath": ""
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"validation_files": {
|
47 |
+
"LJS": {
|
48 |
+
"basedir": "filelists/",
|
49 |
+
"audiodir": "wavs",
|
50 |
+
"filelist": "3speakers_ukrainian_val_filelist_dc.txt",
|
51 |
+
"lmdbpath": ""
|
52 |
+
}
|
53 |
+
},
|
54 |
+
"dur_min": 0.1,
|
55 |
+
"dur_max": 10.2,
|
56 |
+
"sampling_rate": 22050,
|
57 |
+
"filter_length": 1024,
|
58 |
+
"hop_length": 256,
|
59 |
+
"win_length": 1024,
|
60 |
+
"n_mel_channels": 80,
|
61 |
+
"mel_fmin": 0.0,
|
62 |
+
"mel_fmax": 8000.0,
|
63 |
+
"f0_min": 80.0,
|
64 |
+
"f0_max": 640.0,
|
65 |
+
"max_wav_value": 32768.0,
|
66 |
+
"use_f0": true,
|
67 |
+
"use_log_f0": 0,
|
68 |
+
"use_energy_avg": true,
|
69 |
+
"use_scaled_energy": true,
|
70 |
+
"symbol_set": "ukrainian",
|
71 |
+
"cleaner_names": [
|
72 |
+
"ukrainian_cleaners"
|
73 |
+
],
|
74 |
+
"heteronyms_path": "tts_text_processing/heteronyms",
|
75 |
+
"phoneme_dict_path": "tts_text_processing/cmudict-0.7b",
|
76 |
+
"p_phoneme": 0.0,
|
77 |
+
"handle_phoneme": "word",
|
78 |
+
"handle_phoneme_ambiguous": "ignore",
|
79 |
+
"include_speakers": null,
|
80 |
+
"n_frames": -1,
|
81 |
+
"betabinom_cache_path": "/home/dmytro_chaplinsky/RAD-TTS/radtts-code/cache",
|
82 |
+
"lmdb_cache_path": "",
|
83 |
+
"use_attn_prior_masking": true,
|
84 |
+
"prepend_space_to_text": true,
|
85 |
+
"append_space_to_text": true,
|
86 |
+
"add_bos_eos_to_text": false,
|
87 |
+
"betabinom_scaling_factor": 1.0,
|
88 |
+
"distance_tx_unvoiced": false,
|
89 |
+
"mel_noise_scale": 0.0
|
90 |
+
},
|
91 |
+
"dist_config": {
|
92 |
+
"dist_backend": "nccl",
|
93 |
+
"dist_url": "tcp://localhost:54321"
|
94 |
+
},
|
95 |
+
"model_config": {
|
96 |
+
"n_speakers": 3,
|
97 |
+
"n_speaker_dim": 16,
|
98 |
+
"n_text": 185,
|
99 |
+
"n_text_dim": 512,
|
100 |
+
"n_flows": 8,
|
101 |
+
"n_conv_layers_per_step": 4,
|
102 |
+
"n_mel_channels": 80,
|
103 |
+
"n_hidden": 1024,
|
104 |
+
"mel_encoder_n_hidden": 512,
|
105 |
+
"dummy_speaker_embedding": false,
|
106 |
+
"n_early_size": 2,
|
107 |
+
"n_early_every": 2,
|
108 |
+
"n_group_size": 2,
|
109 |
+
"affine_model": "wavenet",
|
110 |
+
"include_modules": "decatndpmvpredapm",
|
111 |
+
"scaling_fn": "tanh",
|
112 |
+
"matrix_decomposition": "LUS",
|
113 |
+
"learn_alignments": true,
|
114 |
+
"use_speaker_emb_for_alignment": false,
|
115 |
+
"attn_straight_through_estimator": true,
|
116 |
+
"use_context_lstm": true,
|
117 |
+
"context_lstm_norm": "spectral",
|
118 |
+
"context_lstm_w_f0_and_energy": true,
|
119 |
+
"text_encoder_lstm_norm": "spectral",
|
120 |
+
"n_f0_dims": 1,
|
121 |
+
"n_energy_avg_dims": 1,
|
122 |
+
"use_first_order_features": false,
|
123 |
+
"unvoiced_bias_activation": "relu",
|
124 |
+
"decoder_use_partial_padding": true,
|
125 |
+
"decoder_use_unvoiced_bias": true,
|
126 |
+
"ap_pred_log_f0": true,
|
127 |
+
"ap_use_unvoiced_bias": false,
|
128 |
+
"ap_use_voiced_embeddings": true,
|
129 |
+
"dur_model_config": {
|
130 |
+
"name": "dap",
|
131 |
+
"hparams": {
|
132 |
+
"n_speaker_dim": 16,
|
133 |
+
"bottleneck_hparams": {
|
134 |
+
"in_dim": 512,
|
135 |
+
"reduction_factor": 16,
|
136 |
+
"norm": "weightnorm",
|
137 |
+
"non_linearity": "relu"
|
138 |
+
},
|
139 |
+
"take_log_of_input": true,
|
140 |
+
"arch_hparams": {
|
141 |
+
"out_dim": 1,
|
142 |
+
"n_layers": 2,
|
143 |
+
"n_channels": 256,
|
144 |
+
"kernel_size": 3,
|
145 |
+
"p_dropout": 0.25,
|
146 |
+
"in_dim": 48
|
147 |
+
}
|
148 |
+
}
|
149 |
+
},
|
150 |
+
"f0_model_config": {
|
151 |
+
"name": "dap",
|
152 |
+
"hparams": {
|
153 |
+
"n_speaker_dim": 16,
|
154 |
+
"bottleneck_hparams": {
|
155 |
+
"in_dim": 512,
|
156 |
+
"reduction_factor": 16,
|
157 |
+
"norm": "weightnorm",
|
158 |
+
"non_linearity": "relu"
|
159 |
+
},
|
160 |
+
"take_log_of_input": false,
|
161 |
+
"use_transformer": false,
|
162 |
+
"arch_hparams": {
|
163 |
+
"out_dim": 1,
|
164 |
+
"n_layers": 2,
|
165 |
+
"n_channels": 256,
|
166 |
+
"kernel_size": 11,
|
167 |
+
"p_dropout": 0.5,
|
168 |
+
"in_dim": 48
|
169 |
+
}
|
170 |
+
}
|
171 |
+
},
|
172 |
+
"energy_model_config": {
|
173 |
+
"name": "dap",
|
174 |
+
"hparams": {
|
175 |
+
"n_speaker_dim": 16,
|
176 |
+
"bottleneck_hparams": {
|
177 |
+
"in_dim": 512,
|
178 |
+
"reduction_factor": 16,
|
179 |
+
"norm": "weightnorm",
|
180 |
+
"non_linearity": "relu"
|
181 |
+
},
|
182 |
+
"take_log_of_input": false,
|
183 |
+
"use_transformer": false,
|
184 |
+
"arch_hparams": {
|
185 |
+
"out_dim": 1,
|
186 |
+
"n_layers": 2,
|
187 |
+
"n_channels": 256,
|
188 |
+
"kernel_size": 3,
|
189 |
+
"p_dropout": 0.25,
|
190 |
+
"in_dim": 48
|
191 |
+
}
|
192 |
+
}
|
193 |
+
},
|
194 |
+
"v_model_config": {
|
195 |
+
"name": "dap",
|
196 |
+
"hparams": {
|
197 |
+
"n_speaker_dim": 16,
|
198 |
+
"take_log_of_input": false,
|
199 |
+
"bottleneck_hparams": {
|
200 |
+
"in_dim": 512,
|
201 |
+
"reduction_factor": 16,
|
202 |
+
"norm": "weightnorm",
|
203 |
+
"non_linearity": "relu"
|
204 |
+
},
|
205 |
+
"arch_hparams": {
|
206 |
+
"out_dim": 1,
|
207 |
+
"n_layers": 2,
|
208 |
+
"n_channels": 256,
|
209 |
+
"kernel_size": 3,
|
210 |
+
"p_dropout": 0.5,
|
211 |
+
"lstm_type": "",
|
212 |
+
"use_linear": 1,
|
213 |
+
"in_dim": 48
|
214 |
+
}
|
215 |
+
}
|
216 |
+
}
|
217 |
+
}
|
218 |
+
}
|
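The data_config block above fixes the audio front-end geometry. As a quick orientation (a minimal sketch, not part of the commit; it only assumes the config path committed in this Space), here is what those numbers imply for mel-frame counts:

import json

# Load the training config committed above.
with open("configs/radtts-pp-dap-model.json") as f:
    config = json.load(f)

data_cfg = config["data_config"]

# Mel frames per second = sampling_rate / hop_length = 22050 / 256 ~ 86.1,
# so a clip at dur_max = 10.2 s is roughly 878 mel frames.
frames_per_second = data_cfg["sampling_rate"] / data_cfg["hop_length"]
max_frames = int(data_cfg["dur_max"] * frames_per_second)
print(f"{frames_per_second:.1f} frames/s, up to ~{max_frames} frames per clip")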
data.py
ADDED
@@ -0,0 +1,606 @@
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# Based on https://github.com/NVIDIA/flowtron/blob/master/data.py
# Original license text:
###############################################################################
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################

import os
import argparse
import json
import numpy as np
import lmdb
import pickle as pkl
import torch
import torch.utils.data
from scipy.io.wavfile import read
from audio_processing import TacotronSTFT
from tts_text_processing.text_processing import TextProcessing
from scipy.stats import betabinom
from librosa import pyin
from common import update_params
from scipy.ndimage import distance_transform_edt as distance_transform


def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=0.05):
    P = phoneme_count
    M = mel_count
    x = np.arange(0, P)
    mel_text_probs = []
    for i in range(1, M + 1):
        a, b = scaling_factor * i, scaling_factor * (M + 1 - i)
        rv = betabinom(P - 1, a, b)
        mel_i_prob = rv.pmf(x)
        mel_text_probs.append(mel_i_prob)
    return torch.tensor(np.array(mel_text_probs))


def load_wav_to_torch(full_path):
    """Loads wav data into a torch array"""
    sampling_rate, data = read(full_path)
    return torch.from_numpy(np.array(data)).float(), sampling_rate


class Data(torch.utils.data.Dataset):
    def __init__(
        self,
        datasets,
        filter_length,
        hop_length,
        win_length,
        sampling_rate,
        n_mel_channels,
        mel_fmin,
        mel_fmax,
        f0_min,
        f0_max,
        max_wav_value,
        use_f0,
        use_energy_avg,
        use_log_f0,
        use_scaled_energy,
        symbol_set,
        cleaner_names,
        heteronyms_path,
        phoneme_dict_path,
        p_phoneme,
        handle_phoneme="word",
        handle_phoneme_ambiguous="ignore",
        speaker_ids=None,
        include_speakers=None,
        n_frames=-1,
        use_attn_prior_masking=True,
        prepend_space_to_text=True,
        append_space_to_text=True,
        add_bos_eos_to_text=False,
        betabinom_cache_path="",
        betabinom_scaling_factor=0.05,
        lmdb_cache_path="",
        dur_min=None,
        dur_max=None,
        combine_speaker_and_emotion=False,
        **kwargs,
    ):
        self.combine_speaker_and_emotion = combine_speaker_and_emotion
        self.max_wav_value = max_wav_value
        self.audio_lmdb_dict = {}  # dictionary of lmdbs for audio data
        self.data = self.load_data(datasets)
        self.distance_tx_unvoiced = False
        if "distance_tx_unvoiced" in kwargs.keys():
            self.distance_tx_unvoiced = kwargs["distance_tx_unvoiced"]
        self.stft = TacotronSTFT(
            filter_length=filter_length,
            hop_length=hop_length,
            win_length=win_length,
            sampling_rate=sampling_rate,
            n_mel_channels=n_mel_channels,
            mel_fmin=mel_fmin,
            mel_fmax=mel_fmax,
        )

        self.do_mel_scaling = kwargs.get("do_mel_scaling", True)
        self.mel_noise_scale = kwargs.get("mel_noise_scale", 0.0)
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.use_f0 = use_f0
        self.use_log_f0 = use_log_f0
        self.use_energy_avg = use_energy_avg
        self.use_scaled_energy = use_scaled_energy
        self.sampling_rate = sampling_rate
        self.tp = TextProcessing(
            symbol_set,
            cleaner_names,
            heteronyms_path,
            phoneme_dict_path,
            p_phoneme=p_phoneme,
            handle_phoneme=handle_phoneme,
            handle_phoneme_ambiguous=handle_phoneme_ambiguous,
            prepend_space_to_text=prepend_space_to_text,
            append_space_to_text=append_space_to_text,
            add_bos_eos_to_text=add_bos_eos_to_text,
        )

        self.dur_min = dur_min
        self.dur_max = dur_max
        if speaker_ids is None or speaker_ids == "":
            self.speaker_ids = self.create_speaker_lookup_table(self.data)
        else:
            self.speaker_ids = speaker_ids

        print("Number of files", len(self.data))
        if include_speakers is not None:
            for speaker_set, include in include_speakers:
                self.filter_by_speakers_(speaker_set, include)
            print("Number of files after speaker filtering", len(self.data))

        if dur_min is not None and dur_max is not None:
            self.filter_by_duration_(dur_min, dur_max)
            print("Number of files after duration filtering", len(self.data))

        self.use_attn_prior_masking = bool(use_attn_prior_masking)
        self.prepend_space_to_text = bool(prepend_space_to_text)
        self.append_space_to_text = bool(append_space_to_text)
        self.betabinom_cache_path = betabinom_cache_path
        self.betabinom_scaling_factor = betabinom_scaling_factor
        self.lmdb_cache_path = lmdb_cache_path
        if self.lmdb_cache_path != "":
            self.cache_data_lmdb = lmdb.open(
                self.lmdb_cache_path, readonly=True, max_readers=1024, lock=False
            ).begin()

        # # make sure caching path exists
        # if not os.path.exists(self.betabinom_cache_path):
        #     os.makedirs(self.betabinom_cache_path)

        print("Dataloader initialized with no augmentations")
        self.speaker_map = None
        if "speaker_map" in kwargs:
            self.speaker_map = kwargs["speaker_map"]

    def load_data(self, datasets, split="|"):
        dataset = []
        for dset_name, dset_dict in datasets.items():
            folder_path = dset_dict["basedir"]
            audiodir = dset_dict["audiodir"]
            filename = dset_dict["filelist"]
            audio_lmdb_key = None
            if "lmdbpath" in dset_dict.keys() and len(dset_dict["lmdbpath"]) > 0:
                self.audio_lmdb_dict[dset_name] = lmdb.open(
                    dset_dict["lmdbpath"], readonly=True, max_readers=256, lock=False
                ).begin()
                audio_lmdb_key = dset_name

            wav_folder_prefix = os.path.join(folder_path, audiodir)
            filelist_path = os.path.join(folder_path, filename)
            with open(filelist_path, encoding="utf-8") as f:
                data = [line.strip().split(split) for line in f]

            for d in data:
                emotion = "other" if len(d) == 3 else d[3]
                duration = -1 if len(d) == 3 else d[4]
                dataset.append(
                    {
                        "audiopath": os.path.join(wav_folder_prefix, d[0]),
                        "text": d[1],
                        "speaker": d[2] + "-" + emotion
                        if self.combine_speaker_and_emotion
                        else d[2],
                        "emotion": emotion,
                        "duration": float(duration),
                        "lmdb_key": audio_lmdb_key,
                    }
                )
        return dataset

    def filter_by_speakers_(self, speakers, include=True):
        print("Include speaker {}: {}".format(speakers, include))
        if include:
            self.data = [x for x in self.data if x["speaker"] in speakers]
        else:
            self.data = [x for x in self.data if x["speaker"] not in speakers]

    def filter_by_duration_(self, dur_min, dur_max):
        self.data = [
            x
            for x in self.data
            if x["duration"] == -1
            or (x["duration"] >= dur_min and x["duration"] <= dur_max)
        ]

    def create_speaker_lookup_table(self, data):
        speaker_ids = np.sort(np.unique([x["speaker"] for x in data]))
        d = {speaker_ids[i]: i for i in range(len(speaker_ids))}
        print("Number of speakers:", len(d))
        print("Speaker IDS", d)
        return d

    def f0_normalize(self, x):
        if self.use_log_f0:
            mask = x >= self.f0_min
            x[mask] = torch.log(x[mask])
            x[~mask] = 0.0

        return x

    def f0_denormalize(self, x):
        if self.use_log_f0:
            log_f0_min = np.log(self.f0_min)
            mask = x >= log_f0_min
            x[mask] = torch.exp(x[mask])
            x[~mask] = 0.0
        x[x <= 0.0] = 0.0

        return x

    def energy_avg_normalize(self, x):
        if self.use_scaled_energy:
            x = (x + 20.0) / 20.0
        return x

    def energy_avg_denormalize(self, x):
        if self.use_scaled_energy:
            x = x * 20.0 - 20.0
        return x

    def get_f0_pvoiced(
        self,
        audio,
        sampling_rate=22050,
        frame_length=1024,
        hop_length=256,
        f0_min=100,
        f0_max=300,
    ):
        audio_norm = audio / self.max_wav_value
        f0, voiced_mask, p_voiced = pyin(
            audio_norm,
            f0_min,
            f0_max,
            sampling_rate,
            frame_length=frame_length,
            win_length=frame_length // 2,
            hop_length=hop_length,
        )
        f0[~voiced_mask] = 0.0
        f0 = torch.FloatTensor(f0)
        p_voiced = torch.FloatTensor(p_voiced)
        voiced_mask = torch.FloatTensor(voiced_mask)
        return f0, voiced_mask, p_voiced

    def get_energy_average(self, mel):
        energy_avg = mel.mean(0)
        energy_avg = self.energy_avg_normalize(energy_avg)
        return energy_avg

    def get_mel(self, audio):
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        if self.do_mel_scaling:
            melspec = (melspec + 5.5) / 2
        if self.mel_noise_scale > 0:
            melspec += torch.randn_like(melspec) * self.mel_noise_scale
        return melspec

    def get_speaker_id(self, speaker):
        if self.speaker_map is not None and speaker in self.speaker_map:
            speaker = self.speaker_map[speaker]

        return torch.LongTensor([self.speaker_ids[speaker]])

    def get_text(self, text):
        text = self.tp.encode_text(text)
        text = torch.LongTensor(text)
        return text

    def get_attention_prior(self, n_tokens, n_frames):
        # cache the entire attn_prior by filename
        if self.use_attn_prior_masking:
            filename = "{}_{}".format(n_tokens, n_frames)
            prior_path = os.path.join(self.betabinom_cache_path, filename)
            prior_path += "_prior.pth"
            if self.lmdb_cache_path != "":
                attn_prior = pkl.loads(
                    self.cache_data_lmdb.get(prior_path.encode("ascii"))
                )
            elif os.path.exists(prior_path):
                attn_prior = torch.load(prior_path)
            else:
                attn_prior = beta_binomial_prior_distribution(
                    n_tokens, n_frames, self.betabinom_scaling_factor
                )
                torch.save(attn_prior, prior_path)
        else:
            attn_prior = torch.ones(n_frames, n_tokens)  # all ones baseline

        return attn_prior

    def __getitem__(self, index):
        data = self.data[index]
        audiopath, text = data["audiopath"], data["text"]
        speaker_id = data["speaker"]

        if data["lmdb_key"] is not None:
            data_dict = pkl.loads(
                self.audio_lmdb_dict[data["lmdb_key"]].get(audiopath.encode("ascii"))
            )
            audio = data_dict["audio"]
            sampling_rate = data_dict["sampling_rate"]
        else:
            audio, sampling_rate = load_wav_to_torch(audiopath)

        if sampling_rate != self.sampling_rate:
            raise ValueError(
                "{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate
                )
            )

        mel = self.get_mel(audio)
        f0 = None
        p_voiced = None
        voiced_mask = None
        if self.use_f0:
            filename = "_".join(audiopath.split("/")[-3:])
            f0_path = os.path.join(self.betabinom_cache_path, filename)
            f0_path += "_f0_sr{}_fl{}_hl{}_f0min{}_f0max{}_log{}.pt".format(
                self.sampling_rate,
                self.filter_length,
                self.hop_length,
                self.f0_min,
                self.f0_max,
                self.use_log_f0,
            )

            dikt = None
            if len(self.lmdb_cache_path) > 0:
                dikt = pkl.loads(self.cache_data_lmdb.get(f0_path.encode("ascii")))
                f0 = dikt["f0"]
                p_voiced = dikt["p_voiced"]
                voiced_mask = dikt["voiced_mask"]
            elif os.path.exists(f0_path):
                try:
                    dikt = torch.load(f0_path)
                except Exception:
                    print(f"f0 loading from {f0_path} is broken, recomputing.")

            if dikt is not None:
                f0 = dikt["f0"]
                p_voiced = dikt["p_voiced"]
                voiced_mask = dikt["voiced_mask"]
            else:
                f0, voiced_mask, p_voiced = self.get_f0_pvoiced(
                    audio.cpu().numpy(),
                    self.sampling_rate,
                    self.filter_length,
                    self.hop_length,
                    self.f0_min,
                    self.f0_max,
                )
                print("saving f0 to {}".format(f0_path))
                torch.save(
                    {"f0": f0, "voiced_mask": voiced_mask, "p_voiced": p_voiced},
                    f0_path,
                )
            if f0 is None:
                raise Exception("STOP, BROKEN F0 {}".format(audiopath))

            f0 = self.f0_normalize(f0)
            if self.distance_tx_unvoiced:
                mask = f0 <= 0.0
                distance_map = np.log(distance_transform(mask))
                distance_map[distance_map <= 0] = 0.0
                f0 = f0 - distance_map

        energy_avg = None
        if self.use_energy_avg:
            energy_avg = self.get_energy_average(mel)
            if self.use_scaled_energy and energy_avg.min() < 0.0:
                print(audiopath, "has scaled energy avg smaller than 0")

        speaker_id = self.get_speaker_id(speaker_id)
        text_encoded = self.get_text(text)

        attn_prior = self.get_attention_prior(text_encoded.shape[0], mel.shape[1])

        if not self.use_attn_prior_masking:
            attn_prior = None

        return {
            "mel": mel,
            "speaker_id": speaker_id,
            "text_encoded": text_encoded,
            "audiopath": audiopath,
            "attn_prior": attn_prior,
            "f0": f0,
            "p_voiced": p_voiced,
            "voiced_mask": voiced_mask,
            "energy_avg": energy_avg,
        }

    def __len__(self):
        return len(self.data)


class DataCollate:
    """Zero-pads model inputs and targets given number of steps"""

    def __init__(self, n_frames_per_step=1):
        self.n_frames_per_step = n_frames_per_step

    def __call__(self, batch):
        """Collate from normalized data"""
        # Right zero-pad all one-hot text sequences to max input length
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x["text_encoded"]) for x in batch]),
            dim=0,
            descending=True,
        )

        max_input_len = input_lengths[0]
        text_padded = torch.LongTensor(len(batch), max_input_len)
        text_padded.zero_()

        for i in range(len(ids_sorted_decreasing)):
            text = batch[ids_sorted_decreasing[i]]["text_encoded"]
            text_padded[i, : text.size(0)] = text

        # Right zero-pad mel-spec
        num_mel_channels = batch[0]["mel"].size(0)
        max_target_len = max([x["mel"].size(1) for x in batch])

        # include mel padded, gate padded and speaker ids
        mel_padded = torch.FloatTensor(len(batch), num_mel_channels, max_target_len)
        mel_padded.zero_()
        f0_padded = None
        p_voiced_padded = None
        voiced_mask_padded = None
        energy_avg_padded = None
        if batch[0]["f0"] is not None:
            f0_padded = torch.FloatTensor(len(batch), max_target_len)
            f0_padded.zero_()

        if batch[0]["p_voiced"] is not None:
            p_voiced_padded = torch.FloatTensor(len(batch), max_target_len)
            p_voiced_padded.zero_()

        if batch[0]["voiced_mask"] is not None:
            voiced_mask_padded = torch.FloatTensor(len(batch), max_target_len)
            voiced_mask_padded.zero_()

        if batch[0]["energy_avg"] is not None:
            energy_avg_padded = torch.FloatTensor(len(batch), max_target_len)
            energy_avg_padded.zero_()

        attn_prior_padded = torch.FloatTensor(len(batch), max_target_len, max_input_len)
        attn_prior_padded.zero_()

        output_lengths = torch.LongTensor(len(batch))
        speaker_ids = torch.LongTensor(len(batch))
        audiopaths = []
        for i in range(len(ids_sorted_decreasing)):
            mel = batch[ids_sorted_decreasing[i]]["mel"]
            mel_padded[i, :, : mel.size(1)] = mel
            if batch[ids_sorted_decreasing[i]]["f0"] is not None:
                f0 = batch[ids_sorted_decreasing[i]]["f0"]
                f0_padded[i, : len(f0)] = f0

            if batch[ids_sorted_decreasing[i]]["voiced_mask"] is not None:
                voiced_mask = batch[ids_sorted_decreasing[i]]["voiced_mask"]
                voiced_mask_padded[i, : len(f0)] = voiced_mask

            if batch[ids_sorted_decreasing[i]]["p_voiced"] is not None:
                p_voiced = batch[ids_sorted_decreasing[i]]["p_voiced"]
                p_voiced_padded[i, : len(f0)] = p_voiced

            if batch[ids_sorted_decreasing[i]]["energy_avg"] is not None:
                energy_avg = batch[ids_sorted_decreasing[i]]["energy_avg"]
                energy_avg_padded[i, : len(energy_avg)] = energy_avg

            output_lengths[i] = mel.size(1)
            speaker_ids[i] = batch[ids_sorted_decreasing[i]]["speaker_id"]
            audiopath = batch[ids_sorted_decreasing[i]]["audiopath"]
            audiopaths.append(audiopath)
            cur_attn_prior = batch[ids_sorted_decreasing[i]]["attn_prior"]
            if cur_attn_prior is None:
                attn_prior_padded = None
            else:
                attn_prior_padded[
                    i, : cur_attn_prior.size(0), : cur_attn_prior.size(1)
                ] = cur_attn_prior

        return {
            "mel": mel_padded,
            "speaker_ids": speaker_ids,
            "text": text_padded,
            "input_lengths": input_lengths,
            "output_lengths": output_lengths,
            "audiopaths": audiopaths,
            "attn_prior": attn_prior_padded,
            "f0": f0_padded,
            "p_voiced": p_voiced_padded,
            "voiced_mask": voiced_mask_padded,
            "energy_avg": energy_avg_padded,
        }


# ===================================================================
# Takes directory of clean audio and makes directory of spectrograms
# Useful for making test sets
# ===================================================================
if __name__ == "__main__":
    # Get defaults so it can work with no Sacred
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", type=str, help="JSON file for configuration")
    parser.add_argument("-p", "--params", nargs="+", default=[])
    args = parser.parse_args()
    args.rank = 0

    # Parse configs. Globals nicer in this case
    with open(args.config) as f:
        data = f.read()

    config = json.loads(data)
    update_params(config, args.params)
    print(config)

    data_config = config["data_config"]

    ignore_keys = ["training_files", "validation_files"]
    trainset = Data(
        data_config["training_files"],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
    )

    valset = Data(
        data_config["validation_files"],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
        speaker_ids=trainset.speaker_ids,
    )

    collate_fn = DataCollate()

    for dataset in (trainset, valset):
        for i, batch in enumerate(dataset):
            out = batch
            print("{}/{}".format(i, len(dataset)))
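A minimal sketch (not part of the commit) of how Data and DataCollate would typically be wired into a PyTorch DataLoader for this config. It assumes the filelists and betabinom_cache_path named in data_config resolve on the local machine:

import json

from torch.utils.data import DataLoader

from data import Data, DataCollate

# Pull the data_config block from the committed config file.
with open("configs/radtts-pp-dap-model.json") as f:
    data_config = json.load(f)["data_config"]

ignore_keys = ["training_files", "validation_files"]
trainset = Data(
    data_config["training_files"],
    **{k: v for k, v in data_config.items() if k not in ignore_keys},
)

# DataCollate right-zero-pads text, mel, f0, energy and the attention prior
# to the longest item in the batch, sorted by text length (descending).
loader = DataLoader(
    trainset,
    batch_size=4,
    shuffle=True,
    collate_fn=DataCollate(),
    num_workers=2,
)

batch = next(iter(loader))
print(batch["mel"].shape)      # (4, 80, max_target_len)
print(batch["input_lengths"])  # sorted descending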
distributed.py
ADDED
@@ -0,0 +1,161 @@
# Original source: https://github.com/NVIDIA/waveglow/blob/master/distributed.py
#
# Original license text:
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

import os
import torch
import torch.distributed as dist
from torch.autograd import Variable


def reduce_tensor(tensor, num_gpus, reduce_dst=None):
    if num_gpus <= 1:  # pass-thru
        return tensor
    rt = tensor.clone()
    if reduce_dst is not None:
        dist.reduce(rt, reduce_dst, op=dist.ReduceOp.SUM)
    else:
        dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= num_gpus
    return rt


def init_distributed(rank, num_gpus, dist_backend, dist_url):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."

    print("> initializing distributed for rank {} out of {}".format(rank, num_gpus))

    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())

    init_method = "tcp://"
    master_ip = os.getenv("MASTER_ADDR", "localhost")
    master_port = os.getenv("MASTER_PORT", "6000")
    init_method += master_ip + ":" + master_port
    torch.distributed.init_process_group(
        backend="nccl", world_size=num_gpus, rank=rank, init_method=init_method
    )


def _flatten_dense_tensors(tensors):
    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
    same dense type.
    Since inputs are dense, the resulting tensor will be a concatenated 1D
    buffer. Element-wise operation on this buffer will be equivalent to
    operating individually.
    Arguments:
        tensors (Iterable[Tensor]): dense tensors to flatten.
    Returns:
        A contiguous 1D buffer containing input tensors.
    """
    if len(tensors) == 1:
        return tensors[0].contiguous().view(-1)
    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
    return flat


def _unflatten_dense_tensors(flat, tensors):
    """View a flat buffer using the sizes of tensors. Assume that tensors are of
    same dense type, and that flat is given by _flatten_dense_tensors.
    Arguments:
        flat (Tensor): flattened dense tensors to unflatten.
        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
            unflatten flat.
    Returns:
        Unflattened dense tensors with sizes same as tensors and values from
        flat.
    """
    outputs = []
    offset = 0
    for tensor in tensors:
        numel = tensor.numel()
        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
        offset += numel
    return tuple(outputs)


def apply_gradient_allreduce(module):
    """
    Modifies existing model to do gradient allreduce, but doesn't change class
    so you don't need "module"
    """
    if not hasattr(dist, "_backend"):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print(
                        "WARNING: gloo dist backend for half parameters may be extremely slow."
                        + " It is recommended to use the NCCL backend in this case."
                        + " This currently requires PyTorch built from top of tree master."
                    )
                    module.warn_on_half = False

            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(
                    grads, _unflatten_dense_tensors(coalesced, grads)
                ):
                    buf.copy_(synced)

    for param in list(module.parameters()):

        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)

        if param.requires_grad:
            param.register_hook(allreduce_hook)
            dir(param)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
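A hypothetical two-GPU usage sketch for these helpers (not part of the commit). The backend and URL values mirror dist_config in the committed config, and the sketch assumes two local CUDA devices; init_distributed itself reads MASTER_ADDR/MASTER_PORT from the environment:

import os
import torch
import torch.multiprocessing as mp

from distributed import init_distributed, reduce_tensor


def worker(rank, num_gpus):
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "6000")
    init_distributed(rank, num_gpus, "nccl", "tcp://localhost:54321")

    # Each rank produces its own loss; reduce_tensor averages it across GPUs.
    local_loss = torch.tensor([float(rank)], device="cuda")
    avg_loss = reduce_tensor(local_loss, num_gpus)
    print(f"rank {rank}: averaged loss = {avg_loss.item()}")


if __name__ == "__main__":
    num_gpus = 2  # hypothetical; requires two CUDA devices
    mp.spawn(worker, args=(num_gpus,), nprocs=num_gpus)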
filelists/3speakers_ukrainian_train_filelist.txt
ADDED
The diff for this file is too large to render.
filelists/3speakers_ukrainian_train_filelist_dc.txt
ADDED
The diff for this file is too large to render.
filelists/3speakers_ukrainian_val_filelist.txt
ADDED
@@ -0,0 +1,85 @@
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48849.wav|мандрівник+и вп+ерто відмовл+ялися.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48850.wav|він уз+яв сок+иру й г+острим кінц+ем поч+ав розв+ажувати з+уби.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48851.wav|розгр+ібши сніг, тр+охи прос+унув г+олову й пл+ечі під шатр+о.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48853.wav|ал+е раз зас+идівся до п+ізнього в+ечора.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48854.wav|то ж не дим їй +очі роз'їд+ав, бо др+ова бул+и сух+і.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48855.wav|вон+а не м+ала теп+ер с+умніву, що в портоса з д+амою бул+а інтр+ига.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48857.wav|х+очуть укра+їну з під л+яхів визвол+яти.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48858.wav|там жінк+ам не д+уже догодж+ають.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48859.wav|і б+удьте спок+ійні! якщ+о вин+о нам не спод+обається, ми пошлем+о по +інше.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48830.wav|мій д+івер і я м+арно чек+али на вас вч+ора й позавч+ора.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48831.wav|п+ане д'артаньяне, ви п+ерший.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48832.wav|ось мо+я в+ідповідь.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48833.wav|хоч той так+и й д+ійсно д+урень.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48834.wav|ви давн+о не гр+али?|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48835.wav|теп+ер їм довел+ось зазн+ати д+оброї бід+и в цій кра+їні.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48836.wav|позавч+ора був пісн+ий день, а там подав+али лиш+е скор+омне.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48837.wav|і не потреб+уєте всі роб+ити.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48838.wav|у рук+ах у н+еї бул+а нов+а зап+иска міл+еді.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48839.wav|і ч+етверо др+узів одн+им г+олосом повтор+или прис+ягу, запропон+овану від д'артаньяна.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48841.wav|іг+уменя ст+ала сл+ухати ув+ажніш, тр+охи пожвав+іла й всміхн+улася.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48842.wav|так ти цьог+о не роб+и й не втрач+айся, бо одн+аково не пом+оже.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48843.wav|туд+и і рв+еться н+аша душ+а, кол+и х+очеш зн+ати.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48844.wav|б+олісно всміх+ався і трясс+я, як у проп+асниці.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48845.wav|я прив+ів тоб+і др+угого, сказ+ав д'артаньян.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48846.wav|я поб+ачу корол+я сьог+одні увечорі, ал+е вас не р+аджу наверт+атись йому на в+ічі.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48847.wav|ще весел+іш почал+и тод+і гомон+іти.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/lada/accept/48848.wav|споч+атку вон+а нарахув+ала двох, п+отім п'ять, нар+ешті в+ісім.|lada
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68540.wav|кр+аще вже пуст+ити соб+і к+улю в л+оба і відр+азу покл+асти всь+ому край.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68541.wav|ал+е сидяч+и за стол+ом, при п+иві, знов поч+ув як+есь невдов+олення.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68543.wav|на шабл+ях!|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68544.wav|вон+а пров+адила з незнай+омим д+уже жв+аву розм+ову.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68545.wav|офіц+ер взяв зі ст+олу вк+азані пап+ери, под+ав їх і, н+изько вклонившися, в+ийшов.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68546.wav|аж с+умно йому ст+ало.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68547.wav|житт+я не ласк+аве з багать+ох прич+ин.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68548.wav|так, звич+айно тр+еба, ств+ердила корол+ева.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68549.wav|вон+а, не зверн+увши ув+аги на цей д+ок+ір, промовл+яла д+алі.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68550.wav|зда+ється, не дочув+аю.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68551.wav|відв+ажний і завз+ятий, він не вп+ерше в+ажив сво+ї+++м житт+ям у так+их приг+одах.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68552.wav|як ч+асом, г+аво.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68553.wav|мій друг араміс, що оц+е сто+їть п+еред вами, здоб+ув легк+ого вд+ара шпад+ою в р+уку.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68554.wav|я знав+ець свог+о д+іла.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68556.wav|пог+онич леж+ав на с+анк+ах, а соб+аки шв+идко б+ігли пр+ямо до хат+ини.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68557.wav|міл+еді к+инулась до нього.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68558.wav|хто тоб+і сказ+ав?|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68559.wav|то й не поваж+ай, не зляк+аєш.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68560.wav|поясн+іть, бо я не розум+ію, що ви х+очете сказ+ати.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68561.wav|шрам наздогн+ав свій п+оїзд к+оло вис+оких вор+іт п+ана гвинтовки.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68562.wav|що ж він так+е?|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68563.wav|що це так+е? спит+ав портос.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/mykyta/accept/68565.wav|див+іться, тут зн+ову втруч+алася ц+ерква, з+авжд+и та ц+ерква.|mykyta
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67117.wav|а чолов+ік цьог+о жахл+ивого створ+іння ще жив+ий? зацік+авився араміс.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67118.wav|ви, дик, не ч+ули ці+єї т+иші.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67119.wav|він баг+атий на р+ок+и, шан+обу й сл+аву вел+ику.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67120.wav|в +осени зар+ані, ск+оро п+ісля сп+аса под+ався макс+им до київа.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67121.wav|а до н+еї п+ишеш?|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67122.wav|я, б+ачилось, н+авіть не люб+ив її так, як л+юблять зак+охані.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67123.wav|юрб+а провал+ила тим ч+асом м+имо петр+а.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67124.wav|хай так! приєдн+ався швайц+арець.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67125.wav|к+онюх підтв+ердив кардин+алові слов+а мушкет+ерів про атоса.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67126.wav|що завин+ив, те б+уду терп+іти.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67127.wav|чи є у вас тр+охи піск+у? ск+ільки? він показ+ав їй свій міш+ок.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67128.wav|я скаж+у це т+ільки том+у, хто прозирн+е в мо+ю д+ушу.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67129.wav|і в оц+ій хв+илі вон+а не міркув+ала тог+о.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67130.wav|ти б+ачив сво+ю ж?|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67132.wav|прот+е, тр+еба скл+асти як+ийсь плян б+ою, пром+овив араміс.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67133.wav|огого! д+уже швидк+а! так я теб+е й пуст+ив до богун+а!|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67134.wav|бог з тоб+ою, добр+одію!|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67135.wav|киценька! ти т+ямиш її?|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67136.wav|розм+ова поверн+ула на вес+еле.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67137.wav|розум+іється, сказ+ала вон+а к+оротко.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67138.wav|їй с+оромно ст+ало, що на оч+ах у всіх її так знев+ажено, і вон+а знен+авиділа фреду.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67139.wav|це бул+о м+ужнє обл+иччя.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67140.wav|св+екра зн+ала м+ало, не ч+асто й б+ачилася з ним, на рік раз+ів зо три.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67141.wav|спр+ава ця єсть особл+ивої делікатности.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67143.wav|я так отощ+ав, не +ївши зр+анку, що й р+адуватись незд+ужаю.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67145.wav|т+ільки в+ірна будь мен+і.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67146.wav|п'єр піш+ов за н+ею і відч+алив.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67147.wav|і по цих слов+ах к+инув торб+инку із з+олотом в р+ічку.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67148.wav|а, він в пор+ядку, сказ+ав нач+альник, та з чуд+овою рекоменд+ацією.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67149.wav|тод+і підожд+іть тр+ошки, зачек+айте.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67150.wav|із як+ими вістьми? пит+ає г+етьман.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67151.wav|стар+ий сарабр+ин міг л+егко пот+ішитися.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67152.wav|о, я, нещ+асний!|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67153.wav|кр+оки в сальоні.|tetiana
/home/yehor/RADTTS-Multiple-Voices/datasets/tetiana/accept/67154.wav|щоб н+ашим ворог+ам бул+о т+яжко!|tetiana
filelists/3speakers_ukrainian_val_filelist_dc.txt
ADDED
@@ -0,0 +1,85 @@
1 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48849.wav|мандрівник+и вп+ерто відмовл+ялися.|lada
|
2 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48850.wav|він уз+яв сок+иру й г+острим кінц+ем поч+ав розв+ажувати з+уби.|lada
|
3 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48851.wav|розгр+ібши сніг, тр+охи прос+унув г+олову й пл+ечі під шатр+о.|lada
|
4 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48853.wav|ал+е раз зас+идівся до п+ізнього в+ечора.|lada
|
5 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48854.wav|то ж не дим їй +очі роз'їд+ав, бо др+ова бул+и сух+і.|lada
|
6 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48855.wav|вон+а не м+ала теп+ер с+умніву, що в портоса з д+амою бул+а інтр+ига.|lada
|
7 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48857.wav|х+очуть укра+їну з під л+яхів визвол+яти.|lada
|
8 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48858.wav|там жінк+ам не д+уже догодж+ають.|lada
|
9 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48859.wav|і б+удьте спок+ійні! якщ+о вин+о нам не спод+обається, ми пошлем+о по +інше.|lada
|
10 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48830.wav|мій д+івер і я м+арно чек+али на вас вч+ора й позавч+ора.|lada
|
11 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48831.wav|п+ане д'артаньяне, ви п+ерший.|lada
|
12 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48832.wav|ось мо+я в+ідповідь.|lada
|
13 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48833.wav|хоч той так+и й д+ійсно д+урень.|lada
|
14 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48834.wav|ви давн+о не гр+али?|lada
|
15 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48835.wav|теп+ер їм довел+ось зазн+ати д+оброї бід+и в цій кра+їні.|lada
|
16 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48836.wav|позавч+ора був пісн+ий день, а там подав+али лиш+е скор+омне.|lada
|
17 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48837.wav|і не потреб+уєте всі роб+ити.|lada
|
18 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48838.wav|у рук+ах у н+еї бул+а нов+а зап+иска міл+еді.|lada
|
19 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48839.wav|і ч+етверо др+узів одн+им г+олосом повтор+или прис+ягу, запропон+овану від д'артаньяна.|lada
|
20 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48841.wav|іг+уменя ст+ала сл+ухати ув+ажніш, тр+охи пожвав+іла й всміхн+улася.|lada
|
21 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48842.wav|так ти цьог+о не роб+и й не втрач+айся, бо одн+аково не пом+оже.|lada
|
22 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48843.wav|туд+и і рв+еться н+аша душ+а, кол+и х+очеш зн+ати.|lada
|
23 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48844.wav|б+олісно всміх+ався і трясс+я, як у проп+асниці.|lada
|
24 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48845.wav|я прив+ів тоб+і др+угого, сказ+ав д'артаньян.|lada
|
25 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48846.wav|я поб+ачу корол+я сьог+одні увечорі, ал+е вас не р+аджу наверт+атись йому на в+ічі.|lada
|
26 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48847.wav|ще весел+іш почал+и тод+і гомон+іти.|lada
|
27 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/lada/accept/48848.wav|споч+атку вон+а нарахув+ала двох, п+отім п'ять, нар+ешті в+ісім.|lada
|
28 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68540.wav|кр+аще вже пуст+ити соб+і к+улю в л+оба і відр+азу покл+асти всь+ому край.|mykyta
|
29 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68541.wav|ал+е сидяч+и за стол+ом, при п+иві, знов поч+ув як+есь невдов+олення.|mykyta
|
30 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68543.wav|на шабл+ях!|mykyta
|
31 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68544.wav|вон+а пров+адила з незнай+омим д+уже жв+аву розм+ову.|mykyta
|
32 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68545.wav|офіц+ер взяв зі ст+олу вк+азані пап+ери, под+ав їх і, н+изько вклонившися, в+ийшов.|mykyta
|
33 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68546.wav|аж с+умно йому ст+ало.|mykyta
|
34 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68547.wav|житт+я не ласк+аве з багать+ох прич+ин.|mykyta
|
35 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68548.wav|так, звич+айно тр+еба, ств+ердила корол+ева.|mykyta
|
36 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68549.wav|вон+а, не зверн+увши ув+аги на цей д+ок+ір, промовл+яла д+алі.|mykyta
|
37 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68550.wav|зда+ється, не дочув+аю.|mykyta
|
38 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68551.wav|відв+ажний і завз+ятий, він не вп+ерше в+ажив сво+ї+++м житт+ям у так+их приг+одах.|mykyta
|
39 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68552.wav|як ч+асом, г+аво.|mykyta
|
40 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68553.wav|мій друг араміс, що оц+е сто+їть п+еред вами, здоб+ув легк+ого вд+ара шпад+ою в р+уку.|mykyta
|
41 |
+
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68554.wav|я знав+ець свог+о д+іла.|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68556.wav|пог+онич леж+ав на с+анк+ах, а соб+аки шв+идко б+ігли пр+ямо до хат+ини.|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68557.wav|міл+еді к+инулась до нього.|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68558.wav|хто тоб+і сказ+ав?|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68559.wav|то й не поваж+ай, не зляк+аєш.|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68560.wav|поясн+іть, бо я не розум+ію, що ви х+очете сказ+ати.|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68561.wav|шрам наздогн+ав свій п+оїзд к+оло вис+оких вор+іт п+ана гвинтовки.|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68562.wav|що ж він так+е?|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68563.wav|що це так+е? спит+ав портос.|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/mykyta/accept/68565.wav|див+іться, тут зн+ову втруч+алася ц+ерква, з+авжд+и та ц+ерква.|mykyta
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67117.wav|а чолов+ік цьог+о жахл+ивого створ+іння ще жив+ий? зацік+авився араміс.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67118.wav|ви, дик, не ч+ули ці+єї т+иші.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67119.wav|він баг+атий на р+ок+и, шан+обу й сл+аву вел+ику.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67120.wav|в +осени зар+ані, ск+оро п+ісля сп+аса под+ався макс+им до київа.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67121.wav|а до н+еї п+ишеш?|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67122.wav|я, б+ачилось, н+авіть не люб+ив її так, як л+юблять зак+охані.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67123.wav|юрб+а провал+ила тим ч+асом м+имо петр+а.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67124.wav|хай так! приєдн+ався швайц+арець.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67125.wav|к+онюх підтв+ердив кардин+алові слов+а мушкет+ерів про атоса.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67126.wav|що завин+ив, те б+уду терп+іти.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67127.wav|чи є у вас тр+охи піск+у? ск+ільки? він показ+ав їй свій міш+ок.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67128.wav|я скаж+у це т+ільки том+у, хто прозирн+е в мо+ю д+ушу.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67129.wav|і в оц+ій хв+илі вон+а не міркув+ала тог+о.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67130.wav|ти б+ачив сво+ю ж?|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67132.wav|прот+е, тр+еба скл+асти як+ийсь плян б+ою, пром+овив араміс.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67133.wav|огого! д+уже швидк+а! так я теб+е й пуст+ив до богун+а!|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67134.wav|бог з тоб+ою, добр+одію!|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67135.wav|киценька! ти т+ямиш її?|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67136.wav|розм+ова поверн+ула на вес+еле.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67137.wav|розум+іється, сказ+ала вон+а к+оротко.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67138.wav|їй с+оромно ст+ало, що на оч+ах у всіх її так знев+ажено, і вон+а знен+авиділа фреду.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67139.wav|це бул+о м+ужнє обл+иччя.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67140.wav|св+екра зн+ала м+ало, не ч+асто й б+ачилася з ним, на рік раз+ів зо три.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67141.wav|спр+ава ця єсть особл+ивої делікатности.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67143.wav|я так отощ+ав, не +ївши зр+анку, що й р+адуватись незд+ужаю.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67145.wav|т+ільки в+ірна будь мен+і.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67146.wav|п'єр піш+ов за н+ею і відч+алив.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67147.wav|і по цих слов+ах к+инув торб+инку із з+олотом в р+ічку.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67148.wav|а, він в пор+ядку, сказ+ав нач+альник, та з чуд+овою рекоменд+ацією.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67149.wav|тод+і підожд+іть тр+ошки, зачек+айте.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67150.wav|із як+ими вістьми? пит+ає г+етьман.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67151.wav|стар+ий сарабр+ин міг л+егко пот+ішитися.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67152.wav|о, я, нещ+асний!|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67153.wav|кр+оки в сальоні.|tetiana
/home/dmytro_chaplinsky/RAD-TTS/datasets/tetiana/accept/67154.wav|щоб н+ашим ворог+ам бул+о т+яжко!|tetiana
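Note: each row above follows the `wav_path|transcript|speaker` filelist convention, with `+` apparently placed before the stressed vowel of the transcript. A minimal parsing sketch (the helper name is illustrative, not from the repo):

def parse_filelist_line(line):
    # wav_path|transcript|speaker; '+' precedes the stressed vowel
    wav_path, transcript, speaker = line.rstrip("\n").split("|")
    return wav_path, transcript, speaker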
loss.py
ADDED
@@ -0,0 +1,228 @@
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
import torch
import torch.nn as nn
from torch.nn import functional as F
from common import get_mask_from_lengths


def compute_flow_loss(
    z, log_det_W_list, log_s_list, n_elements, n_dims, mask, sigma=1.0
):
    log_det_W_total = 0.0
    for i, log_s in enumerate(log_s_list):
        if i == 0:
            log_s_total = torch.sum(log_s * mask)
            if len(log_det_W_list):
                log_det_W_total = log_det_W_list[i]
        else:
            log_s_total = log_s_total + torch.sum(log_s * mask)
            if len(log_det_W_list):
                log_det_W_total += log_det_W_list[i]

    if len(log_det_W_list):
        log_det_W_total *= n_elements

    z = z * mask
    prior_NLL = torch.sum(z * z) / (2 * sigma * sigma)

    loss = prior_NLL - log_s_total - log_det_W_total

    denom = n_elements * n_dims
    loss = loss / denom
    loss_prior = prior_NLL / denom
    return loss, loss_prior


def compute_regression_loss(x_hat, x, mask, name=False):
    x = x[:, None] if len(x.shape) == 2 else x  # add channel dim
    mask = mask[:, None] if len(mask.shape) == 2 else mask  # add channel dim
    assert len(x.shape) == len(mask.shape)

    x = x * mask
    x_hat = x_hat * mask

    if name == "vpred":
        loss = F.binary_cross_entropy_with_logits(x_hat, x, reduction="sum")
    else:
        loss = F.mse_loss(x_hat, x, reduction="sum")
    loss = loss / mask.sum()

    loss_dict = {"loss_{}".format(name): loss}

    return loss_dict


class AttributePredictionLoss(torch.nn.Module):
    def __init__(self, name, model_config, loss_weight, sigma=1.0):
        super(AttributePredictionLoss, self).__init__()
        self.name = name
        self.sigma = sigma
        self.model_name = model_config["name"]
        self.loss_weight = loss_weight
        self.n_group_size = 1
        if "n_group_size" in model_config["hparams"]:
            self.n_group_size = model_config["hparams"]["n_group_size"]

    def forward(self, model_output, lens):
        mask = get_mask_from_lengths(lens // self.n_group_size)
        mask = mask[:, None].float()
        loss_dict = {}
        if "z" in model_output:
            n_elements = lens.sum() // self.n_group_size
            n_dims = model_output["z"].size(1)

            loss, loss_prior = compute_flow_loss(
                model_output["z"],
                model_output["log_det_W_list"],
                model_output["log_s_list"],
                n_elements,
                n_dims,
                mask,
                self.sigma,
            )
            loss_dict = {
                "loss_{}".format(self.name): (loss, self.loss_weight),
                "loss_prior_{}".format(self.name): (loss_prior, 0.0),
            }
        elif "x_hat" in model_output:
            loss_dict = compute_regression_loss(
                model_output["x_hat"], model_output["x"], mask, self.name
            )
            for k, v in loss_dict.items():
                loss_dict[k] = (v, self.loss_weight)

        if len(loss_dict) == 0:
            raise Exception("loss not supported")

        return loss_dict


class AttentionCTCLoss(torch.nn.Module):
    def __init__(self, blank_logprob=-1):
        super(AttentionCTCLoss, self).__init__()
        self.log_softmax = torch.nn.LogSoftmax(dim=3)
        self.blank_logprob = blank_logprob
        self.CTCLoss = nn.CTCLoss(zero_infinity=True)

    def forward(self, attn_logprob, in_lens, out_lens):
        key_lens = in_lens
        query_lens = out_lens
        attn_logprob_padded = F.pad(
            input=attn_logprob, pad=(1, 0, 0, 0, 0, 0, 0, 0), value=self.blank_logprob
        )
        cost_total = 0.0
        for bid in range(attn_logprob.shape[0]):
            target_seq = torch.arange(1, key_lens[bid] + 1).unsqueeze(0)
            curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2)[
                : query_lens[bid], :, : key_lens[bid] + 1
            ]
            curr_logprob = self.log_softmax(curr_logprob[None])[0]
            ctc_cost = self.CTCLoss(
                curr_logprob,
                target_seq,
                input_lengths=query_lens[bid : bid + 1],
                target_lengths=key_lens[bid : bid + 1],
            )
            cost_total += ctc_cost
        cost = cost_total / attn_logprob.shape[0]
        return cost


class AttentionBinarizationLoss(torch.nn.Module):
    def __init__(self):
        super(AttentionBinarizationLoss, self).__init__()

    def forward(self, hard_attention, soft_attention):
        log_sum = torch.log(soft_attention[hard_attention == 1]).sum()
        return -log_sum / hard_attention.sum()


class RADTTSLoss(torch.nn.Module):
    def __init__(
        self,
        sigma=1.0,
        n_group_size=1,
        dur_model_config=None,
        f0_model_config=None,
        energy_model_config=None,
        vpred_model_config=None,
        loss_weights=None,
    ):
        super(RADTTSLoss, self).__init__()
        self.sigma = sigma
        self.n_group_size = n_group_size
        self.loss_weights = loss_weights
        self.attn_ctc_loss = AttentionCTCLoss(
            blank_logprob=loss_weights.get("blank_logprob", -1)
        )
        self.loss_fns = {}
        if dur_model_config is not None:
            self.loss_fns["duration_model_outputs"] = AttributePredictionLoss(
                "duration", dur_model_config, loss_weights["dur_loss_weight"]
            )

        if f0_model_config is not None:
            self.loss_fns["f0_model_outputs"] = AttributePredictionLoss(
                "f0", f0_model_config, loss_weights["f0_loss_weight"], sigma=1.0
            )

        if energy_model_config is not None:
            self.loss_fns["energy_model_outputs"] = AttributePredictionLoss(
                "energy", energy_model_config, loss_weights["energy_loss_weight"]
            )

        if vpred_model_config is not None:
            self.loss_fns["vpred_model_outputs"] = AttributePredictionLoss(
                "vpred", vpred_model_config, loss_weights["vpred_loss_weight"]
            )

    def forward(self, model_output, in_lens, out_lens):
        loss_dict = {}
        if len(model_output["z_mel"]):
            n_elements = out_lens.sum() // self.n_group_size
            mask = get_mask_from_lengths(out_lens // self.n_group_size)
            mask = mask[:, None].float()
            n_dims = model_output["z_mel"].size(1)
            loss_mel, loss_prior_mel = compute_flow_loss(
                model_output["z_mel"],
                model_output["log_det_W_list"],
                model_output["log_s_list"],
                n_elements,
                n_dims,
                mask,
                self.sigma,
            )
            loss_dict["loss_mel"] = (loss_mel, 1.0)  # loss, weight
            loss_dict["loss_prior_mel"] = (loss_prior_mel, 0.0)

        ctc_cost = self.attn_ctc_loss(model_output["attn_logprob"], in_lens, out_lens)
        loss_dict["loss_ctc"] = (ctc_cost, self.loss_weights["ctc_loss_weight"])

        for k in model_output:
            if k in self.loss_fns:
                if model_output[k] is not None and len(model_output[k]) > 0:
                    t_lens = in_lens if "dur" in k else out_lens
                    mout = model_output[k]
                    for loss_name, v in self.loss_fns[k](mout, t_lens).items():
                        loss_dict[loss_name] = v

        return loss_dict
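Note: a minimal usage sketch of RADTTSLoss. Every value in the returned dict is a (loss, weight) tuple, so the total training loss is their weighted sum. The model_config/train_config names below are assumptions about the surrounding training code, not taken from this repo:

criterion = RADTTSLoss(
    sigma=1.0,
    n_group_size=model_config["n_group_size"],          # assumed config layout
    dur_model_config=model_config["dur_model_config"],
    f0_model_config=model_config["f0_model_config"],
    energy_model_config=model_config["energy_model_config"],
    vpred_model_config=model_config["v_model_config"],
    loss_weights=train_config["loss_weights"],
)
loss_dict = criterion(model_output, in_lens, out_lens)   # model_output from RADTTS.forward
total_loss = sum(loss * weight for loss, weight in loss_dict.values())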
partialconv1d.py
ADDED
@@ -0,0 +1,77 @@
# Modified partialconv source code based on implementation from
# https://github.com/NVIDIA/partialconv/blob/master/models/partialconv2d.py
###############################################################################
# BSD 3-Clause License
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Author & Contact: Guilin Liu ([email protected])
###############################################################################

# Original Author & Contact: Guilin Liu ([email protected])
# Modified by Kevin Shih ([email protected])

import torch
import torch.nn.functional as F
from torch import nn


class PartialConv1d(nn.Conv1d):
    def __init__(self, *args, **kwargs):
        self.multi_channel = False
        self.return_mask = False
        super(PartialConv1d, self).__init__(*args, **kwargs)

        self.weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0])
        self.slide_winsize = (
            self.weight_maskUpdater.shape[1] * self.weight_maskUpdater.shape[2]
        )

        self.last_size = (None, None, None)
        self.update_mask = None
        self.mask_ratio = None

    @torch.jit.ignore
    def forward(self, input: torch.Tensor, mask_in: torch.Tensor = None):
        """
        input: standard input to a 1D conv
        mask_in: binary mask for valid values, same shape as input
        """
        assert len(input.shape) == 3
        # if a mask is input, or tensor shape changed, update mask ratio
        if mask_in is not None or self.last_size != tuple(input.shape):
            self.last_size = tuple(input.shape)
            with torch.no_grad():
                if self.weight_maskUpdater.type() != input.type():
                    self.weight_maskUpdater = self.weight_maskUpdater.to(input)
                if mask_in is None:
                    mask = torch.ones(1, 1, input.data.shape[2]).to(input)
                else:
                    mask = mask_in
                self.update_mask = F.conv1d(
                    mask,
                    self.weight_maskUpdater,
                    bias=None,
                    stride=self.stride,
                    padding=self.padding,
                    dilation=self.dilation,
                    groups=1,
                )
                # for mixed precision training, change 1e-8 to 1e-6
                self.mask_ratio = self.slide_winsize / (self.update_mask + 1e-6)
                self.update_mask = torch.clamp(self.update_mask, 0, 1)
                self.mask_ratio = torch.mul(self.mask_ratio, self.update_mask)
        raw_out = super(PartialConv1d, self).forward(
            torch.mul(input, mask) if mask_in is not None else input
        )
        if self.bias is not None:
            bias_view = self.bias.view(1, self.out_channels, 1)
            output = torch.mul(raw_out - bias_view, self.mask_ratio) + bias_view
            output = torch.mul(output, self.update_mask)
        else:
            output = torch.mul(raw_out, self.mask_ratio)

        if self.return_mask:
            return output, self.update_mask
        else:
            return output
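Note: a quick sketch of PartialConv1d on a zero-padded batch. Since multi_channel is hard-coded to False, the mask carries a single channel, and the output is re-weighted by how much of each sliding window covers valid samples (the shapes here are illustrative):

import torch

conv = PartialConv1d(80, 256, kernel_size=5, padding=2)
x = torch.randn(4, 80, 120)                     # B x C x T, zero-padded batch
mask = torch.zeros(4, 1, 120)                   # 1 = valid frame, 0 = padding
for b, length in enumerate([120, 100, 90, 60]):
    mask[b, :, :length] = 1.0
y = conv(x, mask_in=mask)                       # 4 x 256 x 120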
radam.py
ADDED
@@ -0,0 +1,114 @@
# Original source taken from https://github.com/LiyuanLucasLiu/RAdam
#
# Copyright 2019 Liyuan Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import torch

# pylint: disable=no-name-in-module
from torch.optim.optimizer import Optimizer


class RAdam(Optimizer):
    """RAdam optimizer"""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        """
        Init

        :param params: parameters to optimize
        :param lr: learning rate
        :param betas: beta
        :param eps: numerical precision
        :param weight_decay: weight decay weight
        """
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        self.buffer = [[None, None, None] for _ in range(10)]
        super().__init__(params, defaults)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError("RAdam does not support sparse gradients")

                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg"] = torch.zeros_like(p_data_fp32)
                    state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
                else:
                    state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
                    state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                beta1, beta2 = group["betas"]

                # keyword overloads: the positional (Number, Tensor) forms used
                # upstream are deprecated and removed in recent PyTorch
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state["step"] += 1
                buffered = self.buffer[int(state["step"] % 10)]
                if state["step"] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state["step"]
                    beta2_t = beta2 ** state["step"]
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = (
                            group["lr"]
                            * math.sqrt(
                                (1 - beta2_t)
                                * (N_sma - 4)
                                / (N_sma_max - 4)
                                * (N_sma - 2)
                                / N_sma
                                * N_sma_max
                                / (N_sma_max - 2)
                            )
                            / (1 - beta1 ** state["step"])
                        )
                    else:
                        step_size = group["lr"] / (1 - beta1 ** state["step"])
                    buffered[2] = step_size

                if group["weight_decay"] != 0:
                    p_data_fp32.add_(
                        p_data_fp32, alpha=-group["weight_decay"] * group["lr"]
                    )

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group["eps"])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size)

                p.data.copy_(p_data_fp32)

        return loss
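Note: RAdam implements the standard torch.optim.Optimizer interface, so it drops into the usual training loop (model, batch, and criterion below are placeholders):

optimizer = RAdam(model.parameters(), lr=1e-4, weight_decay=1e-6)

optimizer.zero_grad()
loss = criterion(model(batch))  # placeholder forward pass and loss
loss.backward()
optimizer.step()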
radtts.py
ADDED
@@ -0,0 +1,936 @@
1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
# SPDX-License-Identifier: MIT
|
3 |
+
#
|
4 |
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
5 |
+
# copy of this software and associated documentation files (the "Software"),
|
6 |
+
# to deal in the Software without restriction, including without limitation
|
7 |
+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
8 |
+
# and/or sell copies of the Software, and to permit persons to whom the
|
9 |
+
# Software is furnished to do so, subject to the following conditions:
|
10 |
+
#
|
11 |
+
# The above copyright notice and this permission notice shall be included in
|
12 |
+
# all copies or substantial portions of the Software.
|
13 |
+
#
|
14 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
17 |
+
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
19 |
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
20 |
+
# DEALINGS IN THE SOFTWARE.
|
21 |
+
import torch
|
22 |
+
from torch import nn
|
23 |
+
from common import Encoder, LengthRegulator, ConvAttention
|
24 |
+
from common import Invertible1x1ConvLUS, Invertible1x1Conv
|
25 |
+
from common import AffineTransformationLayer, LinearNorm, ExponentialClass
|
26 |
+
from common import get_mask_from_lengths
|
27 |
+
from attribute_prediction_model import get_attribute_prediction_model
|
28 |
+
from alignment import mas_width1 as mas
|
29 |
+
|
30 |
+
|
31 |
+
class FlowStep(nn.Module):
|
32 |
+
def __init__(
|
33 |
+
self,
|
34 |
+
n_mel_channels,
|
35 |
+
n_context_dim,
|
36 |
+
n_layers,
|
37 |
+
affine_model="simple_conv",
|
38 |
+
scaling_fn="exp",
|
39 |
+
matrix_decomposition="",
|
40 |
+
affine_activation="softplus",
|
41 |
+
use_partial_padding=False,
|
42 |
+
cache_inverse=False,
|
43 |
+
):
|
44 |
+
super(FlowStep, self).__init__()
|
45 |
+
if matrix_decomposition == "LUS":
|
46 |
+
self.invtbl_conv = Invertible1x1ConvLUS(
|
47 |
+
n_mel_channels, cache_inverse=cache_inverse
|
48 |
+
)
|
49 |
+
else:
|
50 |
+
self.invtbl_conv = Invertible1x1Conv(
|
51 |
+
n_mel_channels, cache_inverse=cache_inverse
|
52 |
+
)
|
53 |
+
|
54 |
+
self.affine_tfn = AffineTransformationLayer(
|
55 |
+
n_mel_channels,
|
56 |
+
n_context_dim,
|
57 |
+
n_layers,
|
58 |
+
affine_model=affine_model,
|
59 |
+
scaling_fn=scaling_fn,
|
60 |
+
affine_activation=affine_activation,
|
61 |
+
use_partial_padding=use_partial_padding,
|
62 |
+
)
|
63 |
+
|
64 |
+
def enable_inverse_cache(self):
|
65 |
+
self.invtbl_conv.cache_inverse = True
|
66 |
+
|
67 |
+
def forward(self, z, context, inverse=False, seq_lens=None):
|
68 |
+
if inverse: # for inference z-> mel
|
69 |
+
z = self.affine_tfn(z, context, inverse, seq_lens=seq_lens)
|
70 |
+
z = self.invtbl_conv(z, inverse)
|
71 |
+
return z
|
72 |
+
else: # training mel->z
|
73 |
+
z, log_det_W = self.invtbl_conv(z)
|
74 |
+
z, log_s = self.affine_tfn(z, context, seq_lens=seq_lens)
|
75 |
+
return z, log_det_W, log_s
|
76 |
+
|
77 |
+
|
78 |
+
class RADTTS(torch.nn.Module):
|
79 |
+
def __init__(
|
80 |
+
self,
|
81 |
+
n_speakers,
|
82 |
+
n_speaker_dim,
|
83 |
+
n_text,
|
84 |
+
n_text_dim,
|
85 |
+
n_flows,
|
86 |
+
n_conv_layers_per_step,
|
87 |
+
n_mel_channels,
|
88 |
+
n_hidden,
|
89 |
+
mel_encoder_n_hidden,
|
90 |
+
dummy_speaker_embedding,
|
91 |
+
n_early_size,
|
92 |
+
n_early_every,
|
93 |
+
n_group_size,
|
94 |
+
affine_model,
|
95 |
+
dur_model_config,
|
96 |
+
f0_model_config,
|
97 |
+
energy_model_config,
|
98 |
+
v_model_config=None,
|
99 |
+
include_modules="dec",
|
100 |
+
scaling_fn="exp",
|
101 |
+
matrix_decomposition="",
|
102 |
+
learn_alignments=False,
|
103 |
+
affine_activation="softplus",
|
104 |
+
attn_use_CTC=True,
|
105 |
+
use_speaker_emb_for_alignment=False,
|
106 |
+
use_context_lstm=False,
|
107 |
+
context_lstm_norm=None,
|
108 |
+
text_encoder_lstm_norm=None,
|
109 |
+
n_f0_dims=0,
|
110 |
+
n_energy_avg_dims=0,
|
111 |
+
context_lstm_w_f0_and_energy=True,
|
112 |
+
use_first_order_features=False,
|
113 |
+
unvoiced_bias_activation="",
|
114 |
+
ap_pred_log_f0=False,
|
115 |
+
**kwargs,
|
116 |
+
):
|
117 |
+
super(RADTTS, self).__init__()
|
118 |
+
assert n_early_size % 2 == 0
|
119 |
+
self.do_mel_descaling = kwargs.get("do_mel_descaling", True)
|
120 |
+
self.n_mel_channels = n_mel_channels
|
121 |
+
self.n_f0_dims = n_f0_dims # >= 1 to trains with f0
|
122 |
+
self.n_energy_avg_dims = n_energy_avg_dims # >= 1 trains with energy
|
123 |
+
self.decoder_use_partial_padding = kwargs.get(
|
124 |
+
"decoder_use_partial_padding", True
|
125 |
+
)
|
126 |
+
self.n_speaker_dim = n_speaker_dim
|
127 |
+
assert self.n_speaker_dim % 2 == 0
|
128 |
+
self.speaker_embedding = torch.nn.Embedding(n_speakers, self.n_speaker_dim)
|
129 |
+
self.embedding = torch.nn.Embedding(n_text, n_text_dim)
|
130 |
+
self.flows = torch.nn.ModuleList()
|
131 |
+
self.encoder = Encoder(
|
132 |
+
encoder_embedding_dim=n_text_dim,
|
133 |
+
norm_fn=nn.InstanceNorm1d,
|
134 |
+
lstm_norm_fn=text_encoder_lstm_norm,
|
135 |
+
)
|
136 |
+
self.dummy_speaker_embedding = dummy_speaker_embedding
|
137 |
+
self.learn_alignments = learn_alignments
|
138 |
+
self.affine_activation = affine_activation
|
139 |
+
self.include_modules = include_modules
|
140 |
+
self.attn_use_CTC = bool(attn_use_CTC)
|
141 |
+
self.use_speaker_emb_for_alignment = use_speaker_emb_for_alignment
|
142 |
+
self.use_context_lstm = bool(use_context_lstm)
|
143 |
+
self.context_lstm_norm = context_lstm_norm
|
144 |
+
self.context_lstm_w_f0_and_energy = context_lstm_w_f0_and_energy
|
145 |
+
self.length_regulator = LengthRegulator()
|
146 |
+
self.use_first_order_features = bool(use_first_order_features)
|
147 |
+
self.decoder_use_unvoiced_bias = kwargs.get("decoder_use_unvoiced_bias", True)
|
148 |
+
self.ap_pred_log_f0 = ap_pred_log_f0
|
149 |
+
self.ap_use_unvoiced_bias = kwargs.get("ap_use_unvoiced_bias", True)
|
150 |
+
self.attn_straight_through_estimator = kwargs.get(
|
151 |
+
"attn_straight_through_estimator", False
|
152 |
+
)
|
153 |
+
if "atn" in include_modules or "dec" in include_modules:
|
154 |
+
if self.learn_alignments:
|
155 |
+
if self.use_speaker_emb_for_alignment:
|
156 |
+
self.attention = ConvAttention(
|
157 |
+
n_mel_channels, n_text_dim + self.n_speaker_dim
|
158 |
+
)
|
159 |
+
else:
|
160 |
+
self.attention = ConvAttention(n_mel_channels, n_text_dim)
|
161 |
+
|
162 |
+
self.n_flows = n_flows
|
163 |
+
self.n_group_size = n_group_size
|
164 |
+
|
165 |
+
n_flowstep_cond_dims = (
|
166 |
+
self.n_speaker_dim
|
167 |
+
+ (n_text_dim + n_f0_dims + n_energy_avg_dims) * n_group_size
|
168 |
+
)
|
169 |
+
|
170 |
+
if self.use_context_lstm:
|
171 |
+
n_in_context_lstm = self.n_speaker_dim + n_text_dim * n_group_size
|
172 |
+
n_context_lstm_hidden = int(
|
173 |
+
(self.n_speaker_dim + n_text_dim * n_group_size) / 2
|
174 |
+
)
|
175 |
+
|
176 |
+
if self.context_lstm_w_f0_and_energy:
|
177 |
+
n_in_context_lstm = n_f0_dims + n_energy_avg_dims + n_text_dim
|
178 |
+
n_in_context_lstm *= n_group_size
|
179 |
+
n_in_context_lstm += self.n_speaker_dim
|
180 |
+
|
181 |
+
n_context_hidden = n_f0_dims + n_energy_avg_dims + n_text_dim
|
182 |
+
n_context_hidden = n_context_hidden * n_group_size / 2
|
183 |
+
n_context_hidden = self.n_speaker_dim + n_context_hidden
|
184 |
+
n_context_hidden = int(n_context_hidden)
|
185 |
+
|
186 |
+
n_flowstep_cond_dims = (
|
187 |
+
self.n_speaker_dim + n_text_dim * n_group_size
|
188 |
+
)
|
189 |
+
|
190 |
+
self.context_lstm = torch.nn.LSTM(
|
191 |
+
input_size=n_in_context_lstm,
|
192 |
+
hidden_size=n_context_lstm_hidden,
|
193 |
+
num_layers=1,
|
194 |
+
batch_first=True,
|
195 |
+
bidirectional=True,
|
196 |
+
)
|
197 |
+
|
198 |
+
if context_lstm_norm is not None:
|
199 |
+
if "spectral" in context_lstm_norm:
|
200 |
+
print("Applying spectral norm to context encoder LSTM")
|
201 |
+
lstm_norm_fn_pntr = torch.nn.utils.spectral_norm
|
202 |
+
elif "weight" in context_lstm_norm:
|
203 |
+
print("Applying weight norm to context encoder LSTM")
|
204 |
+
lstm_norm_fn_pntr = torch.nn.utils.weight_norm
|
205 |
+
|
206 |
+
self.context_lstm = lstm_norm_fn_pntr(
|
207 |
+
self.context_lstm, "weight_hh_l0"
|
208 |
+
)
|
209 |
+
self.context_lstm = lstm_norm_fn_pntr(
|
210 |
+
self.context_lstm, "weight_hh_l0_reverse"
|
211 |
+
)
|
212 |
+
|
213 |
+
if self.n_group_size > 1:
|
214 |
+
self.unfold_params = {
|
215 |
+
"kernel_size": (n_group_size, 1),
|
216 |
+
"stride": n_group_size,
|
217 |
+
"padding": 0,
|
218 |
+
"dilation": 1,
|
219 |
+
}
|
220 |
+
self.unfold = nn.Unfold(**self.unfold_params)
|
221 |
+
|
222 |
+
self.exit_steps = []
|
223 |
+
self.n_early_size = n_early_size
|
224 |
+
n_mel_channels = n_mel_channels * n_group_size
|
225 |
+
|
226 |
+
for i in range(self.n_flows):
|
227 |
+
if i > 0 and i % n_early_every == 0: # early exitting
|
228 |
+
n_mel_channels -= self.n_early_size
|
229 |
+
self.exit_steps.append(i)
|
230 |
+
|
231 |
+
self.flows.append(
|
232 |
+
FlowStep(
|
233 |
+
n_mel_channels,
|
234 |
+
n_flowstep_cond_dims,
|
235 |
+
n_conv_layers_per_step,
|
236 |
+
affine_model,
|
237 |
+
scaling_fn,
|
238 |
+
matrix_decomposition,
|
239 |
+
affine_activation=affine_activation,
|
240 |
+
use_partial_padding=self.decoder_use_partial_padding,
|
241 |
+
)
|
242 |
+
)
|
243 |
+
|
244 |
+
if "dpm" in include_modules:
|
245 |
+
dur_model_config["hparams"]["n_speaker_dim"] = n_speaker_dim
|
246 |
+
self.dur_pred_layer = get_attribute_prediction_model(dur_model_config)
|
247 |
+
|
248 |
+
self.use_unvoiced_bias = False
|
249 |
+
self.use_vpred_module = False
|
250 |
+
self.ap_use_voiced_embeddings = kwargs.get("ap_use_voiced_embeddings", True)
|
251 |
+
|
252 |
+
if self.decoder_use_unvoiced_bias or self.ap_use_unvoiced_bias:
|
253 |
+
assert unvoiced_bias_activation in {"relu", "exp"}
|
254 |
+
self.use_unvoiced_bias = True
|
255 |
+
if unvoiced_bias_activation == "relu":
|
256 |
+
unvbias_nonlin = nn.ReLU()
|
257 |
+
elif unvoiced_bias_activation == "exp":
|
258 |
+
unvbias_nonlin = ExponentialClass()
|
259 |
+
else:
|
260 |
+
exit(1) # we won't reach here anyway due to the assertion
|
261 |
+
self.unvoiced_bias_module = nn.Sequential(
|
262 |
+
LinearNorm(n_text_dim, 1), unvbias_nonlin
|
263 |
+
)
|
264 |
+
|
265 |
+
# all situations in which the vpred module is necessary
|
266 |
+
if (
|
267 |
+
self.ap_use_voiced_embeddings
|
268 |
+
or self.use_unvoiced_bias
|
269 |
+
or "vpred" in include_modules
|
270 |
+
):
|
271 |
+
self.use_vpred_module = True
|
272 |
+
|
273 |
+
if self.use_vpred_module:
|
274 |
+
v_model_config["hparams"]["n_speaker_dim"] = n_speaker_dim
|
275 |
+
self.v_pred_module = get_attribute_prediction_model(v_model_config)
|
276 |
+
# 4 embeddings, first two are scales, second two are biases
|
277 |
+
if self.ap_use_voiced_embeddings:
|
278 |
+
self.v_embeddings = torch.nn.Embedding(4, n_text_dim)
|
279 |
+
|
280 |
+
if "apm" in include_modules:
|
281 |
+
f0_model_config["hparams"]["n_speaker_dim"] = n_speaker_dim
|
282 |
+
energy_model_config["hparams"]["n_speaker_dim"] = n_speaker_dim
|
283 |
+
if self.use_first_order_features:
|
284 |
+
f0_model_config["hparams"]["n_in_dim"] = 2
|
285 |
+
energy_model_config["hparams"]["n_in_dim"] = 2
|
286 |
+
if (
|
287 |
+
"spline_flow_params" in f0_model_config["hparams"]
|
288 |
+
and f0_model_config["hparams"]["spline_flow_params"] is not None
|
289 |
+
):
|
290 |
+
f0_model_config["hparams"]["spline_flow_params"][
|
291 |
+
"n_in_channels"
|
292 |
+
] = 2
|
293 |
+
if (
|
294 |
+
"spline_flow_params" in energy_model_config["hparams"]
|
295 |
+
and energy_model_config["hparams"]["spline_flow_params"] is not None
|
296 |
+
):
|
297 |
+
energy_model_config["hparams"]["spline_flow_params"][
|
298 |
+
"n_in_channels"
|
299 |
+
] = 2
|
300 |
+
else:
|
301 |
+
if (
|
302 |
+
"spline_flow_params" in f0_model_config["hparams"]
|
303 |
+
and f0_model_config["hparams"]["spline_flow_params"] is not None
|
304 |
+
):
|
305 |
+
f0_model_config["hparams"]["spline_flow_params"][
|
306 |
+
"n_in_channels"
|
307 |
+
] = f0_model_config["hparams"]["n_in_dim"]
|
308 |
+
if (
|
309 |
+
"spline_flow_params" in energy_model_config["hparams"]
|
310 |
+
and energy_model_config["hparams"]["spline_flow_params"] is not None
|
311 |
+
):
|
312 |
+
energy_model_config["hparams"]["spline_flow_params"][
|
313 |
+
"n_in_channels"
|
314 |
+
] = energy_model_config["hparams"]["n_in_dim"]
|
315 |
+
|
316 |
+
self.f0_pred_module = get_attribute_prediction_model(f0_model_config)
|
317 |
+
self.energy_pred_module = get_attribute_prediction_model(
|
318 |
+
energy_model_config
|
319 |
+
)
|
320 |
+
|
321 |
+
def is_attribute_unconditional(self):
|
322 |
+
"""
|
323 |
+
returns true if the decoder is conditioned on neither energy nor F0
|
324 |
+
"""
|
325 |
+
return self.n_f0_dims == 0 and self.n_energy_avg_dims == 0
|
326 |
+
|
327 |
+
def encode_speaker(self, spk_ids):
|
328 |
+
spk_ids = spk_ids * 0 if self.dummy_speaker_embedding else spk_ids
|
329 |
+
spk_vecs = self.speaker_embedding(spk_ids)
|
330 |
+
return spk_vecs
|
331 |
+
|
332 |
+
def encode_text(self, text, in_lens):
|
333 |
+
# text_embeddings: b x len_text x n_text_dim
|
334 |
+
text_embeddings = self.embedding(text).transpose(1, 2)
|
335 |
+
# text_enc: b x n_text_dim x encoder_dim (512)
|
336 |
+
if in_lens is None:
|
337 |
+
text_enc = self.encoder.infer(text_embeddings).transpose(1, 2)
|
338 |
+
else:
|
339 |
+
text_enc = self.encoder(text_embeddings, in_lens).transpose(1, 2)
|
340 |
+
|
341 |
+
return text_enc, text_embeddings
|
342 |
+
|
343 |
+
def preprocess_context(
|
344 |
+
self, context, speaker_vecs, out_lens=None, f0=None, energy_avg=None
|
345 |
+
):
|
346 |
+
if self.n_group_size > 1:
|
347 |
+
# unfolding zero-padded values
|
348 |
+
context = self.unfold(context.unsqueeze(-1))
|
349 |
+
if f0 is not None:
|
350 |
+
f0 = self.unfold(f0[:, None, :, None])
|
351 |
+
if energy_avg is not None:
|
352 |
+
energy_avg = self.unfold(energy_avg[:, None, :, None])
|
353 |
+
speaker_vecs = speaker_vecs[..., None].expand(-1, -1, context.shape[2])
|
354 |
+
context_w_spkvec = torch.cat((context, speaker_vecs), 1)
|
355 |
+
|
356 |
+
if self.use_context_lstm:
|
357 |
+
if self.context_lstm_w_f0_and_energy:
|
358 |
+
if f0 is not None:
|
359 |
+
context_w_spkvec = torch.cat((context_w_spkvec, f0), 1)
|
360 |
+
|
361 |
+
if energy_avg is not None:
|
362 |
+
context_w_spkvec = torch.cat((context_w_spkvec, energy_avg), 1)
|
363 |
+
|
364 |
+
unfolded_out_lens = (out_lens // self.n_group_size).long().cpu()
|
365 |
+
unfolded_out_lens_packed = nn.utils.rnn.pack_padded_sequence(
|
366 |
+
context_w_spkvec.transpose(1, 2),
|
367 |
+
unfolded_out_lens,
|
368 |
+
batch_first=True,
|
369 |
+
enforce_sorted=False,
|
370 |
+
)
|
371 |
+
self.context_lstm.flatten_parameters()
|
372 |
+
context_lstm_packed_output, _ = self.context_lstm(unfolded_out_lens_packed)
|
373 |
+
context_lstm_padded_output, _ = nn.utils.rnn.pad_packed_sequence(
|
374 |
+
context_lstm_packed_output, batch_first=True
|
375 |
+
)
|
376 |
+
context_w_spkvec = context_lstm_padded_output.transpose(1, 2)
|
377 |
+
|
378 |
+
if not self.context_lstm_w_f0_and_energy:
|
379 |
+
if f0 is not None:
|
380 |
+
context_w_spkvec = torch.cat((context_w_spkvec, f0), 1)
|
381 |
+
|
382 |
+
if energy_avg is not None:
|
383 |
+
context_w_spkvec = torch.cat((context_w_spkvec, energy_avg), 1)
|
384 |
+
|
385 |
+
return context_w_spkvec
|
386 |
+
|
387 |
+
def enable_inverse_cache(self):
|
388 |
+
for flow_step in self.flows:
|
389 |
+
flow_step.enable_inverse_cache()
|
390 |
+
|
391 |
+
def fold(self, mel):
|
392 |
+
"""Inverse of the self.unfold(mel.unsqueeze(-1)) operation used for the
|
393 |
+
grouping or "squeeze" operation on input
|
394 |
+
|
395 |
+
Args:
|
396 |
+
mel: B x C x T tensor of temporal data
|
397 |
+
"""
|
398 |
+
mel = nn.functional.fold(
|
399 |
+
mel, output_size=(mel.shape[2] * self.n_group_size, 1), **self.unfold_params
|
400 |
+
).squeeze(-1)
|
401 |
+
return mel
|
402 |
+
|
403 |
+
def binarize_attention(self, attn, in_lens, out_lens):
|
404 |
+
"""For training purposes only. Binarizes attention with MAS. These will
|
405 |
+
no longer recieve a gradient
|
406 |
+
Args:
|
407 |
+
attn: B x 1 x max_mel_len x max_text_len
|
408 |
+
"""
|
409 |
+
b_size = attn.shape[0]
|
410 |
+
with torch.no_grad():
|
411 |
+
attn_cpu = attn.data.cpu().numpy()
|
412 |
+
attn_out = torch.zeros_like(attn)
|
413 |
+
for ind in range(b_size):
|
414 |
+
hard_attn = mas(attn_cpu[ind, 0, : out_lens[ind], : in_lens[ind]])
|
415 |
+
attn_out[ind, 0, : out_lens[ind], : in_lens[ind]] = torch.tensor(
|
416 |
+
hard_attn, device=attn.get_device()
|
417 |
+
)
|
418 |
+
return attn_out
|
419 |
+
|
420 |
+
def get_first_order_features(self, feats, out_lens, dilation=1):
|
421 |
+
"""
|
422 |
+
feats: b x max_length
|
423 |
+
out_lens: b-dim
|
424 |
+
"""
|
425 |
+
# add an extra column
|
426 |
+
feats_extended_R = torch.cat(
|
427 |
+
(feats, torch.zeros_like(feats[:, 0:dilation])), dim=1
|
428 |
+
)
|
429 |
+
feats_extended_L = torch.cat(
|
430 |
+
(torch.zeros_like(feats[:, 0:dilation]), feats), dim=1
|
431 |
+
)
|
432 |
+
dfeats_R = feats_extended_R[:, dilation:] - feats
|
433 |
+
dfeats_L = feats - feats_extended_L[:, 0:-dilation]
|
434 |
+
|
435 |
+
return (dfeats_R + dfeats_L) * 0.5
|
436 |
+
|
437 |
+
def apply_voice_mask_to_text(self, text_enc, voiced_mask):
|
438 |
+
"""
|
439 |
+
text_enc: b x C x N
|
440 |
+
voiced_mask: b x N
|
441 |
+
"""
|
442 |
+
voiced_mask = voiced_mask.unsqueeze(1)
|
443 |
+
voiced_embedding_s = self.v_embeddings.weight[0:1, :, None]
|
444 |
+
unvoiced_embedding_s = self.v_embeddings.weight[1:2, :, None]
|
445 |
+
voiced_embedding_b = self.v_embeddings.weight[2:3, :, None]
|
446 |
+
unvoiced_embedding_b = self.v_embeddings.weight[3:4, :, None]
|
447 |
+
scale = torch.sigmoid(
|
448 |
+
voiced_embedding_s * voiced_mask + unvoiced_embedding_s * (1 - voiced_mask)
|
449 |
+
)
|
450 |
+
bias = 0.1 * torch.tanh(
|
451 |
+
voiced_embedding_b * voiced_mask + unvoiced_embedding_b * (1 - voiced_mask)
|
452 |
+
)
|
453 |
+
return text_enc * scale + bias
|
454 |
+
|
455 |
+
def forward(
|
456 |
+
self,
|
457 |
+
mel,
|
458 |
+
speaker_ids,
|
459 |
+
text,
|
460 |
+
in_lens,
|
461 |
+
out_lens,
|
462 |
+
binarize_attention=False,
|
463 |
+
attn_prior=None,
|
464 |
+
f0=None,
|
465 |
+
energy_avg=None,
|
466 |
+
voiced_mask=None,
|
467 |
+
p_voiced=None,
|
468 |
+
):
|
469 |
+
speaker_vecs = self.encode_speaker(speaker_ids)
|
470 |
+
text_enc, text_embeddings = self.encode_text(text, in_lens)
|
471 |
+
|
472 |
+
log_s_list, log_det_W_list, z_mel = [], [], []
|
473 |
+
attn = None
|
474 |
+
attn_soft = None
|
475 |
+
attn_hard = None
|
476 |
+
if "atn" in self.include_modules or "dec" in self.include_modules:
|
477 |
+
# make sure to do the alignments before folding
|
478 |
+
attn_mask = get_mask_from_lengths(in_lens)[..., None] == 0
|
479 |
+
|
480 |
+
text_embeddings_for_attn = text_embeddings
|
481 |
+
if self.use_speaker_emb_for_alignment:
|
482 |
+
speaker_vecs_expd = speaker_vecs[:, :, None].expand(
|
483 |
+
-1, -1, text_embeddings.shape[2]
|
484 |
+
)
|
485 |
+
text_embeddings_for_attn = torch.cat(
|
486 |
+
(text_embeddings_for_attn, speaker_vecs_expd.detach()), 1
|
487 |
+
)
|
488 |
+
|
489 |
+
# attn_mask shld be 1 for unsd t-steps in text_enc_w_spkvec tensor
|
490 |
+
attn_soft, attn_logprob = self.attention(
|
491 |
+
mel,
|
492 |
+
text_embeddings_for_attn,
|
493 |
+
out_lens,
|
494 |
+
attn_mask,
|
495 |
+
key_lens=in_lens,
|
496 |
+
attn_prior=attn_prior,
|
497 |
+
)
|
498 |
+
|
499 |
+
if binarize_attention:
|
500 |
+
attn = self.binarize_attention(attn_soft, in_lens, out_lens)
|
501 |
+
attn_hard = attn
|
502 |
+
if self.attn_straight_through_estimator:
|
503 |
+
attn_hard = attn_soft + (attn_hard - attn_soft).detach()
|
504 |
+
else:
|
505 |
+
attn = attn_soft
|
506 |
+
|
507 |
+
context = torch.bmm(text_enc, attn.squeeze(1).transpose(1, 2))
|
508 |
+
|
509 |
+
f0_bias = 0
|
510 |
+
# unvoiced bias forward pass
|
511 |
+
if self.use_unvoiced_bias:
|
512 |
+
f0_bias = self.unvoiced_bias_module(context.permute(0, 2, 1))
|
513 |
+
f0_bias = -f0_bias[..., 0]
|
514 |
+
f0_bias = f0_bias * (~voiced_mask.bool()).float()
|
515 |
+
|
516 |
+
# mel decoder forward pass
|
517 |
+
if "dec" in self.include_modules:
|
518 |
+
if self.n_group_size > 1:
|
519 |
+
# might truncate some frames at the end, but that's ok
|
520 |
+
# sometimes referred to as the "squeeeze" operation
|
521 |
+
# invert this by calling self.fold(mel_or_z)
|
522 |
+
mel = self.unfold(mel.unsqueeze(-1))
|
523 |
+
z_out = []
|
524 |
+
# where context is folded
|
525 |
+
# mask f0 in case values are interpolated
|
526 |
+
|
527 |
+
if f0 is None:
|
528 |
+
f0_aug = None
|
529 |
+
else:
|
530 |
+
if self.decoder_use_unvoiced_bias:
|
531 |
+
f0_aug = f0 * voiced_mask + f0_bias
|
532 |
+
else:
|
533 |
+
f0_aug = f0 * voiced_mask
|
534 |
+
|
535 |
+
context_w_spkvec = self.preprocess_context(
|
536 |
+
context, speaker_vecs, out_lens, f0_aug, energy_avg
|
537 |
+
)
|
538 |
+
|
539 |
+
log_s_list, log_det_W_list, z_out = [], [], []
|
540 |
+
unfolded_seq_lens = out_lens // self.n_group_size
|
541 |
+
for i, flow_step in enumerate(self.flows):
|
542 |
+
if i in self.exit_steps:
|
543 |
+
z = mel[:, : self.n_early_size]
|
544 |
+
z_out.append(z)
|
545 |
+
mel = mel[:, self.n_early_size :]
|
546 |
+
mel, log_det_W, log_s = flow_step(
|
547 |
+
mel, context_w_spkvec, seq_lens=unfolded_seq_lens
|
548 |
+
)
|
549 |
+
log_s_list.append(log_s)
|
550 |
+
log_det_W_list.append(log_det_W)
|
551 |
+
|
552 |
+
z_out.append(mel)
|
553 |
+
z_mel = torch.cat(z_out, 1)
|
554 |
+
|
555 |
+
# duration predictor forward pass
|
556 |
+
duration_model_outputs = None
|
557 |
+
if "dpm" in self.include_modules:
|
558 |
+
if attn_hard is None:
|
559 |
+
attn_hard = self.binarize_attention(attn_soft, in_lens, out_lens)
|
560 |
+
|
561 |
+
# convert hard attention to durations
|
562 |
+
attn_hard_reduced = attn_hard.sum(2)[:, 0, :]
|
563 |
+
duration_model_outputs = self.dur_pred_layer(
|
564 |
+
torch.detach(text_enc),
|
565 |
+
torch.detach(speaker_vecs),
|
566 |
+
torch.detach(attn_hard_reduced.float()),
|
567 |
+
in_lens,
|
568 |
+
)
|
569 |
+
|
570 |
+
# f0, energy, vpred predictors forward pass
|
571 |
+
f0_model_outputs = None
|
572 |
+
energy_model_outputs = None
|
573 |
+
vpred_model_outputs = None
|
574 |
+
if "apm" in self.include_modules:
|
575 |
+
if attn_hard is None:
|
576 |
+
attn_hard = self.binarize_attention(attn_soft, in_lens, out_lens)
|
577 |
+
|
578 |
+
# convert hard attention to durations
|
579 |
+
if binarize_attention:
|
580 |
+
text_enc_time_expanded = context.clone()
|
581 |
+
else:
|
582 |
+
text_enc_time_expanded = torch.bmm(
|
583 |
+
text_enc, attn_hard.squeeze(1).transpose(1, 2)
|
584 |
+
)
|
585 |
+
|
586 |
+
if self.use_vpred_module:
|
587 |
+
# unvoiced bias requires voiced mask prediction
|
588 |
+
vpred_model_outputs = self.v_pred_module(
|
589 |
+
torch.detach(text_enc_time_expanded),
|
590 |
+
torch.detach(speaker_vecs),
|
591 |
+
torch.detach(voiced_mask),
|
592 |
+
out_lens,
|
593 |
+
)
|
594 |
+
|
595 |
+
# affine transform context using voiced mask
|
596 |
+
if self.ap_use_voiced_embeddings:
|
597 |
+
text_enc_time_expanded = self.apply_voice_mask_to_text(
|
598 |
+
text_enc_time_expanded, voiced_mask
|
599 |
+
)
|
600 |
+
|
601 |
+
# whether to use the unvoiced bias in the attribute predictor
|
602 |
+
# circumvent in-place modification
|
603 |
+
f0_target = f0.clone()
|
604 |
+
if self.ap_use_unvoiced_bias:
|
605 |
+
f0_target = torch.detach(f0_target * voiced_mask + f0_bias)
|
606 |
+
else:
|
607 |
+
f0_target = torch.detach(f0_target)
|
608 |
+
|
609 |
+
# fit to log f0 in f0 predictor
|
610 |
+
f0_target[voiced_mask.bool()] = torch.log(f0_target[voiced_mask.bool()])
|
611 |
+
f0_target = f0_target / 6 # scale to ~ [0, 1] in log space
|
612 |
+
energy_avg = energy_avg * 2 - 1 # scale to ~ [-1, 1]
|
613 |
+
|
614 |
+
if self.use_first_order_features:
|
615 |
+
df0 = self.get_first_order_features(f0_target, out_lens)
|
616 |
+
denergy_avg = self.get_first_order_features(energy_avg, out_lens)
|
617 |
+
|
618 |
+
f0_voiced = torch.cat((f0_target[:, None], df0[:, None]), dim=1)
|
619 |
+
energy_avg = torch.cat(
|
620 |
+
(energy_avg[:, None], denergy_avg[:, None]), dim=1
|
621 |
+
)
|
622 |
+
|
623 |
+
f0_voiced = f0_voiced * 3 # scale to ~ 1 std
|
624 |
+
energy_avg = energy_avg * 3 # scale to ~ 1 std
|
625 |
+
else:
|
626 |
+
f0_voiced = f0_target * 2 # scale to ~ 1 std
|
627 |
+
energy_avg = energy_avg * 1.4 # scale to ~ 1 std
|
628 |
+
|
629 |
+
f0_model_outputs = self.f0_pred_module(
|
630 |
+
text_enc_time_expanded, torch.detach(speaker_vecs), f0_voiced, out_lens
|
631 |
+
)
|
632 |
+
|
633 |
+
energy_model_outputs = self.energy_pred_module(
|
634 |
+
text_enc_time_expanded, torch.detach(speaker_vecs), energy_avg, out_lens
|
635 |
+
)
|
636 |
+
|
637 |
+
outputs = {
|
638 |
+
"z_mel": z_mel,
|
639 |
+
"log_det_W_list": log_det_W_list,
|
640 |
+
"log_s_list": log_s_list,
|
641 |
+
"duration_model_outputs": duration_model_outputs,
|
642 |
+
"f0_model_outputs": f0_model_outputs,
|
643 |
+
"energy_model_outputs": energy_model_outputs,
|
644 |
+
"vpred_model_outputs": vpred_model_outputs,
|
645 |
+
"attn_soft": attn_soft,
|
646 |
+
"attn": attn,
|
647 |
+
"text_embeddings": text_embeddings,
|
648 |
+
"attn_logprob": attn_logprob,
|
649 |
+
}
|
650 |
+
|
651 |
+
return outputs
|
652 |
+
|
653 |
+
def infer(
|
654 |
+
self,
|
655 |
+
speaker_id,
|
656 |
+
text,
|
657 |
+
sigma,
|
658 |
+
sigma_dur=0.8,
|
659 |
+
sigma_f0=0.8,
|
660 |
+
sigma_energy=0.8,
|
661 |
+
token_dur_scaling=1.0,
|
662 |
+
token_duration_max=100,
|
663 |
+
speaker_id_text=None,
|
664 |
+
speaker_id_attributes=None,
|
665 |
+
dur=None,
|
666 |
+
f0=None,
|
667 |
+
energy_avg=None,
|
668 |
+
voiced_mask=None,
|
669 |
+
f0_mean=0.0,
|
670 |
+
f0_std=0.0,
|
671 |
+
energy_mean=0.0,
|
672 |
+
energy_std=0.0,
|
673 |
+
use_cuda=False,
|
674 |
+
):
|
675 |
+
batch_size = text.shape[0]
|
676 |
+
n_tokens = text.shape[1]
|
677 |
+
spk_vec = self.encode_speaker(speaker_id)
|
678 |
+
spk_vec_text, spk_vec_attributes = spk_vec, spk_vec
|
679 |
+
if speaker_id_text is not None:
|
680 |
+
spk_vec_text = self.encode_speaker(speaker_id_text)
|
681 |
+
if speaker_id_attributes is not None:
|
682 |
+
spk_vec_attributes = self.encode_speaker(speaker_id_attributes)
|
683 |
+
|
684 |
+
txt_enc, txt_emb = self.encode_text(text, None)
|
685 |
+
|
686 |
+
if dur is None:
|
687 |
+
# get token durations
|
688 |
+
if use_cuda:
|
689 |
+
z_dur = torch.cuda.FloatTensor(batch_size, 1, n_tokens)
|
690 |
+
else:
|
691 |
+
z_dur = torch.FloatTensor(batch_size, 1, n_tokens)
|
692 |
+
|
693 |
+
z_dur = z_dur.normal_() * sigma_dur
|
694 |
+
|
695 |
+
dur = self.dur_pred_layer.infer(z_dur, txt_enc, spk_vec_text)
|
696 |
+
if dur.shape[-1] < txt_enc.shape[-1]:
|
697 |
+
to_pad = txt_enc.shape[-1] - dur.shape[2]
|
698 |
+
pad_fn = nn.ReplicationPad1d((0, to_pad))
|
699 |
+
dur = pad_fn(dur)
|
700 |
+
dur = dur[:, 0]
|
701 |
+
dur = dur.clamp(0, token_duration_max)
|
702 |
+
dur = dur * token_dur_scaling if token_dur_scaling > 0 else dur
|
703 |
+
dur = (dur + 0.5).floor().int()
|
704 |
+
|
705 |
+
out_lens = dur.sum(1).long().cpu() if dur.shape[0] != 1 else [dur.sum(1)]
|
706 |
+
max_n_frames = max(out_lens)
|
707 |
+
|
708 |
+
out_lens = torch.LongTensor(out_lens).to(txt_enc.device)
|
709 |
+
|
710 |
+
# get attributes f0, energy, vpred, etc)
|
711 |
+
txt_enc_time_expanded = self.length_regulator(
|
712 |
+
txt_enc.transpose(1, 2), dur
|
713 |
+
).transpose(1, 2)
|
714 |
+
|
715 |
+
if not self.is_attribute_unconditional():
|
716 |
+
# if explicitly modeling attributes
|
717 |
+
if voiced_mask is None:
|
718 |
+
if self.use_vpred_module:
|
719 |
+
# get logits
|
720 |
+
voiced_mask = self.v_pred_module.infer(
|
721 |
+
None, txt_enc_time_expanded, spk_vec_attributes
|
722 |
+
)
|
723 |
+
voiced_mask = torch.sigmoid(voiced_mask[:, 0]) > 0.5
|
724 |
+
voiced_mask = voiced_mask.float()
|
725 |
+
|
726 |
+
ap_txt_enc_time_expanded = txt_enc_time_expanded
|
727 |
+
# voice mask augmentation only used for attribute prediction
|
728 |
+
if self.ap_use_voiced_embeddings:
|
729 |
+
ap_txt_enc_time_expanded = self.apply_voice_mask_to_text(
|
730 |
+
txt_enc_time_expanded, voiced_mask
|
731 |
+
)
|
732 |
+
|
733 |
+
f0_bias = 0
|
734 |
+
# unvoiced bias forward pass
|
735 |
+
if self.use_unvoiced_bias:
|
736 |
+
f0_bias = self.unvoiced_bias_module(
|
737 |
+
txt_enc_time_expanded.permute(0, 2, 1)
|
738 |
+
)
|
739 |
+
f0_bias = -f0_bias[..., 0]
|
740 |
+
f0_bias = f0_bias * (~voiced_mask.bool()).float()
|
741 |
+
|
742 |
+
if f0 is None:
|
743 |
+
n_f0_feature_channels = 2 if self.use_first_order_features else 1
|
744 |
+
|
745 |
+
if use_cuda:
|
746 |
+
z_f0 = (
|
747 |
+
torch.cuda.FloatTensor(
|
748 |
+
batch_size, n_f0_feature_channels, max_n_frames
|
749 |
+
).normal_()
|
750 |
+
* sigma_f0
|
751 |
+
)
|
752 |
+
else:
|
753 |
+
z_f0 = (
|
754 |
+
torch.FloatTensor(
|
755 |
+
batch_size, n_f0_feature_channels, max_n_frames
|
756 |
+
).normal_()
|
757 |
+
* sigma_f0
|
758 |
+
                    )

                f0 = self.infer_f0(
                    z_f0,
                    ap_txt_enc_time_expanded,
                    spk_vec_attributes,
                    voiced_mask,
                    out_lens,
                )[:, 0]

            if f0_mean > 0.0:
                vmask_bool = voiced_mask.bool()
                f0_mu, f0_sigma = f0[vmask_bool].mean(), f0[vmask_bool].std()
                f0[vmask_bool] = (f0[vmask_bool] - f0_mu) / f0_sigma
                f0_std = f0_std if f0_std > 0 else f0_sigma
                f0[vmask_bool] = f0[vmask_bool] * f0_std + f0_mean

            if energy_avg is None:
                n_energy_feature_channels = 2 if self.use_first_order_features else 1
                if use_cuda:
                    z_energy_avg = (
                        torch.cuda.FloatTensor(
                            batch_size, n_energy_feature_channels, max_n_frames
                        ).normal_()
                        * sigma_energy
                    )
                else:
                    z_energy_avg = (
                        torch.FloatTensor(
                            batch_size, n_energy_feature_channels, max_n_frames
                        ).normal_()
                        * sigma_energy
                    )
                energy_avg = self.infer_energy(
                    z_energy_avg, ap_txt_enc_time_expanded, spk_vec, out_lens
                )[:, 0]

            # replication pad, because ungrouping with different group sizes
            # may lead to mismatched lengths
            if energy_avg.shape[1] < out_lens[0]:
                to_pad = out_lens[0] - energy_avg.shape[1]
                pad_fn = nn.ReplicationPad1d((0, to_pad))
                f0 = pad_fn(f0[None])[0]
                energy_avg = pad_fn(energy_avg[None])[0]
            if f0.shape[1] < out_lens[0]:
                to_pad = out_lens[0] - f0.shape[1]
                pad_fn = nn.ReplicationPad1d((0, to_pad))
                f0 = pad_fn(f0[None])[0]

            if self.decoder_use_unvoiced_bias:
                context_w_spkvec = self.preprocess_context(
                    txt_enc_time_expanded,
                    spk_vec,
                    out_lens,
                    f0 * voiced_mask + f0_bias,
                    energy_avg,
                )
            else:
                context_w_spkvec = self.preprocess_context(
                    txt_enc_time_expanded,
                    spk_vec,
                    out_lens,
                    f0 * voiced_mask,
                    energy_avg,
                )
        else:
            context_w_spkvec = self.preprocess_context(
                txt_enc_time_expanded, spk_vec, out_lens, None, None
            )

        if use_cuda:
            residual = torch.cuda.FloatTensor(
                batch_size, 80 * self.n_group_size, max_n_frames // self.n_group_size
            )
        else:
            residual = torch.FloatTensor(
                batch_size, 80 * self.n_group_size, max_n_frames // self.n_group_size
            )

        residual = residual.normal_() * sigma

        # map from z sample to data
        exit_steps_stack = self.exit_steps.copy()
        mel = residual[:, len(exit_steps_stack) * self.n_early_size :]
        remaining_residual = residual[:, : len(exit_steps_stack) * self.n_early_size]
        unfolded_seq_lens = out_lens // self.n_group_size
        for i, flow_step in enumerate(reversed(self.flows)):
            curr_step = len(self.flows) - i - 1
            mel = flow_step(
                mel, context_w_spkvec, inverse=True, seq_lens=unfolded_seq_lens
            )
            if len(exit_steps_stack) > 0 and curr_step == exit_steps_stack[-1]:
                # concatenate the next chunk of z
                exit_steps_stack.pop()
                residual_to_add = remaining_residual[
                    :, len(exit_steps_stack) * self.n_early_size :
                ]
                remaining_residual = remaining_residual[
                    :, : len(exit_steps_stack) * self.n_early_size
                ]
                mel = torch.cat((residual_to_add, mel), 1)

        if self.n_group_size > 1:
            mel = self.fold(mel)
        if self.do_mel_descaling:
            mel = mel * 2 - 5.5

        return {
            "mel": mel,
            "dur": dur,
            "f0": f0,
            "energy_avg": energy_avg,
            "voiced_mask": voiced_mask,
        }

    def infer_f0(
        self, residual, txt_enc_time_expanded, spk_vec, voiced_mask=None, lens=None
    ):
        f0 = self.f0_pred_module.infer(residual, txt_enc_time_expanded, spk_vec, lens)

        if voiced_mask is not None and len(voiced_mask.shape) == 2:
            voiced_mask = voiced_mask[:, None]

        # constants
        if self.ap_pred_log_f0:
            if self.use_first_order_features:
                f0 = f0[:, 0:1, :] / 3
            else:
                f0 = f0 / 2
            f0 = f0 * 6
        else:
            f0 = f0 / 6
            f0 = f0 / 640

        if voiced_mask is None:
            voiced_mask = f0 > 0.0
        else:
            voiced_mask = voiced_mask.bool()

        # due to grouping, f0 might be 1 frame short
        voiced_mask = voiced_mask[:, :, : f0.shape[-1]]
        if self.ap_pred_log_f0:
            # if variable is set, decoder sees linear f0
            # mask = f0 > 0.0 if voiced_mask is None else voiced_mask.bool()
            f0[voiced_mask] = torch.exp(f0[voiced_mask])
            f0[~voiced_mask] = 0.0
        return f0

    def infer_energy(self, residual, txt_enc_time_expanded, spk_vec, lens):
        energy = self.energy_pred_module.infer(
            residual, txt_enc_time_expanded, spk_vec, lens
        )

        # magic constants
        if self.use_first_order_features:
            energy = energy / 3
        else:
            energy = energy / 1.4
        energy = (energy + 1) / 2
        return energy

    def remove_norms(self):
        """Removes spectral and weightnorms from model. Call before inference"""
        for name, module in self.named_modules():
            try:
                nn.utils.remove_spectral_norm(module, name="weight_hh_l0")
                print("Removed spectral norm from {}".format(name))
            except:
                pass
            try:
                nn.utils.remove_spectral_norm(module, name="weight_hh_l0_reverse")
                print("Removed spectral norm from {}".format(name))
            except:
                pass
            try:
                nn.utils.remove_weight_norm(module)
                print("Removed wnorm from {}".format(name))
            except:
                pass
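The f0 branch above standardizes the sampled pitch contour over voiced frames and then maps it onto the caller-supplied f0_mean/f0_std. A standalone sketch of that re-scaling with made-up target values (plain torch, nothing model-specific):

import torch

f0 = torch.tensor([0.0, 180.0, 200.0, 220.0, 0.0])  # 0.0 marks unvoiced frames
voiced = f0 > 0
mu, sigma = f0[voiced].mean(), f0[voiced].std()
f0[voiced] = (f0[voiced] - mu) / sigma    # standardize voiced frames
f0[voiced] = f0[voiced] * 15.0 + 190.0    # hypothetical target: std 15 Hz, mean 190 Hz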
requirements-dev.txt
ADDED
@@ -0,0 +1 @@
ruff
requirements.txt
ADDED
@@ -0,0 +1,15 @@
huggingface_hub

gradio==5.18.0

torch
torchaudio
scipy
numba
lmdb
librosa

unidecode
inflect

git+https://github.com/langtech-bsc/vocos.git@matcha
splines.py
ADDED
@@ -0,0 +1,326 @@
# Original Source:
# https://github.com/ndeutschmann/zunis/blob/master/zunis_lib/zunis/models/flows/coupling_cells/piecewise_coupling/piecewise_linear.py
# https://github.com/ndeutschmann/zunis/blob/master/zunis_lib/zunis/models/flows/coupling_cells/piecewise_coupling/piecewise_quadratic.py
# Modifications made to jacobian computation by Yurong You and Kevin Shih
# Original License Text:
#########################################################################

# The MIT License (MIT)
# Copyright (c) 2020, nicolas deutschmann

# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:

# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


import torch
import torch.nn.functional as F

third_dimension_softmax = torch.nn.Softmax(dim=2)


def piecewise_linear_transform(
    x, q_tilde, compute_jacobian=True, outlier_passthru=True
):
    """Apply an element-wise piecewise-linear transformation to some variables

    Parameters
    ----------
    x : torch.Tensor
        a tensor with shape (N,k) where N is the batch dimension while k is the
        dimension of the variable space. This variable spans the k-dimensional unit
        hypercube

    q_tilde: torch.Tensor
        is a tensor with shape (N,k,b) where b is the number of bins.
        This contains the un-normalized heights of the bins of the piecewise-constant PDF for dimension k,
        i.e. q_tilde lives in all of R and we don't impose a constraint on their sum yet.
        Normalization is imposed in this function using softmax.

    compute_jacobian : bool, optional
        determines whether the jacobian should be computed; if False, None is returned

    Returns
    -------
    tuple of torch.Tensor
        pair `(y, j)`.
        - `y` is a tensor with shape (N,k) living in the k-dimensional unit hypercube
        - `j` is the jacobian of the transformation with shape (N,) if compute_jacobian==True, else None.
    """
    logj = None

    # TODO bottom-up assessment of handling the differentiability of variables
    # Compute the bin width w
    N, k, b = q_tilde.shape
    Nx, kx = x.shape
    assert N == Nx and k == kx, "Shape mismatch"

    w = 1.0 / b

    # Compute normalized bin heights with softmax function on bin dimension
    q = 1.0 / w * third_dimension_softmax(q_tilde)
    # x is in the mx-th bin: x \in [0,1],
    # mx \in [[0,b-1]], so we clamp away the case x == 1
    mx = torch.clamp(torch.floor(b * x), 0, b - 1).to(torch.long)
    # Need special error handling because trying to index with mx
    # if it contains nans will lock the GPU. (device-side assert triggered)
    if torch.any(torch.isnan(mx)).item() or torch.any(mx < 0) or torch.any(mx >= b):
        raise Exception("NaN detected in PWLinear bin indexing")

    # We compute the output variable in-place
    out = x - mx * w  # alpha (element of [0., w]), the position of x in its bin

    # Multiply by the slope
    # q has shape (N,k,b), mxu = mx.unsqueeze(-1) has shape (N,k) with entries that are a b-index
    # gather defines slope[i, j, k] = q[i, j, mxu[i, j, k]] with k taking only 0 as a value
    # i.e. we say slope[i, j] = q[i, j, mx[i, j]]
    slopes = torch.gather(q, 2, mx.unsqueeze(-1)).squeeze(-1)
    out = out * slopes
    # The jacobian is the product of the slopes in all dimensions

    # Compute the integral over the left-bins.
    # 1. Compute all integrals: cumulative sum of bin height * bin weight.
    # We want that index i contains the cumsum *strictly to the left* so we shift by 1
    # leaving the first entry null, which is achieved with a roll and assignment
    q_left_integrals = torch.roll(torch.cumsum(q, 2) * w, 1, 2)
    q_left_integrals[:, :, 0] = 0

    # 2. Access the correct index to get the left integral of each point and add it to our transformation
    out = out + torch.gather(q_left_integrals, 2, mx.unsqueeze(-1)).squeeze(-1)

    # Regularization: points must be strictly within the unit hypercube
    # Use the dtype information from pytorch
    eps = torch.finfo(out.dtype).eps
    out = out.clamp(min=eps, max=1.0 - eps)
    oob_mask = torch.logical_or(x < 0.0, x > 1.0).detach().float()
    if outlier_passthru:
        out = out * (1 - oob_mask) + x * oob_mask
        slopes = slopes * (1 - oob_mask) + oob_mask

    if compute_jacobian:
        # logj = torch.log(torch.prod(slopes.float(), 1))
        logj = torch.sum(torch.log(slopes), 1)
    del slopes

    return out, logj


def piecewise_linear_inverse_transform(
    y, q_tilde, compute_jacobian=True, outlier_passthru=True
):
    """
    Apply inverse of an element-wise piecewise-linear transformation to some
    variables

    Parameters
    ----------
    y : torch.Tensor
        a tensor with shape (N,k) where N is the batch dimension while k is the
        dimension of the variable space. This variable spans the k-dimensional unit
        hypercube

    q_tilde: torch.Tensor
        is a tensor with shape (N,k,b) where b is the number of bins.
        This contains the un-normalized heights of the bins of the piecewise-constant PDF for dimension k,
        i.e. q_tilde lives in all of R and we don't impose a constraint on their sum yet.
        Normalization is imposed in this function using softmax.

    compute_jacobian : bool, optional
        determines whether the jacobian should be computed; if False, None is returned

    Returns
    -------
    tuple of torch.Tensor
        pair `(x, j)`.
        - `x` is a tensor with shape (N,k) living in the k-dimensional unit hypercube
        - `j` is the jacobian of the transformation with shape (N,) if compute_jacobian==True, else None.
    """

    # TODO bottom-up assessment of handling the differentiability of variables

    # Compute the bin width w
    N, k, b = q_tilde.shape
    Ny, ky = y.shape
    assert N == Ny and k == ky, "Shape mismatch"

    w = 1.0 / b

    # Compute normalized bin heights with softmax function on the bin dimension
    q = 1.0 / w * third_dimension_softmax(q_tilde)

    # Compute the integral over the left-bins in the forward transform.
    # 1. Compute all integrals: cumulative sum of bin height * bin weight.
    # We want that index i contains the cumsum *strictly to the left*,
    # so we shift by 1 leaving the first entry null,
    # which is achieved with a roll and assignment
    q_left_integrals = torch.roll(torch.cumsum(q.float(), 2) * w, 1, 2)
    q_left_integrals[:, :, 0] = 0

    # Find which bin each y belongs to by finding the smallest bin such that
    # y - q_left_integral is positive

    edges = (y.unsqueeze(-1) - q_left_integrals).detach()
    # y and q_left_integrals are between 0 and 1,
    # so that their difference is at most 1.
    # By setting the negative values to 2., we know that the
    # smallest value left is the smallest positive
    edges[edges < 0] = 2.0
    edges = torch.clamp(torch.argmin(edges, dim=2), 0, b - 1).to(torch.long)

    # Need special error handling because trying to index with mx
    # if it contains nans will lock the GPU. (device-side assert triggered)
    if (
        torch.any(torch.isnan(edges)).item()
        or torch.any(edges < 0)
        or torch.any(edges >= b)
    ):
        raise Exception("NaN detected in PWLinear bin indexing")

    # Gather the left integrals at each edge. See comment about gathering in q_left_integrals
    # for the unsqueeze
    q_left_integrals = q_left_integrals.gather(2, edges.unsqueeze(-1)).squeeze(-1)

    # Gather the slope at each edge.
    q = q.gather(2, edges.unsqueeze(-1)).squeeze(-1)

    # Build the output
    x = (y - q_left_integrals) / q + edges * w

    # Regularization: points must be strictly within the unit hypercube
    # Use the dtype information from pytorch
    eps = torch.finfo(x.dtype).eps
    x = x.clamp(min=eps, max=1.0 - eps)
    oob_mask = torch.logical_or(y < 0.0, y > 1.0).detach().float()
    if outlier_passthru:
        x = x * (1 - oob_mask) + y * oob_mask
        q = q * (1 - oob_mask) + oob_mask

    # Prepare the jacobian
    logj = None
    if compute_jacobian:
        # logj = - torch.log(torch.prod(q, 1))
        logj = -torch.sum(torch.log(q.float()), 1)
    return x.detach(), logj


def unbounded_piecewise_quadratic_transform(
    x, w_tilde, v_tilde, upper=1, lower=0, inverse=False
):
    assert upper > lower
    _range = upper - lower
    inside_interval_mask = (x >= lower) & (x < upper)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(x)
    log_j = torch.zeros_like(x)

    outputs[outside_interval_mask] = x[outside_interval_mask]
    log_j[outside_interval_mask] = 0

    output, _log_j = piecewise_quadratic_transform(
        (x[inside_interval_mask] - lower) / _range,
        w_tilde[inside_interval_mask, :],
        v_tilde[inside_interval_mask, :],
        inverse=inverse,
    )
    outputs[inside_interval_mask] = output * _range + lower
    if not inverse:
        # the before and after transformation cancel out, so the log_j would be just as it is.
        log_j[inside_interval_mask] = _log_j
    else:
        log_j = None
    return outputs, log_j


def weighted_softmax(v, w):
    # to avoid NaN...
    v = v - torch.max(v, dim=-1, keepdim=True)[0]
    v = torch.exp(v) + 1e-8  # to avoid NaN...
    v_sum = torch.sum((v[..., :-1] + v[..., 1:]) / 2 * w, dim=-1, keepdim=True)
    return v / v_sum


def piecewise_quadratic_transform(x, w_tilde, v_tilde, inverse=False):
    """Element-wise piecewise-quadratic transformation
    Parameters
    ----------
    x : torch.Tensor
        *, The variable spans the D-dim unit hypercube ([0,1))
    w_tilde : torch.Tensor
        * x K defined in the paper
    v_tilde : torch.Tensor
        * x (K+1) defined in the paper
    inverse : bool
        forward or inverse
    Returns
    -------
    c : torch.Tensor
        *, transformed value
    log_j : torch.Tensor
        *, log determinant of the Jacobian matrix
    """
    w = torch.softmax(w_tilde, dim=-1)
    v = weighted_softmax(v_tilde, w)
    w_cumsum = torch.cumsum(w, dim=-1)
    # force sum = 1
    w_cumsum[..., -1] = 1.0
    w_cumsum_shift = F.pad(w_cumsum, (1, 0), "constant", 0)
    cdf = torch.cumsum((v[..., 1:] + v[..., :-1]) / 2 * w, dim=-1)
    # force sum = 1
    cdf[..., -1] = 1.0
    cdf_shift = F.pad(cdf, (1, 0), "constant", 0)

    if not inverse:
        # * x D x 1, (w_cumsum[idx-1] < x <= w_cumsum[idx])
        bin_index = torch.searchsorted(w_cumsum, x.unsqueeze(-1))
    else:
        # * x D x 1, (cdf[idx-1] < x <= cdf[idx])
        bin_index = torch.searchsorted(cdf, x.unsqueeze(-1))

    w_b = torch.gather(w, -1, bin_index).squeeze(-1)
    w_bn1 = torch.gather(w_cumsum_shift, -1, bin_index).squeeze(-1)
    v_b = torch.gather(v, -1, bin_index).squeeze(-1)
    v_bp1 = torch.gather(v, -1, bin_index + 1).squeeze(-1)
    cdf_bn1 = torch.gather(cdf_shift, -1, bin_index).squeeze(-1)

    if not inverse:
        alpha = (x - w_bn1) / w_b.clamp(min=torch.finfo(w_b.dtype).eps)
        c = (alpha**2) / 2 * (v_bp1 - v_b) * w_b + alpha * v_b * w_b + cdf_bn1

        # just sum of log pdfs
        log_j = torch.lerp(v_b, v_bp1, alpha).clamp(min=torch.finfo(c.dtype).eps).log()

        # make sure it falls into [0,1)
        c = c.clamp(min=torch.finfo(c.dtype).eps, max=1.0 - torch.finfo(c.dtype).eps)
        return c, log_j
    else:
        # quadratic equation for alpha
        # alpha should fall into (0, 1]. Since a, b > 0, the symmetry axis -b/2a < 0 and we should pick the larger root
        # skip calculating the log_j in inverse since we don't need it
        a = (v_bp1 - v_b) * w_b / 2
        b = v_b * w_b
        c = cdf_bn1 - x
        alpha = (-b + torch.sqrt((b**2) - 4 * a * c)) / (2 * a)
        inv = alpha * w_b + w_bn1

        # make sure it falls into [0,1)
        inv = inv.clamp(
            min=torch.finfo(c.dtype).eps, max=1.0 - torch.finfo(inv.dtype).eps
        )
        return inv, None
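A quick round-trip check of the piecewise-linear pair above, assuming the file is importable as splines (run from the repo root); the shapes follow the docstrings, (N, k) points and (N, k, b) bin heights:

import torch
from splines import piecewise_linear_transform, piecewise_linear_inverse_transform

torch.manual_seed(0)
x = torch.rand(4, 3)            # (N, k) points inside the unit hypercube
q_tilde = torch.randn(4, 3, 8)  # (N, k, b) unnormalized bin heights
y, logj = piecewise_linear_transform(x, q_tilde)
x_back, inv_logj = piecewise_linear_inverse_transform(y, q_tilde)
print(torch.allclose(x, x_back, atol=1e-4))        # True: inverse recovers x
print(torch.allclose(logj, -inv_logj, atol=1e-4))  # log-jacobians cancel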
transformer.py
ADDED
@@ -0,0 +1,219 @@
# adapted from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/FastPitch/fastpitch/transformer.py
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn
import torch.nn.functional as F

from common import get_mask_from_lengths, LinearNorm


class PositionalEmbedding(nn.Module):
    def __init__(self, demb):
        super(PositionalEmbedding, self).__init__()
        self.demb = demb
        inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, pos_seq, bsz=None):
        sinusoid_inp = torch.matmul(
            torch.unsqueeze(pos_seq, -1), torch.unsqueeze(self.inv_freq, 0)
        )
        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1)
        if bsz is not None:
            return pos_emb[None, :, :].expand(bsz, -1, -1)
        else:
            return pos_emb[None, :, :]


class PositionwiseConvFF(nn.Module):
    def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False):
        super(PositionwiseConvFF, self).__init__()

        self.d_model = d_model
        self.d_inner = d_inner
        self.dropout = dropout

        self.CoreNet = nn.Sequential(
            nn.Conv1d(d_model, d_inner, kernel_size, 1, (kernel_size // 2)),
            nn.ReLU(),
            # nn.Dropout(dropout),  # worse convergence
            nn.Conv1d(d_inner, d_model, kernel_size, 1, (kernel_size // 2)),
            nn.Dropout(dropout),
        )
        self.layer_norm = nn.LayerNorm(d_model)
        self.pre_lnorm = pre_lnorm

    def forward(self, inp):
        return self._forward(inp)

    def _forward(self, inp):
        if self.pre_lnorm:
            # layer normalization + positionwise feed-forward
            core_out = inp.transpose(1, 2)
            core_out = self.CoreNet(self.layer_norm(core_out).to(inp.dtype))
            core_out = core_out.transpose(1, 2)

            # residual connection
            output = core_out + inp
        else:
            # positionwise feed-forward
            core_out = inp.transpose(1, 2)
            core_out = self.CoreNet(core_out)
            core_out = core_out.transpose(1, 2)

            # residual connection + layer normalization
            output = self.layer_norm(inp + core_out).to(inp.dtype)

        return output


class MultiHeadAttn(nn.Module):
    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, pre_lnorm=False):
        super(MultiHeadAttn, self).__init__()

        self.n_head = n_head
        self.d_model = d_model
        self.d_head = d_head
        self.scale = 1 / (d_head**0.5)
        self.pre_lnorm = pre_lnorm

        self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head)
        self.drop = nn.Dropout(dropout)
        self.dropatt = nn.Dropout(dropatt)
        self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inp, attn_mask=None):
        return self._forward(inp, attn_mask)

    def _forward(self, inp, attn_mask=None):
        residual = inp

        if self.pre_lnorm:
            # layer normalization
            inp = self.layer_norm(inp)

        n_head, d_head = self.n_head, self.d_head

        head_q, head_k, head_v = torch.chunk(self.qkv_net(inp), 3, dim=2)
        head_q = head_q.view(inp.size(0), inp.size(1), n_head, d_head)
        head_k = head_k.view(inp.size(0), inp.size(1), n_head, d_head)
        head_v = head_v.view(inp.size(0), inp.size(1), n_head, d_head)

        q = head_q.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head)
        k = head_k.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head)
        v = head_v.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head)

        attn_score = torch.bmm(q, k.transpose(1, 2))
        attn_score.mul_(self.scale)

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(1).to(attn_score.dtype)
            attn_mask = attn_mask.repeat(n_head, attn_mask.size(2), 1)
            attn_score.masked_fill_(attn_mask.to(torch.bool), -float("inf"))

        attn_prob = F.softmax(attn_score, dim=2)
        attn_prob = self.dropatt(attn_prob)
        attn_vec = torch.bmm(attn_prob, v)

        attn_vec = attn_vec.view(n_head, inp.size(0), inp.size(1), d_head)
        attn_vec = (
            attn_vec.permute(1, 2, 0, 3)
            .contiguous()
            .view(inp.size(0), inp.size(1), n_head * d_head)
        )

        # linear projection
        attn_out = self.o_net(attn_vec)
        attn_out = self.drop(attn_out)

        # residual connection + layer normalization
        output = self.layer_norm(residual + attn_out)

        output = output.to(attn_out.dtype)

        return output


class TransformerLayer(nn.Module):
    def __init__(
        self, n_head, d_model, d_head, d_inner, kernel_size, dropout, **kwargs
    ):
        super(TransformerLayer, self).__init__()

        self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
        self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout)

    def forward(self, dec_inp, mask=None):
        output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2))
        output *= mask
        output = self.pos_ff(output)
        output *= mask
        return output


class FFTransformer(nn.Module):
    def __init__(
        self,
        in_dim,
        out_dim=1,
        n_layers=6,
        n_head=1,
        d_head=64,
        d_inner=1024,
        kernel_size=3,
        dropout=0.1,
        dropatt=0.1,
        dropemb=0.0,
    ):
        super(FFTransformer, self).__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.n_head = n_head
        self.d_head = d_head

        self.pos_emb = PositionalEmbedding(self.in_dim)
        self.drop = nn.Dropout(dropemb)
        self.layers = nn.ModuleList()

        for _ in range(n_layers):
            self.layers.append(
                TransformerLayer(
                    n_head,
                    in_dim,
                    d_head,
                    d_inner,
                    kernel_size,
                    dropout,
                    dropatt=dropatt,
                )
            )

        self.dense = LinearNorm(in_dim, out_dim)

    def forward(self, dec_inp, in_lens):
        # B, C, T --> B, T, C
        inp = dec_inp.transpose(1, 2)
        mask = get_mask_from_lengths(in_lens)[..., None]

        pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype)
        pos_emb = self.pos_emb(pos_seq) * mask

        out = self.drop(inp + pos_emb)

        for layer in self.layers:
            out = layer(out, mask=mask)

        out = self.dense(out).transpose(1, 2)
        return out
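A minimal shape check for FFTransformer, assuming common.py (imported above) is on the path; the hyperparameters here are arbitrary small values for illustration, not the ones from the model config:

import torch
from transformer import FFTransformer

model = FFTransformer(in_dim=80, out_dim=1, n_layers=2)
x = torch.randn(2, 80, 50)     # (B, C, T)
lens = torch.tensor([50, 35])  # per-sequence valid lengths
out = model(x, lens)
print(out.shape)               # torch.Size([2, 1, 50])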
tts_text_processing/LICENSE
ADDED
@@ -0,0 +1,19 @@
Copyright (c) 2017 Keith Ito

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
tts_text_processing/abbreviations.py
ADDED
@@ -0,0 +1,57 @@
import re

_no_period_re = re.compile(r"(No[.])(?=[ ]?[0-9])")
_percent_re = re.compile(r"([ ]?[%])")
_half_re = re.compile("([0-9]½)|(½)")


# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [
    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
    for x in [
        ("mrs", "misess"),
        ("ms", "miss"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    ]
]


def _expand_no_period(m):
    word = m.group(0)
    if word[0] == "N":
        return "Number"
    return "number"


def _expand_percent(m):
    return " percent"


def _expand_half(m):
    word = m.group(1)
    if word is None:
        return "half"
    return word[0] + " and a half"


def normalize_abbreviations(text):
    text = re.sub(_no_period_re, _expand_no_period, text)
    text = re.sub(_percent_re, _expand_percent, text)
    text = re.sub(_half_re, _expand_half, text)
    return text
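Note that normalize_abbreviations only applies the No./percent/half rules; the _abbreviations table is defined here but not referenced by this function. A quick usage sketch:

from tts_text_processing.abbreviations import normalize_abbreviations

print(normalize_abbreviations("No. 7 costs 50% more"))
# -> 'Number 7 costs 50 percent more'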
tts_text_processing/acronyms.py
ADDED
@@ -0,0 +1,69 @@
import re

_letter_to_arpabet = {
    "A": "EY1",
    "B": "B IY1",
    "C": "S IY1",
    "D": "D IY1",
    "E": "IY1",
    "F": "EH1 F",
    "G": "JH IY1",
    "H": "EY1 CH",
    "I": "AY1",
    "J": "JH EY1",
    "K": "K EY1",
    "L": "EH1 L",
    "M": "EH1 M",
    "N": "EH1 N",
    "O": "OW1",
    "P": "P IY1",
    "Q": "K Y UW1",
    "R": "AA1 R",
    "S": "EH1 S",
    "T": "T IY1",
    "U": "Y UW1",
    "V": "V IY1",
    "X": "EH1 K S",
    "Y": "W AY1",
    "W": "D AH1 B AH0 L Y UW0",
    "Z": "Z IY1",
    "s": "Z",
}

# must ignore roman numerals
# _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)')
_acronym_re = re.compile(r"([A-Z][A-Z]+)s?")


class AcronymNormalizer(object):
    def __init__(self, phoneme_dict):
        self.phoneme_dict = phoneme_dict

    def normalize_acronyms(self, text):
        def _expand_acronyms(m, add_spaces=True):
            acronym = m.group(0)
            # remove dots if they exist
            acronym = re.sub(r"\.", "", acronym)

            acronym = "".join(acronym.split())
            arpabet = self.phoneme_dict.lookup(acronym)

            if arpabet is None:
                acronym = list(acronym)
                arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym]
                # temporary fix
                if arpabet[-1] == "{Z}" and len(arpabet) > 1:
                    arpabet[-2] = arpabet[-2][:-1] + " " + arpabet[-1][1:]
                    del arpabet[-1]
                arpabet = " ".join(arpabet)
            elif len(arpabet) == 1:
                arpabet = "{" + arpabet[0] + "}"
            else:
                arpabet = acronym
            return arpabet

        text = re.sub(_acronym_re, _expand_acronyms, text)
        return text

    def __call__(self, text):
        return self.normalize_acronyms(text)
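Usage sketch with a stub phoneme dictionary (a hypothetical stand-in for the real dictionary object, which only needs a lookup method); unknown acronyms fall back to letter-by-letter ARPAbet:

from tts_text_processing.acronyms import AcronymNormalizer

class EmptyDict:
    def lookup(self, word):
        return None  # force the letter-by-letter fallback

normalizer = AcronymNormalizer(EmptyDict())
print(normalizer("play some ABBA"))
# -> 'play some {EY1} {B IY1} {B IY1} {EY1}'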
tts_text_processing/cleaners.py
ADDED
@@ -0,0 +1,123 @@
"""adapted from https://github.com/keithito/tacotron"""

"""
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
"""

import re
from string import punctuation
from functools import reduce
from unidecode import unidecode
from .numerical import normalize_numbers, normalize_currency
from .acronyms import AcronymNormalizer
from .datestime import normalize_datestime
from .letters_and_numbers import normalize_letters_and_numbers
from .abbreviations import normalize_abbreviations


# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")

# Regular expression separating words enclosed in curly braces for cleaning
_arpa_re = re.compile(r"{[^}]+}|\S+")


def expand_abbreviations(text):
    return normalize_abbreviations(text)


def expand_numbers(text):
    return normalize_numbers(text)


def expand_currency(text):
    return normalize_currency(text)


def expand_datestime(text):
    return normalize_datestime(text)


def expand_letters_and_numbers(text):
    return normalize_letters_and_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def separate_acronyms(text):
    text = re.sub(r"([0-9]+)([a-zA-Z]+)", r"\1 \2", text)
    text = re.sub(r"([a-zA-Z]+)([0-9]+)", r"\1 \2", text)
    return text


def convert_to_ascii(text):
    return unidecode(text)


def dehyphenize_compound_words(text):
    text = re.sub(r"(?<=[a-zA-Z0-9])-(?=[a-zA-Z])", " ", text)
    return text


def remove_space_before_punctuation(text):
    return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r"\1", text)


class Cleaner(object):
    def __init__(self, cleaner_names, phonemedict):
        self.cleaner_names = cleaner_names
        self.phonemedict = phonemedict
        self.acronym_normalizer = AcronymNormalizer(self.phonemedict)

    def __call__(self, text):
        for cleaner_name in self.cleaner_names:
            sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
            for fn in sequence_fns:
                text = fn(text)

            text = [
                reduce(lambda x, y: y(x), word_fns, split) if split[0] != "{" else split
                for split in _arpa_re.findall(text)
            ]
            text = " ".join(text)
        text = remove_space_before_punctuation(text)
        return text

    def get_cleaner_fns(self, cleaner_name):
        if cleaner_name == "basic_cleaners":
            sequence_fns = [lowercase, collapse_whitespace]
            word_fns = []
        elif cleaner_name == "english_cleaners":
            sequence_fns = [collapse_whitespace, convert_to_ascii, lowercase]
            word_fns = [expand_numbers, expand_abbreviations]
        elif cleaner_name == "radtts_cleaners":
            sequence_fns = [
                collapse_whitespace,
                expand_currency,
                expand_datestime,
                expand_letters_and_numbers,
            ]
            word_fns = [expand_numbers, expand_abbreviations]
        elif cleaner_name == "ukrainian_cleaners":
            sequence_fns = [lowercase, collapse_whitespace]
            word_fns = []
        elif cleaner_name == "transliteration_cleaners":
            sequence_fns = [convert_to_ascii, lowercase, collapse_whitespace]
        else:
            raise Exception("{} cleaner not supported".format(cleaner_name))

        return sequence_fns, word_fns
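Usage sketch, run from the repo root so the relative imports resolve; ukrainian_cleaners uses no phoneme dictionary, so None is fine for phonemedict:

from tts_text_processing.cleaners import Cleaner

cleaner = Cleaner(["ukrainian_cleaners"], phonemedict=None)
print(cleaner("Привіт,   СВІТЕ !"))
# -> 'привіт, світе!'  (lowercased, whitespace collapsed, space-before-! removed)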
tts_text_processing/cmudict.py
ADDED
@@ -0,0 +1,140 @@
"""adapted from https://github.com/keithito/tacotron"""

import re


valid_symbols = [
    "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0",
    "AH1", "AH2", "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2",
    "AY", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH", "EH0", "EH1",
    "EH2", "ER", "ER0", "ER1", "ER2", "EY", "EY0", "EY1", "EY2", "F",
    "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1", "IY2",
    "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY",
    "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0",
    "UH1", "UH2", "UW", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH",
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""

    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        """Returns list of ARPAbet pronunciations of the given word."""
        return self._entries.get(word.upper())


_alt_re = re.compile(r"\([0-9]+\)")


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
            parts = line.split("  ")
            word = re.sub(_alt_re, "", parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(" ")
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return " ".join(parts)
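CMUDict also accepts an open file object, which makes it easy to exercise with an in-memory snippet (cmudict format is assumed here: word, two spaces, ARPAbet pronunciation):

import io
from tts_text_processing.cmudict import CMUDict

cmu = CMUDict(io.StringIO("HELLO  HH AH0 L OW1\nWORLD  W ER1 L D\n"))
print(len(cmu))             # 2
print(cmu.lookup("hello"))  # ['HH AH0 L OW1']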
tts_text_processing/datestime.py
ADDED
@@ -0,0 +1,24 @@
"""adapted from https://github.com/keithito/tacotron"""

import re

_ampm_re = re.compile(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):?([0-5][0-9])?\s*([AaPp][Mm]\b)")


def _expand_ampm(m):
    matches = list(m.groups(0))
    txt = matches[0]
    txt = txt if int(matches[1]) == 0 else txt + " " + matches[1]

    if matches[2][0].lower() == "a":
        txt += " a.m."
    elif matches[2][0].lower() == "p":
        txt += " p.m."

    return txt


def normalize_datestime(text):
    text = re.sub(_ampm_re, _expand_ampm, text)
    # text = re.sub(r"([0-9]|0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])?", r"\1 \2", text)
    return text
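Usage sketch for the am/pm expansion:

from tts_text_processing.datestime import normalize_datestime

print(normalize_datestime("The train leaves at 10:30am and 5 PM"))
# -> 'The train leaves at 10 30 a.m. and 5 p.m.'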
tts_text_processing/grapheme_dictionary.py
ADDED
@@ -0,0 +1,37 @@
"""adapted from https://github.com/keithito/tacotron"""

import re

_alt_re = re.compile(r"\([0-9]+\)")


class Grapheme2PhonemeDictionary:
    """Thin wrapper around g2p data."""

    def __init__(self, file_or_path, keep_ambiguous=True, encoding="latin-1"):
        with open(file_or_path, encoding=encoding) as f:
            entries = _parse_g2p(f)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        """Returns list of pronunciations of the given word."""
        return self._entries.get(word.upper())


def _parse_g2p(file):
    g2p = {}
    for line in file:
        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
            parts = line.split("  ")
            word = re.sub(_alt_re, "", parts[0])
            pronunciation = parts[1].strip()
            if word in g2p:
                g2p[word].append(pronunciation)
            else:
                g2p[word] = [pronunciation]
    return g2p
tts_text_processing/heteronyms
ADDED
@@ -0,0 +1,413 @@
abject
abrogate
absent
abstract
abuse
ache
acre
acuminate
addict
address
adduct
adele
advocate
affect
affiliate
agape
aged
agglomerate
aggregate
agonic
agora
allied
ally
alternate
alum
am
analyses
andrea
animate
apply
appropriate
approximate
ares
arithmetic
arsenic
articulate
associate
attribute
august
axes
ay
aye
bases
bass
bathed
bested
bifurcate
blessed
blotto
bow
bowed
bowman
brassy
buffet
bustier
carbonate
celtic
choral
chumash
close
closer
coax
coincidence
color coordinate
colour coordinate
comber
combine
combs
committee
commune
compact
complex
compound
compress
concert
conduct
confine
confines
conflict
conglomerate
conscript
conserve
consist
console
consort
construct
consult
consummate
content
contest
contract
contracts
contrast
converse
convert
convict
coop
coordinate
covey
crooked
curate
cussed
decollate
decrease
defect
defense
delegate
deliberate
denier
desert
detail
deviate
diagnoses
diffuse
digest
discard
discharge
discount
do
document
does
dogged
domesticate
dominican
dove
dr
drawer
duplicate
egress
ejaculate
eject
elaborate
ellipses
email
emu
entrace
entrance
escort
estimate
eta
etna
evening
excise
excuse
exploit
export
extract
fine
flower
forbear
four-legged
frequent
furrier
gallant
gel
geminate
gillie
glower
gotham
graduate
haggis
heavy
hinder
house
housewife
impact
imped
implant
implement
import
impress
incense
incline
increase
infix
insert
instar
insult
integral
intercept
interchange
interflow
interleaf
intermediate
intern
interspace
intimate
intrigue
invalid
invert
invite
irony
jagged
jesses
julies
kite
laminate
laos
lather
lead
learned
leasing
lech
legitimate
lied
lima
lipread
live
lower
lunged
maas
magdalen
manes
mare
marked
merchandise
merlion
minute
misconduct
misled
misprint
mobile
moderate
mong
moped
moth
mouth
mow
mpg
multiply
mush
nana
nice
nice
number
numerate
nun
object
opiate
ornament
outbox
outcry
outpour
outreach
outride
outright
outside
outwork
overall
overbid
overcall
overcast
overfall
overflow
overhaul
overhead
overlap
overlay
overuse
overweight
overwork
pace
palled
palling
para
pasty
pate
pauline
pedal
peer
perfect
periodic
permit
pervert
pinta
placer
platy
polish
polish
poll
pontificate
postulate
pram
prayer
precipitate
predate
predicate
prefix
preposition
present
pretest
primer
proceeds
produce
progress
project
proportionate
prospect
protest
pussy
putter
putting
quite
ragged
raven
re
read
reading
reading
real
rebel
recall
recap
recitative
recollect
record
recreate
recreation
redress
refill
refund
refuse
reject
relay
remake
repaint
reprint
reread
rerun
resent
reside
resign
respray
resume
retard
retest
retread
rewrite
root
routed
routing
row
rugged
rummy
sais
sake
sambuca
saucier
second
secrete
secreted
secreting
segment
separate
sewer
shirk
shower
sin
skied
slaver
slough
sow
spoof
squid
stingy
subject
subordinate
subvert
supply
supposed
survey
suspect
syringes
tabulate
tales
tarrier
tarry
taxes
taxis
tear
theron
thou
three-legged
tier
tinged
torment
transfer
transform
transplant
transport
transpose
tush
two-legged
unionised
unionized
update
uplift
upset
use
used
vale
violist
viva
ware
whinged
whoop
wicked
wind
windy
wino
won
worsted
wound
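A sketch of loading this one-word-per-line list into a set (assumed usage; the actual consumer is the text-processing pipeline elsewhere in this repo):

with open("tts_text_processing/heteronyms", encoding="utf-8") as f:
    heteronyms = {line.strip() for line in f if line.strip()}
print("wind" in heteronyms)  # True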
tts_text_processing/letters_and_numbers.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+"""adapted from https://github.com/keithito/tacotron"""
+
+import re
+
+_letters_and_numbers_re = re.compile(
+    r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE
+)
+
+_hardware_re = re.compile(
+    r"([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm|cm|km)",
+    re.IGNORECASE,
+)
+_hardware_key = {
+    "tb": "terabyte",
+    "gb": "gigabyte",
+    "mb": "megabyte",
+    "kb": "kilobyte",
+    "ghz": "gigahertz",
+    "mhz": "megahertz",
+    "khz": "kilohertz",
+    "hz": "hertz",
+    "mm": "millimeter",
+    "cm": "centimeter",
+    "km": "kilometer",
+}
+
+_dimension_re = re.compile(
+    r"\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b"
+    r"|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b"
+)
+_dimension_key = {"m": "meter", "in": "inch", "inch": "inch"}
+
+
+def _expand_letters_and_numbers(m):
+    text = re.split(r"(\d+)", m.group(0))
+
+    # re.split leaves an empty string at whichever end of the match
+    # starts or ends with a digit; drop it
+    if text[-1] == "":
+        text = text[:-1]
+    if text[0] == "":
+        text = text[1:]
+
+    # re-attach ordinal/possessive suffixes (1920s, AK47's, 20th, 1st, ...)
+    # to their digits so they stay a single token
+    if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit():
+        text[-2] = text[-2] + text[-1]
+        text = text[:-1]
+
+    # combine digits two by two (e.g. "1945" -> "19", "45")
+    new_text = []
+    for i in range(len(text)):
+        string = text[i]
+        if string.isdigit() and len(string) < 5:
+            # heuristics
+            if len(string) > 2 and string[-2] == "0":
+                if string[-1] == "0":
+                    string = [string]
+                else:
+                    string = [string[:-3], string[-2], string[-1]]
+            elif len(string) % 2 == 0:
+                string = [string[i : i + 2] for i in range(0, len(string), 2)]
+            elif len(string) > 2:
+                string = [string[0]] + [
+                    string[i : i + 2] for i in range(1, len(string), 2)
+                ]
+            new_text.extend(string)
+        else:
+            new_text.append(string)
+
+    text = new_text
+    text = " ".join(text)
+    return text
+
+
+def _expand_hardware(m):
+    quantity, measure = m.groups(0)
+    measure = _hardware_key[measure.lower()]
+    # pluralize unless the unit ends in "z" (hertz) or the quantity is 1
+    if measure[-1] != "z" and float(quantity.replace(",", "")) > 1:
+        return "{} {}s".format(quantity, measure)
+    return "{} {}".format(quantity, measure)
+
+
+def _expand_dimension(m):
+    text = "".join([x for x in m.groups(0) if x != 0])
+    text = text.replace(" x ", " by ")
+    text = text.replace("x", " by ")
+    if text.endswith(tuple(_dimension_key.keys())):
+        if text[-2].isdigit():
+            text = "{} {}".format(text[:-1], _dimension_key[text[-1:]])
+        elif text[-3].isdigit():
+            text = "{} {}".format(text[:-2], _dimension_key[text[-2:]])
+    return text
+
+
+def normalize_letters_and_numbers(text):
+    text = re.sub(_hardware_re, _expand_hardware, text)
+    text = re.sub(_dimension_re, _expand_dimension, text)
+    text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text)
+    return text
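
The module above is pure string rewriting, so its behavior is easy to pin down. A doctest-style sketch, not part of the commit; the sample strings are illustrative and assume the package is importable from the repo root:

    >>> from tts_text_processing.letters_and_numbers import normalize_letters_and_numbers
    >>> normalize_letters_and_numbers("an AK47 with 32GB")
    'an AK 47 with 32 gigabytes'
    >>> normalize_letters_and_numbers("a 3x4in print")
    'a 3 by 4 inch print'

Digits are separated from letters but not spelled out; turning "47" into words is left to normalize_numbers in tts_text_processing/numerical.py below.
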
tts_text_processing/numerical.py
ADDED
@@ -0,0 +1,175 @@
+"""adapted from https://github.com/keithito/tacotron"""
+
+import inflect
+import re
+
+_magnitudes = ["trillion", "billion", "million", "thousand", "hundred", "m", "b", "t"]
+_magnitudes_key = {"m": "million", "b": "billion", "t": "trillion"}
+_measurements = "(f|c|k|d|m)"
+_measurements_key = {"f": "fahrenheit", "c": "celsius", "k": "thousand", "m": "meters"}
+_currency_key = {"$": "dollar", "£": "pound", "€": "euro", "₩": "won"}
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
+_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
+_currency_re = re.compile(
+    r"([\$€£₩])([0-9\.\,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]))?".format(
+        "|".join(_magnitudes)
+    ),
+    re.IGNORECASE,
+)
+_measurement_re = re.compile(
+    r"([0-9\.\,]*[0-9]+(\s)?{}\b)".format(_measurements), re.IGNORECASE
+)
+_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
+# _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?')
+_roman_re = re.compile(
+    r"\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b"
+)  # avoid matching a bare "I"
+_multiply_re = re.compile(r"(\b[0-9]+)(x)([0-9]+)")
+_number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+")
+
+
+def _remove_commas(m):
+    return m.group(1).replace(",", "")
+
+
+def _expand_decimal_point(m):
+    return m.group(1).replace(".", " point ")
+
+
+def _expand_currency(m):
+    currency = _currency_key[m.group(1)]
+    quantity = m.group(2)
+    magnitude = m.group(3)
+
+    # remove commas from quantity to be able to convert to numerical
+    quantity = quantity.replace(",", "")
+
+    # check for million, billion, etc...
+    if magnitude is not None and magnitude.lower() in _magnitudes:
+        if len(magnitude) == 1:
+            magnitude = _magnitudes_key[magnitude.lower()]
+        return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency + "s")
+
+    parts = quantity.split(".")
+    if len(parts) > 2:
+        return quantity + " " + currency + "s"  # Unexpected format
+
+    dollars = int(parts[0]) if parts[0] else 0
+
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = currency if dollars == 1 else currency + "s"
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "{} {}, {} {}".format(
+            _expand_hundreds(dollars),
+            dollar_unit,
+            _inflect.number_to_words(cents),
+            cent_unit,
+        )
+    elif dollars:
+        dollar_unit = currency if dollars == 1 else currency + "s"
+        return "{} {}".format(_expand_hundreds(dollars), dollar_unit)
+    elif cents:
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "{} {}".format(_inflect.number_to_words(cents), cent_unit)
+    else:
+        return "zero" + " " + currency + "s"
+
+
+def _expand_hundreds(text):
+    number = float(text)
+    # numbers like 1200 are read as "twelve hundred"
+    if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0):
+        return _inflect.number_to_words(int(number / 100)) + " hundred"
+    else:
+        return _inflect.number_to_words(text)
+
+
+def _expand_ordinal(m):
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_measurement(m):
+    _, number, measurement = re.split(r"(\d+(?:\.\d+)?)", m.group(0))
+    number = _inflect.number_to_words(number)
+    measurement = "".join(measurement.split())
+    measurement = _measurements_key[measurement.lower()]
+    return "{} {}".format(number, measurement)
+
+
+def _expand_range(m):
+    return " to "
+
+
+def _expand_multiply(m):
+    left = m.group(1)
+    right = m.group(3)
+    return "{} by {}".format(left, right)
+
+
+def _expand_roman(m):
+    # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
+    roman_numerals = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
+    result = 0
+    num = m.group(0)
+    for i, c in enumerate(num):
+        if (i + 1) == len(num) or roman_numerals[c] >= roman_numerals[num[i + 1]]:
+            result += roman_numerals[c]
+        else:
+            result -= roman_numerals[c]
+    return str(result)
+
+
+def _expand_number(m):
+    _, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0))
+    number = int(number)
+    if (
+        number > 1000
+        and number < 10000
+        and (number % 100 == 0)
+        and (number % 1000 != 0)
+    ):
+        text = _inflect.number_to_words(number // 100) + " hundred"
+    elif number > 1000 and number < 3000:
+        if number == 2000:
+            text = "two thousand"
+        elif number > 2000 and number < 2010:
+            text = "two thousand " + _inflect.number_to_words(number % 100)
+        elif number % 100 == 0:
+            text = _inflect.number_to_words(number // 100) + " hundred"
+        else:
+            number = _inflect.number_to_words(
+                number, andword="", zero="oh", group=2
+            ).replace(", ", " ")
+            number = re.sub(r"-", " ", number)
+            text = number
+    else:
+        number = _inflect.number_to_words(number, andword="and")
+        number = re.sub(r"-", " ", number)
+        number = re.sub(r",", "", number)
+        text = number
+
+    if suffix in ("'s", "s"):
+        if text[-1] == "y":
+            text = text[:-1] + "ies"
+        else:
+            text = text + suffix
+
+    return text
+
+
+def normalize_currency(text):
+    return re.sub(_currency_re, _expand_currency, text)
+
+
+def normalize_numbers(text):
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_currency_re, _expand_currency, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    # text = re.sub(_range_re, _expand_range, text)
+    # text = re.sub(_measurement_re, _expand_measurement, text)
+    text = re.sub(_roman_re, _expand_roman, text)
+    text = re.sub(_multiply_re, _expand_multiply, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
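
A sketch of the combined pipeline in normalize_numbers, not part of the commit; the sample input is illustrative:

    >>> from tts_text_processing.numerical import normalize_numbers
    >>> normalize_numbers("Chapter XII was published in 1999 for $3.50")
    'Chapter twelve was published in nineteen ninety nine for three dollars, fifty cents'

Roman numerals are rewritten to digits first (XII -> 12), currency symbols are expanded together with their unit, and four-digit numbers between 1000 and 3000 are mostly read as digit pairs ("nineteen ninety nine").
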
tts_text_processing/symbols.py
ADDED
@@ -0,0 +1,144 @@
+"""adapted from https://github.com/keithito/tacotron"""
+
+"""
+Defines the set of symbols used in text input to the model.
+
+The default is a set of ASCII characters that works well for English or text
+that has been run through Unidecode. For other data, you can modify
+_characters."""
+
+
+arpabet = [
+    "AA",
+    "AA0",
+    "AA1",
+    "AA2",
+    "AE",
+    "AE0",
+    "AE1",
+    "AE2",
+    "AH",
+    "AH0",
+    "AH1",
+    "AH2",
+    "AO",
+    "AO0",
+    "AO1",
+    "AO2",
+    "AW",
+    "AW0",
+    "AW1",
+    "AW2",
+    "AY",
+    "AY0",
+    "AY1",
+    "AY2",
+    "B",
+    "CH",
+    "D",
+    "DH",
+    "EH",
+    "EH0",
+    "EH1",
+    "EH2",
+    "ER",
+    "ER0",
+    "ER1",
+    "ER2",
+    "EY",
+    "EY0",
+    "EY1",
+    "EY2",
+    "F",
+    "G",
+    "HH",
+    "IH",
+    "IH0",
+    "IH1",
+    "IH2",
+    "IY",
+    "IY0",
+    "IY1",
+    "IY2",
+    "JH",
+    "K",
+    "L",
+    "M",
+    "N",
+    "NG",
+    "OW",
+    "OW0",
+    "OW1",
+    "OW2",
+    "OY",
+    "OY0",
+    "OY1",
+    "OY2",
+    "P",
+    "R",
+    "S",
+    "SH",
+    "T",
+    "TH",
+    "UH",
+    "UH0",
+    "UH1",
+    "UH2",
+    "UW",
+    "UW0",
+    "UW1",
+    "UW2",
+    "V",
+    "W",
+    "Y",
+    "Z",
+    "ZH",
+]
+
+
+def get_symbols(symbol_set):
+    if symbol_set == "english_basic":
+        _pad = "_"
+        _punctuation = "!'\"(),.:;? "
+        _special = "-"
+        _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+        _arpabet = ["@" + s for s in arpabet]
+        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
+    elif symbol_set == "english_basic_lowercase":
+        _pad = "_"
+        _punctuation = "!'\"(),.:;? "
+        _special = "-"
+        _letters = "abcdefghijklmnopqrstuvwxyz"
+        _arpabet = ["@" + s for s in arpabet]
+        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
+    elif symbol_set == "english_expanded":
+        _punctuation = "!'\",.:;? "
+        _math = "#%&*+-/[]()"
+        _special = "_@©°½—₩€$"
+        _accented = "áçéêëñöøćž"
+        _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+        _arpabet = ["@" + s for s in arpabet]
+        symbols = (
+            list(_punctuation + _math + _special + _accented + _letters) + _arpabet
+        )
+    elif symbol_set == "ukrainian":
+        _punctuation = "'.,?! "
+        _special = "-+"
+        _letters = "абвгґдежзийклмнопрстуфхцчшщьюяєії"
+        symbols = list(_punctuation + _special + _letters)
+    elif symbol_set == "radtts":
+        _punctuation = "!'\",.:;? "
+        _math = "#%&*+-/[]()"
+        _special = "_@©°½—₩€$"
+        _accented = "áçéêëñöøćž"
+        _numbers = "0123456789"
+        _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+        _arpabet = ["@" + s for s in arpabet]
+        symbols = (
+            list(_punctuation + _math + _special + _accented + _numbers + _letters)
+            + _arpabet
+        )
+    else:
+        raise Exception("{} symbol set does not exist".format(symbol_set))
+
+    return symbols
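
The ordering of get_symbols matters: token IDs are just list indices, so the list must match what the checkpoint was trained with. For the "ukrainian" set that is 6 punctuation marks, the "-" and "+" specials, and the 33 letters of the Ukrainian alphabet, 41 symbols in total. A quick check, not part of the commit:

    >>> from tts_text_processing.symbols import get_symbols
    >>> symbols = get_symbols("ukrainian")
    >>> len(symbols)
    41
    >>> symbols[:8]
    ["'", '.', ',', '?', '!', ' ', '-', '+']
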
tts_text_processing/text_processing.py
ADDED
@@ -0,0 +1,201 @@
+"""adapted from https://github.com/keithito/tacotron"""
+
+import re
+import numpy as np
+from .cleaners import Cleaner
+from .symbols import get_symbols
+from .grapheme_dictionary import Grapheme2PhonemeDictionary
+
+
+#########
+# REGEX #
+#########
+
+# Regular expression matching text enclosed in curly braces for encoding
+_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
+
+# Regular expression matching words and not-words
+_words_re = re.compile(
+    r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]+|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)"
+)
+
+
+def lines_to_list(filename):
+    with open(filename, encoding="utf-8") as f:
+        lines = f.readlines()
+    lines = [l.rstrip() for l in lines]
+    return lines
+
+
+class TextProcessing(object):
+    def __init__(
+        self,
+        symbol_set,
+        cleaner_name,
+        heteronyms_path,
+        phoneme_dict_path,
+        p_phoneme,
+        handle_phoneme,
+        handle_phoneme_ambiguous,
+        prepend_space_to_text=False,
+        append_space_to_text=False,
+        add_bos_eos_to_text=False,
+        encoding="latin-1",  # currently unused
+    ):
+        if heteronyms_path is not None and heteronyms_path != "":
+            self.heteronyms = set(lines_to_list(heteronyms_path))
+        else:
+            self.heteronyms = []
+        # phoneme dict is stubbed out with a plain dict; its .lookup() is only
+        # reached through get_phoneme(), i.e. when p_phoneme > 0
+        self.phonemedict = {}
+
+        self.p_phoneme = p_phoneme
+        self.handle_phoneme = handle_phoneme
+        self.handle_phoneme_ambiguous = handle_phoneme_ambiguous
+
+        self.symbols = get_symbols(symbol_set)
+        self.cleaner_names = cleaner_name
+        self.cleaner = Cleaner(cleaner_name, self.phonemedict)
+
+        self.prepend_space_to_text = prepend_space_to_text
+        self.append_space_to_text = append_space_to_text
+        self.add_bos_eos_to_text = add_bos_eos_to_text
+
+        if add_bos_eos_to_text:
+            self.symbols.append("<bos>")
+            self.symbols.append("<eos>")
+
+        # Mappings from symbol to numeric ID and vice versa:
+        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
+        self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
+
+    def text_to_sequence(self, text):
+        sequence = []
+
+        # Check for curly braces and treat their contents as phonemes:
+        while len(text):
+            m = _curly_re.match(text)
+            if not m:
+                sequence += self.symbols_to_sequence(text)
+                break
+            sequence += self.symbols_to_sequence(m.group(1))
+            sequence += self.phoneme_to_sequence(m.group(2))
+            text = m.group(3)
+
+        return sequence
+
+    def sequence_to_text(self, sequence):
+        result = ""
+        for symbol_id in sequence:
+            if symbol_id in self.id_to_symbol:
+                s = self.id_to_symbol[symbol_id]
+                # Enclose phonemes back in curly braces:
+                if len(s) > 1 and s[0] == "@":
+                    s = "{%s}" % s[1:]
+                result += s
+        return result.replace("}{", " ")
+
+    def clean_text(self, text):
+        text = self.cleaner(text)
+        return text
+
+    def symbols_to_sequence(self, symbols):
+        return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id]
+
+    def phoneme_to_sequence(self, text):
+        return self.symbols_to_sequence(["@" + s for s in text.split()])
+
+    def get_phoneme(self, word):
+        phoneme_suffix = ""
+
+        # heteronyms are left as graphemes; their pronunciation is ambiguous
+        if word.lower() in self.heteronyms:
+            return word
+
+        if len(word) > 2 and word.endswith("'s"):
+            phoneme = self.phonemedict.lookup(word)
+            if phoneme is None:
+                phoneme = self.phonemedict.lookup(word[:-2])
+                phoneme_suffix = "" if phoneme is None else " Z"
+
+        elif len(word) > 1 and word.endswith("s"):
+            phoneme = self.phonemedict.lookup(word)
+            if phoneme is None:
+                phoneme = self.phonemedict.lookup(word[:-1])
+                phoneme_suffix = "" if phoneme is None else " Z"
+        else:
+            phoneme = self.phonemedict.lookup(word)
+
+        if phoneme is None:
+            return word
+
+        if len(phoneme) > 1:
+            if self.handle_phoneme_ambiguous == "first":
+                phoneme = phoneme[0]
+            elif self.handle_phoneme_ambiguous == "random":
+                phoneme = np.random.choice(phoneme)
+            elif self.handle_phoneme_ambiguous == "ignore":
+                return word
+        else:
+            phoneme = phoneme[0]
+
+        phoneme = "{" + phoneme + phoneme_suffix + "}"
+
+        return phoneme
+
+    def encode_text(self, text, return_all=False):
+        text_clean = self.clean_text(text)
+        text = text_clean
+
+        text_phoneme = ""
+        if self.p_phoneme > 0:
+            text_phoneme = self.convert_to_phoneme(text)
+            text = text_phoneme
+
+        text_encoded = self.text_to_sequence(text)
+
+        if self.prepend_space_to_text:
+            text_encoded.insert(0, self.symbol_to_id[" "])
+
+        if self.append_space_to_text:
+            text_encoded.append(self.symbol_to_id[" "])
+
+        if self.add_bos_eos_to_text:
+            text_encoded.insert(0, self.symbol_to_id["<bos>"])
+            text_encoded.append(self.symbol_to_id["<eos>"])
+
+        if return_all:
+            return text_encoded, text_clean, text_phoneme
+
+        return text_encoded
+
+    def convert_to_phoneme(self, text):
+        if self.handle_phoneme == "sentence":
+            if np.random.uniform() < self.p_phoneme:
+                words = _words_re.findall(text)
+                text_phoneme = [
+                    self.get_phoneme(word[0])
+                    if (word[0] != "")
+                    else re.sub(r"\s(\d)", r"\1", word[1].upper())
+                    for word in words
+                ]
+                text_phoneme = "".join(text_phoneme)
+                text = text_phoneme
+        elif self.handle_phoneme == "word":
+            words = _words_re.findall(text)
+            text_phoneme = [
+                re.sub(r"\s(\d)", r"\1", word[1].upper())
+                if word[0] == ""
+                else (
+                    self.get_phoneme(word[0])
+                    if np.random.uniform() < self.p_phoneme
+                    else word[0]
+                )
+                for word in words
+            ]
+            text_phoneme = "".join(text_phoneme)
+            text = text_phoneme
+        elif self.handle_phoneme != "":
+            raise Exception(
+                "{} handle_phoneme is not supported".format(self.handle_phoneme)
+            )
+        return text
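
A minimal end-to-end sketch of TextProcessing, not part of the commit. The cleaner name and flag values below are assumptions for illustration; the values the Space actually uses live in configs/radtts-pp-dap-model.json and are presumably wired up in app.py:

    from tts_text_processing.text_processing import TextProcessing

    tp = TextProcessing(
        symbol_set="ukrainian",
        cleaner_name="ukrainian_cleaners",  # assumed name; depending on Cleaner it may need to be a list
        heteronyms_path="",                 # no heteronym list
        phoneme_dict_path="",               # unused: the phoneme dict above is a stub
        p_phoneme=0.0,                      # graphemes only, convert_to_phoneme is skipped
        handle_phoneme="word",
        handle_phoneme_ambiguous="ignore",
    )
    ids = tp.encode_text("привіт, світе!")  # "hello, world!"; list of IDs into get_symbols("ukrainian")
    text_back = tp.sequence_to_text(ids)    # maps the IDs back to the cleaned string

With p_phoneme at zero, encode_text reduces to clean_text followed by text_to_sequence, and neither the heteronym set nor the phoneme dictionary is ever consulted.
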