Clean up
- app.py +21 -7
- audio_processing.py +0 -54
- common.py +3 -1
- configs/radtts-pp-dap-model.json +0 -39
- data.py +3 -365
- export_weights.py +3 -3
- radtts.py +3 -1
- requirements.txt +1 -2
app.py
CHANGED
@@ -19,7 +19,7 @@ from huggingface_hub import hf_hub_download

 # RAD-TTS code
 from radtts import RADTTS
-from data import
+from data import TextProcessor
 from common import update_params
 from torch_env import device

@@ -100,10 +100,10 @@ radtts.eval()
 print(f"Loaded checkpoint '{radtts_path}')")

 ignore_keys = ["training_files", "validation_files"]
-
+tp = TextProcessor(
     data_config["training_files"],
     **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
-)
+).get_processor()

 # Config
 concurrency_limit = 5
@@ -186,6 +186,20 @@ examples = [
 ]


+def get_speaker_id(speaker):
+    speaker_ids = {
+        "lada": 0,
+        "mykyta": 1,
+        "tetiana": 2,
+    }
+
+    return torch.LongTensor([speaker_ids[speaker]])
+
+
+def get_text(text):
+    return torch.LongTensor(tp.encode_text(text))
+
+
 def inference(text, voice):
     if not text:
         raise gr.Error("Please paste your text.")
@@ -209,16 +223,16 @@ def inference(text, voice):
     energy_mean = 0
     energy_std = 0

-    tensor_text =
+    tensor_text = get_text(text).to(device)

-    speaker_id =
+    speaker_id = get_speaker_id(speaker).to(device)
     speaker_id_text, speaker_id_attributes = speaker_id, speaker_id

     if speaker_text is not None:
-        speaker_id_text =
+        speaker_id_text = get_speaker_id(speaker_text).to(device)

     if speaker_attributes is not None:
-        speaker_id_attributes =
+        speaker_id_attributes = get_speaker_id(speaker_attributes).to(device)

     inference_start = time.time()

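
A minimal usage sketch of the helpers added above, assuming the definitions from app.py (tp, get_speaker_id, get_text) are available in the current interpreter; the sample input string is illustrative, not output from the Space:

# Assumes app.py above has already been executed in this interpreter.
sid = get_speaker_id("tetiana")      # tensor([2]), per the lookup table in the diff
ids = get_text("some example text")  # LongTensor of token ids from tp.encode_text(...)

# inference() reuses the same id for text and attributes unless overridden
speaker_id_text = speaker_id_attributes = sid
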
audio_processing.py
CHANGED
@@ -55,7 +55,6 @@ import torch
 import numpy as np

 from scipy.signal import get_window
-from librosa.filters import mel as librosa_mel_fn
 import librosa.util as librosa_util

 import torch.nn.functional as F
@@ -159,59 +158,6 @@ def dynamic_range_decompression(x, C=1):
     return torch.exp(x) / C


-class TacotronSTFT(torch.nn.Module):
-    def __init__(
-        self,
-        filter_length=1024,
-        hop_length=256,
-        win_length=1024,
-        n_mel_channels=80,
-        sampling_rate=22050,
-        mel_fmin=0.0,
-        mel_fmax=None,
-    ):
-        super(TacotronSTFT, self).__init__()
-        self.n_mel_channels = n_mel_channels
-        self.sampling_rate = sampling_rate
-        self.stft_fn = STFT(filter_length, hop_length, win_length)
-        mel_basis = librosa_mel_fn(
-            sr=sampling_rate,
-            n_fft=filter_length,
-            n_mels=n_mel_channels,
-            fmin=mel_fmin,
-            fmax=mel_fmax,
-        )
-        mel_basis = torch.from_numpy(mel_basis).float()
-        self.register_buffer("mel_basis", mel_basis)
-
-    def spectral_normalize(self, magnitudes):
-        output = dynamic_range_compression(magnitudes)
-        return output
-
-    def spectral_de_normalize(self, magnitudes):
-        output = dynamic_range_decompression(magnitudes)
-        return output
-
-    def mel_spectrogram(self, y):
-        """Computes mel-spectrograms from a batch of waves
-        PARAMS
-        ------
-        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
-
-        RETURNS
-        -------
-        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
-        """
-        assert torch.min(y.data) >= -1
-        assert torch.max(y.data) <= 1
-
-        magnitudes, phases = self.stft_fn.transform(y)
-        magnitudes = magnitudes.data
-        mel_output = torch.matmul(self.mel_basis, magnitudes)
-        mel_output = self.spectral_normalize(mel_output)
-        return mel_output
-
-
 class STFT(torch.nn.Module):
     """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""

common.py
CHANGED
@@ -233,7 +233,9 @@ class ConvLSTMLinear(nn.Module):
                 dilation=1,
                 w_init_gain="relu",
             )
-            conv_layer = torch.nn.utils.parametrizations.weight_norm(
+            conv_layer = torch.nn.utils.parametrizations.weight_norm(
+                conv_layer.conv, name="weight"
+            )
             convolutions.append(conv_layer)

         self.convolutions = nn.ModuleList(convolutions)
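
A self-contained sketch of the parametrized weight-norm call used above; the Conv1d and its channel sizes are assumptions for illustration, only the weight_norm(..., name="weight") call mirrors the diff:

import torch
import torch.nn as nn

# Stand-in for the ConvNorm wrapper's inner .conv used in common.py.
conv = nn.Conv1d(in_channels=80, out_channels=80, kernel_size=3, padding=1)
conv = torch.nn.utils.parametrizations.weight_norm(conv, name="weight")

# The weight is now recomputed from magnitude/direction parameters on access.
print([name for name, _ in conv.named_parameters()])
# expected (roughly): ['bias', 'parametrizations.weight.original0',
#                      'parametrizations.weight.original1']
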
configs/radtts-pp-dap-model.json
CHANGED
@@ -1,39 +1,4 @@
 {
-    "train_config": {
-        "output_directory": "outdir_pp_model",
-        "epochs": 10000000,
-        "optim_algo": "RAdam",
-        "learning_rate": 0.001,
-        "weight_decay": 1e-06,
-        "sigma": 1.0,
-        "iters_per_checkpoint": 1000,
-        "batch_size": 16,
-        "seed": null,
-        "checkpoint_path": "",
-        "ignore_layers": [],
-        "ignore_layers_warmstart": [],
-        "finetune_layers": [],
-        "include_layers": [],
-        "vocoder_config_path": "models/hifigan_22khz_config.json",
-        "vocoder_checkpoint_path": "models/hifigan_ljs_generator_v1.pt",
-        "log_attribute_samples": true,
-        "log_decoder_samples": true,
-        "warmstart_checkpoint_path": "outdir_pp/model_100000",
-        "use_amp": true,
-        "grad_clip_val": 1.0,
-        "loss_weights": {
-            "blank_logprob": -1,
-            "ctc_loss_weight": 0.1,
-            "binarization_loss_weight": 1.0,
-            "dur_loss_weight": 1.0,
-            "f0_loss_weight": 1.0,
-            "energy_loss_weight": 1.0,
-            "vpred_loss_weight": 1.0
-        },
-        "binarization_start_iter": 0,
-        "kl_loss_start_iter": 0,
-        "unfreeze_modules": "all"
-    },
     "data_config": {
         "training_files": {
             "LJS": {
@@ -88,10 +53,6 @@
         "distance_tx_unvoiced": false,
         "mel_noise_scale": 0.0
     },
-    "dist_config": {
-        "dist_backend": "nccl",
-        "dist_url": "tcp://localhost:54321"
-    },
     "model_config": {
         "n_speakers": 3,
         "n_speaker_dim": 16,
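
The trimmed config keeps only the data_config and model_config sections; a minimal sketch (standard library only, file path and keys taken from the diff) of reading them:

import json

with open("configs/radtts-pp-dap-model.json") as f:
    config = json.load(f)

data_config = config["data_config"]    # includes training_files and related settings
model_config = config["model_config"]  # n_speakers, n_speaker_dim, ...
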
data.py
CHANGED
@@ -38,43 +38,13 @@
 #
 ###############################################################################

-import os
-import pickle as pkl
-
-import lmdb
 import torch
 import torch.utils.data
-import numpy as np
-
-from librosa import pyin
-from scipy.io.wavfile import read
-from scipy.stats import betabinom
-from scipy.ndimage import distance_transform_edt as distance_transform

-from audio_processing import TacotronSTFT
 from tts_text_processing.text_processing import TextProcessing


-
-    P = phoneme_count
-    M = mel_count
-    x = np.arange(0, P)
-    mel_text_probs = []
-    for i in range(1, M + 1):
-        a, b = scaling_factor * i, scaling_factor * (M + 1 - i)
-        rv = betabinom(P - 1, a, b)
-        mel_i_prob = rv.pmf(x)
-        mel_text_probs.append(mel_i_prob)
-    return torch.tensor(np.array(mel_text_probs))
-
-
-def load_wav_to_torch(full_path):
-    """Loads wavdata into torch array"""
-    sampling_rate, data = read(full_path)
-    return torch.from_numpy(np.array(data)).float(), sampling_rate
-
-
-class Data(torch.utils.data.Dataset):
+class TextProcessor(torch.utils.data.Dataset):
     def __init__(
         self,
         datasets,
@@ -114,37 +84,6 @@ class Data(torch.utils.data.Dataset):
         combine_speaker_and_emotion=False,
         **kwargs,
     ):
-        self.combine_speaker_and_emotion = combine_speaker_and_emotion
-        self.max_wav_value = max_wav_value
-        self.audio_lmdb_dict = {}  # dictionary of lmdbs for audio data
-        self.data = self.load_data(datasets)
-        self.distance_tx_unvoiced = False
-        if "distance_tx_unvoiced" in kwargs.keys():
-            self.distance_tx_unvoiced = kwargs["distance_tx_unvoiced"]
-        self.stft = TacotronSTFT(
-            filter_length=filter_length,
-            hop_length=hop_length,
-            win_length=win_length,
-            sampling_rate=sampling_rate,
-            n_mel_channels=n_mel_channels,
-            mel_fmin=mel_fmin,
-            mel_fmax=mel_fmax,
-        )
-
-        self.do_mel_scaling = kwargs.get("do_mel_scaling", True)
-        self.mel_noise_scale = kwargs.get("mel_noise_scale", 0.0)
-        self.filter_length = filter_length
-        self.hop_length = hop_length
-        self.win_length = win_length
-        self.mel_fmin = mel_fmin
-        self.mel_fmax = mel_fmax
-        self.f0_min = f0_min
-        self.f0_max = f0_max
-        self.use_f0 = use_f0
-        self.use_log_f0 = use_log_f0
-        self.use_energy_avg = use_energy_avg
-        self.use_scaled_energy = use_scaled_energy
-        self.sampling_rate = sampling_rate
         self.tp = TextProcessing(
             symbol_set,
             cleaner_names,
@@ -158,306 +97,5 @@ class Data(torch.utils.data.Dataset):
             add_bos_eos_to_text=add_bos_eos_to_text,
         )

-
-        self.
-        if speaker_ids is None or speaker_ids == "":
-            self.speaker_ids = self.create_speaker_lookup_table(self.data)
-        else:
-            self.speaker_ids = speaker_ids
-
-        print("Number of files", len(self.data))
-        if include_speakers is not None:
-            for speaker_set, include in include_speakers:
-                self.filter_by_speakers_(speaker_set, include)
-            print("Number of files after speaker filtering", len(self.data))
-
-        if dur_min is not None and dur_max is not None:
-            self.filter_by_duration_(dur_min, dur_max)
-            print("Number of files after duration filtering", len(self.data))
-
-        self.use_attn_prior_masking = bool(use_attn_prior_masking)
-        self.prepend_space_to_text = bool(prepend_space_to_text)
-        self.append_space_to_text = bool(append_space_to_text)
-        self.betabinom_cache_path = betabinom_cache_path
-        self.betabinom_scaling_factor = betabinom_scaling_factor
-        self.lmdb_cache_path = lmdb_cache_path
-        if self.lmdb_cache_path != "":
-            self.cache_data_lmdb = lmdb.open(
-                self.lmdb_cache_path, readonly=True, max_readers=1024, lock=False
-            ).begin()
-
-        # # make sure caching path exists
-        # if not os.path.exists(self.betabinom_cache_path):
-        #     os.makedirs(self.betabinom_cache_path)
-
-        print("Dataloader initialized with no augmentations")
-        self.speaker_map = None
-        if "speaker_map" in kwargs:
-            self.speaker_map = kwargs["speaker_map"]
-
-    def load_data(self, datasets, split="|"):
-        dataset = []
-        for dset_name, dset_dict in datasets.items():
-            folder_path = dset_dict["basedir"]
-            audiodir = dset_dict["audiodir"]
-            filename = dset_dict["filelist"]
-            audio_lmdb_key = None
-            if "lmdbpath" in dset_dict.keys() and len(dset_dict["lmdbpath"]) > 0:
-                self.audio_lmdb_dict[dset_name] = lmdb.open(
-                    dset_dict["lmdbpath"], readonly=True, max_readers=256, lock=False
-                ).begin()
-                audio_lmdb_key = dset_name
-
-            wav_folder_prefix = os.path.join(folder_path, audiodir)
-            filelist_path = os.path.join(folder_path, filename)
-            with open(filelist_path, encoding="utf-8") as f:
-                data = [line.strip().split(split) for line in f]
-
-            for d in data:
-                emotion = "other" if len(d) == 3 else d[3]
-                duration = -1 if len(d) == 3 else d[4]
-                dataset.append(
-                    {
-                        "audiopath": os.path.join(wav_folder_prefix, d[0]),
-                        "text": d[1],
-                        "speaker": d[2] + "-" + emotion
-                        if self.combine_speaker_and_emotion
-                        else d[2],
-                        "emotion": emotion,
-                        "duration": float(duration),
-                        "lmdb_key": audio_lmdb_key,
-                    }
-                )
-        return dataset
-
-    def filter_by_speakers_(self, speakers, include=True):
-        print("Include spaker {}: {}".format(speakers, include))
-        if include:
-            self.data = [x for x in self.data if x["speaker"] in speakers]
-        else:
-            self.data = [x for x in self.data if x["speaker"] not in speakers]
-
-    def filter_by_duration_(self, dur_min, dur_max):
-        self.data = [
-            x
-            for x in self.data
-            if x["duration"] == -1
-            or (x["duration"] >= dur_min and x["duration"] <= dur_max)
-        ]
-
-    def create_speaker_lookup_table(self, data):
-        speaker_ids = np.sort(np.unique([x["speaker"] for x in data]))
-        d = {speaker_ids[i]: i for i in range(len(speaker_ids))}
-        print("Number of speakers:", len(d))
-        print("Speaker IDS", d)
-        return d
-
-    def f0_normalize(self, x):
-        if self.use_log_f0:
-            mask = x >= self.f0_min
-            x[mask] = torch.log(x[mask])
-            x[~mask] = 0.0
-
-        return x
-
-    def f0_denormalize(self, x):
-        if self.use_log_f0:
-            log_f0_min = np.log(self.f0_min)
-            mask = x >= log_f0_min
-            x[mask] = torch.exp(x[mask])
-            x[~mask] = 0.0
-        x[x <= 0.0] = 0.0
-
-        return x
-
-    def energy_avg_normalize(self, x):
-        if self.use_scaled_energy:
-            x = (x + 20.0) / 20.0
-        return x
-
-    def energy_avg_denormalize(self, x):
-        if self.use_scaled_energy:
-            x = x * 20.0 - 20.0
-        return x
-
-    def get_f0_pvoiced(
-        self,
-        audio,
-        sampling_rate=22050,
-        frame_length=1024,
-        hop_length=256,
-        f0_min=100,
-        f0_max=300,
-    ):
-        audio_norm = audio / self.max_wav_value
-        f0, voiced_mask, p_voiced = pyin(
-            audio_norm,
-            f0_min,
-            f0_max,
-            sampling_rate,
-            frame_length=frame_length,
-            win_length=frame_length // 2,
-            hop_length=hop_length,
-        )
-        f0[~voiced_mask] = 0.0
-        f0 = torch.FloatTensor(f0)
-        p_voiced = torch.FloatTensor(p_voiced)
-        voiced_mask = torch.FloatTensor(voiced_mask)
-        return f0, voiced_mask, p_voiced
-
-    def get_energy_average(self, mel):
-        energy_avg = mel.mean(0)
-        energy_avg = self.energy_avg_normalize(energy_avg)
-        return energy_avg
-
-    def get_mel(self, audio):
-        audio_norm = audio / self.max_wav_value
-        audio_norm = audio_norm.unsqueeze(0)
-        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
-        melspec = self.stft.mel_spectrogram(audio_norm)
-        melspec = torch.squeeze(melspec, 0)
-        if self.do_mel_scaling:
-            melspec = (melspec + 5.5) / 2
-        if self.mel_noise_scale > 0:
-            melspec += torch.randn_like(melspec) * self.mel_noise_scale
-        return melspec
-
-    def get_speaker_id(self, speaker):
-        if self.speaker_map is not None and speaker in self.speaker_map:
-            speaker = self.speaker_map[speaker]
-
-        return torch.LongTensor([self.speaker_ids[speaker]])
-
-    def get_text(self, text):
-        text = self.tp.encode_text(text)
-        text = torch.LongTensor(text)
-        return text
-
-    def get_attention_prior(self, n_tokens, n_frames):
-        # cache the entire attn_prior by filename
-        if self.use_attn_prior_masking:
-            filename = "{}_{}".format(n_tokens, n_frames)
-            prior_path = os.path.join(self.betabinom_cache_path, filename)
-            prior_path += "_prior.pth"
-            if self.lmdb_cache_path != "":
-                attn_prior = pkl.loads(
-                    self.cache_data_lmdb.get(prior_path.encode("ascii"))
-                )
-            elif os.path.exists(prior_path):
-                attn_prior = torch.load(prior_path)
-            else:
-                attn_prior = beta_binomial_prior_distribution(
-                    n_tokens, n_frames, self.betabinom_scaling_factor
-                )
-                torch.save(attn_prior, prior_path)
-        else:
-            attn_prior = torch.ones(n_frames, n_tokens)  # all ones baseline
-
-        return attn_prior
-
-    def __getitem__(self, index):
-        data = self.data[index]
-        audiopath, text = data["audiopath"], data["text"]
-        speaker_id = data["speaker"]
-
-        if data["lmdb_key"] is not None:
-            data_dict = pkl.loads(
-                self.audio_lmdb_dict[data["lmdb_key"]].get(audiopath.encode("ascii"))
-            )
-            audio = data_dict["audio"]
-            sampling_rate = data_dict["sampling_rate"]
-        else:
-            audio, sampling_rate = load_wav_to_torch(audiopath)
-
-        if sampling_rate != self.sampling_rate:
-            raise ValueError(
-                "{} SR doesn't match target {} SR".format(
-                    sampling_rate, self.sampling_rate
-                )
-            )
-
-        mel = self.get_mel(audio)
-        f0 = None
-        p_voiced = None
-        voiced_mask = None
-        if self.use_f0:
-            filename = "_".join(audiopath.split("/")[-3:])
-            f0_path = os.path.join(self.betabinom_cache_path, filename)
-            f0_path += "_f0_sr{}_fl{}_hl{}_f0min{}_f0max{}_log{}.pt".format(
-                self.sampling_rate,
-                self.filter_length,
-                self.hop_length,
-                self.f0_min,
-                self.f0_max,
-                self.use_log_f0,
-            )
-
-            dikt = None
-            if len(self.lmdb_cache_path) > 0:
-                dikt = pkl.loads(self.cache_data_lmdb.get(f0_path.encode("ascii")))
-                f0 = dikt["f0"]
-                p_voiced = dikt["p_voiced"]
-                voiced_mask = dikt["voiced_mask"]
-            elif os.path.exists(f0_path):
-                try:
-                    dikt = torch.load(f0_path)
-                except Exception as e:
-                    print(e)
-                    print(f"f0 loading from {f0_path} is broken, recomputing.")
-
-            if dikt is not None:
-                f0 = dikt["f0"]
-                p_voiced = dikt["p_voiced"]
-                voiced_mask = dikt["voiced_mask"]
-            else:
-                f0, voiced_mask, p_voiced = self.get_f0_pvoiced(
-                    audio.cpu().numpy(),
-                    self.sampling_rate,
-                    self.filter_length,
-                    self.hop_length,
-                    self.f0_min,
-                    self.f0_max,
-                )
-                print("saving f0 to {}".format(f0_path))
-                torch.save(
-                    {"f0": f0, "voiced_mask": voiced_mask, "p_voiced": p_voiced},
-                    f0_path,
-                )
-            if f0 is None:
-                raise Exception("STOP, BROKEN F0 {}".format(audiopath))
-
-            f0 = self.f0_normalize(f0)
-            if self.distance_tx_unvoiced:
-                mask = f0 <= 0.0
-                distance_map = np.log(distance_transform(mask))
-                distance_map[distance_map <= 0] = 0.0
-                f0 = f0 - distance_map
-
-        energy_avg = None
-        if self.use_energy_avg:
-            energy_avg = self.get_energy_average(mel)
-            if self.use_scaled_energy and energy_avg.min() < 0.0:
-                print(audiopath, "has scaled energy avg smaller than 0")
-
-        speaker_id = self.get_speaker_id(speaker_id)
-        text_encoded = self.get_text(text)
-
-        attn_prior = self.get_attention_prior(text_encoded.shape[0], mel.shape[1])
-
-        if not self.use_attn_prior_masking:
-            attn_prior = None
-
-        return {
-            "mel": mel,
-            "speaker_id": speaker_id,
-            "text_encoded": text_encoded,
-            "audiopath": audiopath,
-            "attn_prior": attn_prior,
-            "f0": f0,
-            "p_voiced": p_voiced,
-            "voiced_mask": voiced_mask,
-            "energy_avg": energy_avg,
-        }
-
-    def __len__(self):
-        return len(self.data)
+    def get_processor(self):
+        return self.tp
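
A hedged sketch of how the slimmed-down TextProcessor is consumed, mirroring the app.py hunk above; the sample sentence and the assumption that encode_text returns a list of integer token ids are not stated in the diff:

import json
from data import TextProcessor

with open("configs/radtts-pp-dap-model.json") as f:
    data_config = json.load(f)["data_config"]

ignore_keys = ["training_files", "validation_files"]
tp = TextProcessor(
    data_config["training_files"],
    **{k: v for k, v in data_config.items() if k not in ignore_keys},
).get_processor()

token_ids = tp.encode_text("a sample sentence")  # assumed: a list of integer token ids
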
export_weights.py
CHANGED
@@ -5,9 +5,9 @@ radtts_path_state = "models/radtts-pp-dap-model/model_dap_84000_state.pt"

 checkpoint_dict = torch.load(radtts_path, map_location="cpu")

-del checkpoint_dict[
-del checkpoint_dict[
-del checkpoint_dict[
+del checkpoint_dict["iteration"]
+del checkpoint_dict["optimizer"]
+del checkpoint_dict["learning_rate"]

 print(checkpoint_dict.keys())

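
A hedged sketch of the surrounding export script: only the three del statements, the print, and the radtts_path_state value appear in the diff; the source checkpoint filename and the final torch.save are assumptions about what the rest of the script does:

import torch

radtts_path = "models/radtts-pp-dap-model/model_dap_84000.pt"  # hypothetical source checkpoint
radtts_path_state = "models/radtts-pp-dap-model/model_dap_84000_state.pt"

checkpoint_dict = torch.load(radtts_path, map_location="cpu")

# Drop training-only entries so only the inference weights/state remain.
for key in ("iteration", "optimizer", "learning_rate"):
    checkpoint_dict.pop(key, None)

print(checkpoint_dict.keys())
torch.save(checkpoint_dict, radtts_path_state)  # assumption: persist the slimmed checkpoint
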
radtts.py
CHANGED
@@ -201,7 +201,9 @@ class RADTTS(torch.nn.Module):
         if context_lstm_norm is not None:
             if "spectral" in context_lstm_norm:
                 print("Applying spectral norm to context encoder LSTM")
-                lstm_norm_fn_pntr =
+                lstm_norm_fn_pntr = (
+                    torch.nn.utils.parametrizations.spectral_norm
+                )
             elif "weight" in context_lstm_norm:
                 print("Applying weight norm to context encoder LSTM")
                 lstm_norm_fn_pntr = torch.nn.utils.parametrizations.weight_norm
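
A small sketch of the norm-selection pattern the new lines complete; the helper function and the Linear module are illustrative assumptions (radtts.py applies the chosen function to its context-encoder LSTM):

import torch

def pick_lstm_norm(context_lstm_norm):
    # Mirrors the if/elif above: map a config string to a parametrization function.
    if context_lstm_norm is None:
        return None
    if "spectral" in context_lstm_norm:
        return torch.nn.utils.parametrizations.spectral_norm
    if "weight" in context_lstm_norm:
        return torch.nn.utils.parametrizations.weight_norm
    raise ValueError(f"unknown context_lstm_norm: {context_lstm_norm}")

norm_fn = pick_lstm_norm("spectral_norm")
layer = norm_fn(torch.nn.Linear(16, 16))  # shown on a Linear here for brevity
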
requirements.txt
CHANGED
@@ -1,12 +1,11 @@
 huggingface_hub

-gradio
+gradio

 torch
 torchaudio
 scipy
 numba
-lmdb
 librosa

 git+https://github.com/langtech-bsc/vocos.git@matcha