Spaces:

Yehor
/

radtts-uk-vocos-demo

Running

App Files Files Community

Yehor committed 12 days ago

Commit

6f17241

1 Parent(s): 5564ad9

Add advanced options

Browse files

Files changed (3) hide show

app.py +171 -55
archs/radtts.txt +530 -0
archs/vocos.txt +26 -0

app.py CHANGED Viewed

@@ -73,10 +73,7 @@ download_file_from_repo(
 params = []
 # Load the config
-with open("config.json") as f:
-    data = f.read()
-config = json.loads(data)
 update_params(config, params)
 data_config = config["data_config"]
@@ -115,10 +112,13 @@ vocos_params = f"{sum(param.numel() for param in vocos.parameters()):,}"
 print(f"Loaded checkpoint (RAD-TTS++), number of parameters: {radtts_params}")
 print(f"Loaded checkpoint (Vocos), number of parameters: {vocos_params}")
-ignore_keys = ["training_files", "validation_files"]
 text_processor = TextProcessor(
     data_config["training_files"],
-    **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
 )
 # Config
@@ -210,58 +210,70 @@ examples = [
 ]
-def get_speaker_id(speaker):
-    return torch.LongTensor([voices[speaker]])
-def get_text(text):
-    return torch.LongTensor(text_processor.tp.encode_text(text))
-def inference(text, voice):
     if not text:
         raise gr.Error("Please paste your text.")
-    gr.Info("Starting...", duration=0.5)
     speaker = speaker_text = speaker_attributes = voice.lower()
-    n_takes = 1
-    sigma = 0.8  # sampling sigma for decoder
-    sigma_tkndur = 0.666  # sampling sigma for duration
-    sigma_f0 = 1.0  # sampling sigma for f0
-    sigma_energy = 1.0  # sampling sigma for energy avg
-    token_dur_scaling = 1.0
-    f0_mean = 0
-    f0_std = 0
-    energy_mean = 0
-    energy_std = 0
-    tensor_text = get_text(text).to(device)
-    speaker_id = speaker_id_text = speaker_id_attributes = get_speaker_id(speaker).to(
-        device
-    )
     if speaker_text is not None:
-        speaker_id_text = get_speaker_id(speaker_text).to(device)
     if speaker_attributes is not None:
-        speaker_id_attributes = get_speaker_id(speaker_attributes).to(device)
     inference_start = time.time()
-    for take in range(n_takes):
         with torch.autocast(device, enabled=False):
             with torch.inference_mode():
                 outputs = radtts.infer(
                     speaker_id,
                     tensor_text[None],
-                    sigma,
-                    sigma_tkndur,
                     sigma_f0,
                     sigma_energy,
                     token_dur_scaling,
@@ -274,18 +286,21 @@ def inference(text, voice):
                     energy_std=energy_std,
                 )
-                mel = outputs["mel"]
-                gr.Info(
-                    "Synthesized MEL spectrogram, converting to WAVE.", duration=0.5
-                )
-                wav_gen = vocos.decode(mel)
-                wav_gen_float = wav_gen.cpu()
-                torchaudio.save("audio.wav", wav_gen_float, 44_100, encoding="PCM_S")
-                duration = len(wav_gen_float[0]) / 44_100
     elapsed_time = time.time() - inference_start
     rtf = elapsed_time / duration
@@ -303,14 +318,12 @@ def inference(text, voice):
 try:
     @spaces.GPU
-    def inference_zerogpu(text, voice):
-        return inference(text, voice)
 except NameError:
-    print("ZeroGPU is not available, skipping...")
-def inference_cpu(text, voice):
-    return inference(text, voice)
 demo = gr.Blocks(
@@ -344,17 +357,120 @@ with demo:
                 value="Tetiana",
             )
     gr.Button("Run").click(
         inference_zerogpu if use_zerogpu else inference_cpu,
         concurrency_limit=concurrency_limit,
-        inputs=[text, voice],
         outputs=[audio, rtf],
     )
     with gr.Row():
         gr.Examples(
             label="Choose an example",
-            inputs=[text, voice],
             examples=examples,
         )

 params = []
 # Load the config
+config = json.loads(Path("config.json").read_text())
 update_params(config, params)
 data_config = config["data_config"]
 print(f"Loaded checkpoint (RAD-TTS++), number of parameters: {radtts_params}")
 print(f"Loaded checkpoint (Vocos), number of parameters: {vocos_params}")
 text_processor = TextProcessor(
     data_config["training_files"],
+    **dict(
+        (k, v)
+        for k, v in data_config.items()
+        if k not in ["training_files", "validation_files"]
+    ),
 )
 # Config
 ]
+def inference(
+    text,
+    voice,
+    n_takes,
+    use_latest_take,
+    token_dur_scaling,
+    f0_mean,
+    f0_std,
+    energy_mean,
+    energy_std,
+    sigma_decoder,
+    sigma_token_duration,
+    sigma_f0,
+    sigma_energy,
+):
     if not text:
         raise gr.Error("Please paste your text.")
+    request = {
+        "text": text,
+        "voice": voice,
+        "n_takes": n_takes,
+        "use_latest_take": use_latest_take,
+        "token_dur_scaling": token_dur_scaling,
+        "f0_mean": f0_mean,
+        "f0_std": f0_std,
+        "energy_mean": energy_mean,
+        "energy_std": energy_std,
+        "sigma_decoder": sigma_decoder,
+        "sigma_token_duration": sigma_token_duration,
+        "sigma_f0": sigma_f0,
+        "sigma_energy": sigma_energy,
+    }
+    print(json.dumps(request, indent=2))
     speaker = speaker_text = speaker_attributes = voice.lower()
+    tensor_text = torch.LongTensor(text_processor.tp.encode_text(text)).to(device)
+    speaker_tensor = torch.LongTensor([voices[speaker]]).to(device)
+    speaker_id = speaker_id_text = speaker_id_attributes = speaker_tensor
     if speaker_text is not None:
+        speaker_id_text = torch.LongTensor([voices[speaker_text]]).to(device)
     if speaker_attributes is not None:
+        speaker_id_attributes = torch.LongTensor([voices[speaker_attributes]]).to(
+            device
+        )
     inference_start = time.time()
+    mels = []
+    for n_take in range(n_takes):
+        gr.Info(f"Inferencing take {n_take + 1}", duration=1)
         with torch.autocast(device, enabled=False):
             with torch.inference_mode():
                 outputs = radtts.infer(
                     speaker_id,
                     tensor_text[None],
+                    sigma_decoder,
+                    sigma_token_duration,
                     sigma_f0,
                     sigma_energy,
                     token_dur_scaling,
                     energy_std=energy_std,
                 )
+                mels.append(outputs["mel"])
+    gr.Info("Synthesized MEL spectrograms, converting to WAVE.", duration=0.5)
+    wav_gen_all = []
+    for mel in mels:
+        wav_gen_all.append(vocos.decode(mel))
+    if use_latest_take:
+        wav_gen = wav_gen_all[-1]  # Get the latest generated wav
+    else:
+        wav_gen = torch.cat(wav_gen_all, dim=1) # Concatenate all the generated wavs
+    duration = len(wav_gen[0]) / 44_100
+    torchaudio.save("audio.wav", wav_gen.cpu(), 44_100, encoding="PCM_S")
     elapsed_time = time.time() - inference_start
     rtf = elapsed_time / duration
 try:
     @spaces.GPU
+    def inference_zerogpu(*args):
+        return inference(*args)
 except NameError:
+    def inference_cpu(*args):
+        return inference(*args)
 demo = gr.Blocks(
                 value="Tetiana",
             )
+            with gr.Accordion("Advanced options", open=False):
+                gr.Markdown("You can change the voice, speed, and other parameters.")
+                with gr.Column():
+                    n_takes = gr.Number(
+                        label="Number of takes",
+                        value=1,
+                        minimum=1,
+                        maximum=10,
+                        step=1,
+                    )
+                    use_latest_take = gr.Checkbox(
+                        label="Use the latest take",
+                        value=False,
+                    )
+                    token_dur_scaling = gr.Number(
+                        label="Token duration scaling",
+                        value=1.0,
+                        minimum=0.0,
+                        maximum=10,
+                        step=0.1,
+                    )
+                with gr.Row():
+                    f0_mean = gr.Number(
+                        label="F0 mean",
+                        value=0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    f0_std = gr.Number(
+                        label="F0 std",
+                        value=0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    energy_mean = gr.Number(
+                        label="Energy mean",
+                        value=0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    energy_std = gr.Number(
+                        label="Energy std",
+                        value=0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                with gr.Row():
+                    sigma_decoder = gr.Number(
+                        label="Sampling sigma for decoder",
+                        value=0.8,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    sigma_token_duration = gr.Number(
+                        label="Sampling sigma for duration",
+                        value=0.666,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    sigma_f0 = gr.Number(
+                        label="Sampling sigma for F0",
+                        value=1.0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    sigma_energy = gr.Number(
+                        label="Sampling sigma for energy avg",
+                        value=1.0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
     gr.Button("Run").click(
         inference_zerogpu if use_zerogpu else inference_cpu,
         concurrency_limit=concurrency_limit,
+        inputs=[
+            text,
+            voice,
+            n_takes,
+            use_latest_take,
+            token_dur_scaling,
+            f0_mean,
+            f0_std,
+            energy_mean,
+            energy_std,
+            sigma_decoder,
+            sigma_token_duration,
+            sigma_f0,
+            sigma_energy,
+        ],
         outputs=[audio, rtf],
     )
     with gr.Row():
         gr.Examples(
             label="Choose an example",
+            inputs=[
+                text,
+                voice,
+            ],
             examples=examples,
         )

archs/radtts.txt ADDED Viewed

	@@ -0,0 +1,530 @@

+RADTTS(
+  (speaker_embedding): Embedding(3, 16)
+  (embedding): Embedding(185, 512)
+  (flows): ModuleList(
+    (0-1): 2 x FlowStep(
+      (invtbl_conv): Invertible1x1ConvLUS()
+      (affine_tfn): AffineTransformationLayer(
+        (affine_param_predictor): WN(
+          (in_layers): ModuleList(
+            (0): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (1): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (2): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (3): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+          )
+          (res_skip_layers): ModuleList(
+            (0-3): 4 x ParametrizedConv1d(
+              1024, 1024, kernel_size=(1,), stride=(1,)
+              (parametrizations): ModuleDict(
+                (weight): ParametrizationList(
+                  (0): _WeightNorm()
+                )
+              )
+            )
+          )
+          (start): ParametrizedConv1d(
+            1120, 1024, kernel_size=(1,), stride=(1,)
+            (parametrizations): ModuleDict(
+              (weight): ParametrizationList(
+                (0): _WeightNorm()
+              )
+            )
+          )
+          (softplus): Softplus(beta=1.0, threshold=20.0)
+          (end): Conv1d(1024, 160, kernel_size=(1,), stride=(1,))
+        )
+      )
+    )
+    (2-3): 2 x FlowStep(
+      (invtbl_conv): Invertible1x1ConvLUS()
+      (affine_tfn): AffineTransformationLayer(
+        (affine_param_predictor): WN(
+          (in_layers): ModuleList(
+            (0): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (1): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (2): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (3): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+          )
+          (res_skip_layers): ModuleList(
+            (0-3): 4 x ParametrizedConv1d(
+              1024, 1024, kernel_size=(1,), stride=(1,)
+              (parametrizations): ModuleDict(
+                (weight): ParametrizationList(
+                  (0): _WeightNorm()
+                )
+              )
+            )
+          )
+          (start): ParametrizedConv1d(
+            1119, 1024, kernel_size=(1,), stride=(1,)
+            (parametrizations): ModuleDict(
+              (weight): ParametrizationList(
+                (0): _WeightNorm()
+              )
+            )
+          )
+          (softplus): Softplus(beta=1.0, threshold=20.0)
+          (end): Conv1d(1024, 158, kernel_size=(1,), stride=(1,))
+        )
+      )
+    )
+    (4-5): 2 x FlowStep(
+      (invtbl_conv): Invertible1x1ConvLUS()
+      (affine_tfn): AffineTransformationLayer(
+        (affine_param_predictor): WN(
+          (in_layers): ModuleList(
+            (0): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (1): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (2): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (3): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+          )
+          (res_skip_layers): ModuleList(
+            (0-3): 4 x ParametrizedConv1d(
+              1024, 1024, kernel_size=(1,), stride=(1,)
+              (parametrizations): ModuleDict(
+                (weight): ParametrizationList(
+                  (0): _WeightNorm()
+                )
+              )
+            )
+          )
+          (start): ParametrizedConv1d(
+            1118, 1024, kernel_size=(1,), stride=(1,)
+            (parametrizations): ModuleDict(
+              (weight): ParametrizationList(
+                (0): _WeightNorm()
+              )
+            )
+          )
+          (softplus): Softplus(beta=1.0, threshold=20.0)
+          (end): Conv1d(1024, 156, kernel_size=(1,), stride=(1,))
+        )
+      )
+    )
+    (6-7): 2 x FlowStep(
+      (invtbl_conv): Invertible1x1ConvLUS()
+      (affine_tfn): AffineTransformationLayer(
+        (affine_param_predictor): WN(
+          (in_layers): ModuleList(
+            (0): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (1): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (2): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (3): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+          )
+          (res_skip_layers): ModuleList(
+            (0-3): 4 x ParametrizedConv1d(
+              1024, 1024, kernel_size=(1,), stride=(1,)
+              (parametrizations): ModuleDict(
+                (weight): ParametrizationList(
+                  (0): _WeightNorm()
+                )
+              )
+            )
+          )
+          (start): ParametrizedConv1d(
+            1117, 1024, kernel_size=(1,), stride=(1,)
+            (parametrizations): ModuleDict(
+              (weight): ParametrizationList(
+                (0): _WeightNorm()
+              )
+            )
+          )
+          (softplus): Softplus(beta=1.0, threshold=20.0)
+          (end): Conv1d(1024, 154, kernel_size=(1,), stride=(1,))
+        )
+      )
+    )
+  )
+  (encoder): Encoder(
+    (convolutions): ModuleList(
+      (0-2): 3 x Sequential(
+        (0): ConvNorm(
+          (conv): PartialConv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
+        )
+        (1): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
+      )
+    )
+    (lstm): ParametrizedLSTM(
+      512, 256, batch_first=True, bidirectional=True
+      (parametrizations): ModuleDict(
+        (weight_hh_l0): ParametrizationList(
+          (0): _SpectralNorm()
+        )
+        (weight_hh_l0_reverse): ParametrizationList(
+          (0): _SpectralNorm()
+        )
+      )
+    )
+  )
+  (length_regulator): LengthRegulator()
+  (attention): ConvAttention(
+    (softmax): Softmax(dim=3)
+    (log_softmax): LogSoftmax(dim=3)
+    (key_proj): Sequential(
+      (0): ConvNorm(
+        (conv): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
+      )
+      (1): ReLU()
+      (2): ConvNorm(
+        (conv): Conv1d(1024, 80, kernel_size=(1,), stride=(1,))
+      )
+    )
+    (query_proj): Sequential(
+      (0): ConvNorm(
+        (conv): Conv1d(80, 160, kernel_size=(3,), stride=(1,), padding=(1,))
+      )
+      (1): ReLU()
+      (2): ConvNorm(
+        (conv): Conv1d(160, 80, kernel_size=(1,), stride=(1,))
+      )
+      (3): ReLU()
+      (4): ConvNorm(
+        (conv): Conv1d(80, 80, kernel_size=(1,), stride=(1,))
+      )
+    )
+  )
+  (context_lstm): ParametrizedLSTM(
+    1044, 520, batch_first=True, bidirectional=True
+    (parametrizations): ModuleDict(
+      (weight_hh_l0): ParametrizationList(
+        (0): _SpectralNorm()
+      )
+      (weight_hh_l0_reverse): ParametrizationList(
+        (0): _SpectralNorm()
+      )
+    )
+  )
+  (unfold): Unfold(kernel_size=(2, 1), dilation=1, padding=0, stride=2)
+  (dur_pred_layer): DAP(
+    (bottleneck_layer): BottleneckLayerLayer(
+      (projection_fn): ConvNorm(
+        (conv): ParametrizedConv1d(
+          512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (non_linearity): ReLU()
+    )
+    (feat_pred_fn): ConvLSTMLinear(
+      (dropout): Dropout(p=0.25, inplace=False)
+      (convolutions): ModuleList(
+        (0): ParametrizedConv1d(
+          48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (1): ParametrizedConv1d(
+          256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (bilstm): ParametrizedLSTM(
+        256, 128, batch_first=True, bidirectional=True
+        (parametrizations): ModuleDict(
+          (weight_hh_l0): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+          (weight_hh_l0_reverse): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+        )
+      )
+      (dense): Linear(in_features=256, out_features=1, bias=True)
+    )
+  )
+  (unvoiced_bias_module): Sequential(
+    (0): LinearNorm(
+      (linear_layer): Linear(in_features=512, out_features=1, bias=True)
+    )
+    (1): ReLU()
+  )
+  (v_pred_module): DAP(
+    (bottleneck_layer): BottleneckLayerLayer(
+      (projection_fn): ConvNorm(
+        (conv): ParametrizedConv1d(
+          512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (non_linearity): ReLU()
+    )
+    (feat_pred_fn): ConvLSTMLinear(
+      (dropout): Dropout(p=0.5, inplace=False)
+      (convolutions): ModuleList(
+        (0): ParametrizedConv1d(
+          48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (1): ParametrizedConv1d(
+          256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (dense): Linear(in_features=256, out_features=1, bias=True)
+    )
+  )
+  (v_embeddings): Embedding(4, 512)
+  (f0_pred_module): DAP(
+    (bottleneck_layer): BottleneckLayerLayer(
+      (projection_fn): ConvNorm(
+        (conv): ParametrizedConv1d(
+          512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (non_linearity): ReLU()
+    )
+    (feat_pred_fn): ConvLSTMLinear(
+      (dropout): Dropout(p=0.5, inplace=False)
+      (convolutions): ModuleList(
+        (0): ParametrizedConv1d(
+          48, 256, kernel_size=(11,), stride=(1,), padding=(5,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (1): ParametrizedConv1d(
+          256, 256, kernel_size=(11,), stride=(1,), padding=(5,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (bilstm): ParametrizedLSTM(
+        256, 128, batch_first=True, bidirectional=True
+        (parametrizations): ModuleDict(
+          (weight_hh_l0): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+          (weight_hh_l0_reverse): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+        )
+      )
+      (dense): Linear(in_features=256, out_features=1, bias=True)
+    )
+  )
+  (energy_pred_module): DAP(
+    (bottleneck_layer): BottleneckLayerLayer(
+      (projection_fn): ConvNorm(
+        (conv): ParametrizedConv1d(
+          512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (non_linearity): ReLU()
+    )
+    (feat_pred_fn): ConvLSTMLinear(
+      (dropout): Dropout(p=0.25, inplace=False)
+      (convolutions): ModuleList(
+        (0): ParametrizedConv1d(
+          48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (1): ParametrizedConv1d(
+          256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (bilstm): ParametrizedLSTM(
+        256, 128, batch_first=True, bidirectional=True
+        (parametrizations): ModuleDict(
+          (weight_hh_l0): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+          (weight_hh_l0_reverse): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+        )
+      )
+      (dense): Linear(in_features=256, out_features=1, bias=True)
+    )
+  )
+)

archs/vocos.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+Vocos(
+  (feature_extractor): MelSpectrogramFeatures(
+    (mel_spec): MelSpectrogram(
+      (spectrogram): Spectrogram()
+      (mel_scale): MelScale()
+    )
+  )
+  (backbone): VocosBackbone(
+    (embed): Conv1d(80, 512, kernel_size=(7,), stride=(1,), padding=(3,))
+    (norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
+    (convnext): ModuleList(
+      (0-7): 8 x ConvNeXtBlock(
+        (dwconv): Conv1d(512, 512, kernel_size=(7,), stride=(1,), padding=(3,), groups=512)
+        (norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
+        (pwconv1): Linear(in_features=512, out_features=1536, bias=True)
+        (act): GELU(approximate='none')
+        (pwconv2): Linear(in_features=1536, out_features=512, bias=True)
+      )
+    )
+    (final_layer_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
+  )
+  (head): ISTFTHead(
+    (out): Linear(in_features=512, out_features=2050, bias=True)
+    (istft): ISTFT()
+  )
+)