Spaces:
Running
Running
Add advanced options
Browse files- app.py +171 -55
- archs/radtts.txt +530 -0
- archs/vocos.txt +26 -0
app.py
CHANGED
@@ -73,10 +73,7 @@ download_file_from_repo(
|
|
73 |
params = []
|
74 |
|
75 |
# Load the config
|
76 |
-
|
77 |
-
data = f.read()
|
78 |
-
|
79 |
-
config = json.loads(data)
|
80 |
update_params(config, params)
|
81 |
|
82 |
data_config = config["data_config"]
|
@@ -115,10 +112,13 @@ vocos_params = f"{sum(param.numel() for param in vocos.parameters()):,}"
|
|
115 |
print(f"Loaded checkpoint (RAD-TTS++), number of parameters: {radtts_params}")
|
116 |
print(f"Loaded checkpoint (Vocos), number of parameters: {vocos_params}")
|
117 |
|
118 |
-
ignore_keys = ["training_files", "validation_files"]
|
119 |
text_processor = TextProcessor(
|
120 |
data_config["training_files"],
|
121 |
-
**dict(
|
|
|
|
|
|
|
|
|
122 |
)
|
123 |
|
124 |
# Config
|
@@ -210,58 +210,70 @@ examples = [
|
|
210 |
]
|
211 |
|
212 |
|
213 |
-
def
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
if not text:
|
223 |
raise gr.Error("Please paste your text.")
|
224 |
|
225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
|
227 |
speaker = speaker_text = speaker_attributes = voice.lower()
|
228 |
|
229 |
-
|
230 |
-
|
231 |
-
sigma = 0.8 # sampling sigma for decoder
|
232 |
-
sigma_tkndur = 0.666 # sampling sigma for duration
|
233 |
-
sigma_f0 = 1.0 # sampling sigma for f0
|
234 |
-
sigma_energy = 1.0 # sampling sigma for energy avg
|
235 |
-
|
236 |
-
token_dur_scaling = 1.0
|
237 |
-
|
238 |
-
f0_mean = 0
|
239 |
-
f0_std = 0
|
240 |
-
energy_mean = 0
|
241 |
-
energy_std = 0
|
242 |
|
243 |
-
|
244 |
-
|
245 |
-
speaker_id = speaker_id_text = speaker_id_attributes = get_speaker_id(speaker).to(
|
246 |
-
device
|
247 |
-
)
|
248 |
|
249 |
if speaker_text is not None:
|
250 |
-
speaker_id_text =
|
251 |
|
252 |
if speaker_attributes is not None:
|
253 |
-
speaker_id_attributes =
|
|
|
|
|
254 |
|
255 |
inference_start = time.time()
|
256 |
|
257 |
-
|
|
|
|
|
|
|
258 |
with torch.autocast(device, enabled=False):
|
259 |
with torch.inference_mode():
|
260 |
outputs = radtts.infer(
|
261 |
speaker_id,
|
262 |
tensor_text[None],
|
263 |
-
|
264 |
-
|
265 |
sigma_f0,
|
266 |
sigma_energy,
|
267 |
token_dur_scaling,
|
@@ -274,18 +286,21 @@ def inference(text, voice):
|
|
274 |
energy_std=energy_std,
|
275 |
)
|
276 |
|
277 |
-
|
278 |
|
279 |
-
|
280 |
-
"Synthesized MEL spectrogram, converting to WAVE.", duration=0.5
|
281 |
-
)
|
282 |
|
283 |
-
|
284 |
-
|
|
|
285 |
|
286 |
-
|
|
|
|
|
|
|
287 |
|
288 |
-
|
|
|
289 |
|
290 |
elapsed_time = time.time() - inference_start
|
291 |
rtf = elapsed_time / duration
|
@@ -303,14 +318,12 @@ def inference(text, voice):
|
|
303 |
try:
|
304 |
|
305 |
@spaces.GPU
|
306 |
-
def inference_zerogpu(
|
307 |
-
return inference(
|
308 |
except NameError:
|
309 |
-
print("ZeroGPU is not available, skipping...")
|
310 |
|
311 |
-
|
312 |
-
|
313 |
-
return inference(text, voice)
|
314 |
|
315 |
|
316 |
demo = gr.Blocks(
|
@@ -344,17 +357,120 @@ with demo:
|
|
344 |
value="Tetiana",
|
345 |
)
|
346 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
gr.Button("Run").click(
|
348 |
inference_zerogpu if use_zerogpu else inference_cpu,
|
349 |
concurrency_limit=concurrency_limit,
|
350 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
351 |
outputs=[audio, rtf],
|
352 |
)
|
353 |
|
354 |
with gr.Row():
|
355 |
gr.Examples(
|
356 |
label="Choose an example",
|
357 |
-
inputs=[
|
|
|
|
|
|
|
358 |
examples=examples,
|
359 |
)
|
360 |
|
|
|
73 |
params = []
|
74 |
|
75 |
# Load the config
|
76 |
+
config = json.loads(Path("config.json").read_text())
|
|
|
|
|
|
|
77 |
update_params(config, params)
|
78 |
|
79 |
data_config = config["data_config"]
|
|
|
112 |
print(f"Loaded checkpoint (RAD-TTS++), number of parameters: {radtts_params}")
|
113 |
print(f"Loaded checkpoint (Vocos), number of parameters: {vocos_params}")
|
114 |
|
|
|
115 |
text_processor = TextProcessor(
|
116 |
data_config["training_files"],
|
117 |
+
**dict(
|
118 |
+
(k, v)
|
119 |
+
for k, v in data_config.items()
|
120 |
+
if k not in ["training_files", "validation_files"]
|
121 |
+
),
|
122 |
)
|
123 |
|
124 |
# Config
|
|
|
210 |
]
|
211 |
|
212 |
|
213 |
+
def inference(
|
214 |
+
text,
|
215 |
+
voice,
|
216 |
+
n_takes,
|
217 |
+
use_latest_take,
|
218 |
+
token_dur_scaling,
|
219 |
+
f0_mean,
|
220 |
+
f0_std,
|
221 |
+
energy_mean,
|
222 |
+
energy_std,
|
223 |
+
sigma_decoder,
|
224 |
+
sigma_token_duration,
|
225 |
+
sigma_f0,
|
226 |
+
sigma_energy,
|
227 |
+
):
|
228 |
if not text:
|
229 |
raise gr.Error("Please paste your text.")
|
230 |
|
231 |
+
request = {
|
232 |
+
"text": text,
|
233 |
+
"voice": voice,
|
234 |
+
"n_takes": n_takes,
|
235 |
+
"use_latest_take": use_latest_take,
|
236 |
+
"token_dur_scaling": token_dur_scaling,
|
237 |
+
"f0_mean": f0_mean,
|
238 |
+
"f0_std": f0_std,
|
239 |
+
"energy_mean": energy_mean,
|
240 |
+
"energy_std": energy_std,
|
241 |
+
"sigma_decoder": sigma_decoder,
|
242 |
+
"sigma_token_duration": sigma_token_duration,
|
243 |
+
"sigma_f0": sigma_f0,
|
244 |
+
"sigma_energy": sigma_energy,
|
245 |
+
}
|
246 |
+
|
247 |
+
print(json.dumps(request, indent=2))
|
248 |
|
249 |
speaker = speaker_text = speaker_attributes = voice.lower()
|
250 |
|
251 |
+
tensor_text = torch.LongTensor(text_processor.tp.encode_text(text)).to(device)
|
252 |
+
speaker_tensor = torch.LongTensor([voices[speaker]]).to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
|
254 |
+
speaker_id = speaker_id_text = speaker_id_attributes = speaker_tensor
|
|
|
|
|
|
|
|
|
255 |
|
256 |
if speaker_text is not None:
|
257 |
+
speaker_id_text = torch.LongTensor([voices[speaker_text]]).to(device)
|
258 |
|
259 |
if speaker_attributes is not None:
|
260 |
+
speaker_id_attributes = torch.LongTensor([voices[speaker_attributes]]).to(
|
261 |
+
device
|
262 |
+
)
|
263 |
|
264 |
inference_start = time.time()
|
265 |
|
266 |
+
mels = []
|
267 |
+
for n_take in range(n_takes):
|
268 |
+
gr.Info(f"Inferencing take {n_take + 1}", duration=1)
|
269 |
+
|
270 |
with torch.autocast(device, enabled=False):
|
271 |
with torch.inference_mode():
|
272 |
outputs = radtts.infer(
|
273 |
speaker_id,
|
274 |
tensor_text[None],
|
275 |
+
sigma_decoder,
|
276 |
+
sigma_token_duration,
|
277 |
sigma_f0,
|
278 |
sigma_energy,
|
279 |
token_dur_scaling,
|
|
|
286 |
energy_std=energy_std,
|
287 |
)
|
288 |
|
289 |
+
mels.append(outputs["mel"])
|
290 |
|
291 |
+
gr.Info("Synthesized MEL spectrograms, converting to WAVE.", duration=0.5)
|
|
|
|
|
292 |
|
293 |
+
wav_gen_all = []
|
294 |
+
for mel in mels:
|
295 |
+
wav_gen_all.append(vocos.decode(mel))
|
296 |
|
297 |
+
if use_latest_take:
|
298 |
+
wav_gen = wav_gen_all[-1] # Get the latest generated wav
|
299 |
+
else:
|
300 |
+
wav_gen = torch.cat(wav_gen_all, dim=1) # Concatenate all the generated wavs
|
301 |
|
302 |
+
duration = len(wav_gen[0]) / 44_100
|
303 |
+
torchaudio.save("audio.wav", wav_gen.cpu(), 44_100, encoding="PCM_S")
|
304 |
|
305 |
elapsed_time = time.time() - inference_start
|
306 |
rtf = elapsed_time / duration
|
|
|
318 |
try:
|
319 |
|
320 |
@spaces.GPU
|
321 |
+
def inference_zerogpu(*args):
|
322 |
+
return inference(*args)
|
323 |
except NameError:
|
|
|
324 |
|
325 |
+
def inference_cpu(*args):
|
326 |
+
return inference(*args)
|
|
|
327 |
|
328 |
|
329 |
demo = gr.Blocks(
|
|
|
357 |
value="Tetiana",
|
358 |
)
|
359 |
|
360 |
+
with gr.Accordion("Advanced options", open=False):
|
361 |
+
gr.Markdown("You can change the voice, speed, and other parameters.")
|
362 |
+
|
363 |
+
with gr.Column():
|
364 |
+
n_takes = gr.Number(
|
365 |
+
label="Number of takes",
|
366 |
+
value=1,
|
367 |
+
minimum=1,
|
368 |
+
maximum=10,
|
369 |
+
step=1,
|
370 |
+
)
|
371 |
+
|
372 |
+
use_latest_take = gr.Checkbox(
|
373 |
+
label="Use the latest take",
|
374 |
+
value=False,
|
375 |
+
)
|
376 |
+
|
377 |
+
token_dur_scaling = gr.Number(
|
378 |
+
label="Token duration scaling",
|
379 |
+
value=1.0,
|
380 |
+
minimum=0.0,
|
381 |
+
maximum=10,
|
382 |
+
step=0.1,
|
383 |
+
)
|
384 |
+
|
385 |
+
with gr.Row():
|
386 |
+
f0_mean = gr.Number(
|
387 |
+
label="F0 mean",
|
388 |
+
value=0,
|
389 |
+
minimum=0.0,
|
390 |
+
maximum=1.0,
|
391 |
+
step=0.1,
|
392 |
+
)
|
393 |
+
f0_std = gr.Number(
|
394 |
+
label="F0 std",
|
395 |
+
value=0,
|
396 |
+
minimum=0.0,
|
397 |
+
maximum=1.0,
|
398 |
+
step=0.1,
|
399 |
+
)
|
400 |
+
|
401 |
+
energy_mean = gr.Number(
|
402 |
+
label="Energy mean",
|
403 |
+
value=0,
|
404 |
+
minimum=0.0,
|
405 |
+
maximum=1.0,
|
406 |
+
step=0.1,
|
407 |
+
)
|
408 |
+
energy_std = gr.Number(
|
409 |
+
label="Energy std",
|
410 |
+
value=0,
|
411 |
+
minimum=0.0,
|
412 |
+
maximum=1.0,
|
413 |
+
step=0.1,
|
414 |
+
)
|
415 |
+
|
416 |
+
with gr.Row():
|
417 |
+
sigma_decoder = gr.Number(
|
418 |
+
label="Sampling sigma for decoder",
|
419 |
+
value=0.8,
|
420 |
+
minimum=0.0,
|
421 |
+
maximum=1.0,
|
422 |
+
step=0.1,
|
423 |
+
)
|
424 |
+
sigma_token_duration = gr.Number(
|
425 |
+
label="Sampling sigma for duration",
|
426 |
+
value=0.666,
|
427 |
+
minimum=0.0,
|
428 |
+
maximum=1.0,
|
429 |
+
step=0.1,
|
430 |
+
)
|
431 |
+
sigma_f0 = gr.Number(
|
432 |
+
label="Sampling sigma for F0",
|
433 |
+
value=1.0,
|
434 |
+
minimum=0.0,
|
435 |
+
maximum=1.0,
|
436 |
+
step=0.1,
|
437 |
+
)
|
438 |
+
sigma_energy = gr.Number(
|
439 |
+
label="Sampling sigma for energy avg",
|
440 |
+
value=1.0,
|
441 |
+
minimum=0.0,
|
442 |
+
maximum=1.0,
|
443 |
+
step=0.1,
|
444 |
+
)
|
445 |
+
|
446 |
gr.Button("Run").click(
|
447 |
inference_zerogpu if use_zerogpu else inference_cpu,
|
448 |
concurrency_limit=concurrency_limit,
|
449 |
+
inputs=[
|
450 |
+
text,
|
451 |
+
voice,
|
452 |
+
n_takes,
|
453 |
+
use_latest_take,
|
454 |
+
token_dur_scaling,
|
455 |
+
f0_mean,
|
456 |
+
f0_std,
|
457 |
+
energy_mean,
|
458 |
+
energy_std,
|
459 |
+
sigma_decoder,
|
460 |
+
sigma_token_duration,
|
461 |
+
sigma_f0,
|
462 |
+
sigma_energy,
|
463 |
+
],
|
464 |
outputs=[audio, rtf],
|
465 |
)
|
466 |
|
467 |
with gr.Row():
|
468 |
gr.Examples(
|
469 |
label="Choose an example",
|
470 |
+
inputs=[
|
471 |
+
text,
|
472 |
+
voice,
|
473 |
+
],
|
474 |
examples=examples,
|
475 |
)
|
476 |
|
archs/radtts.txt
ADDED
@@ -0,0 +1,530 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
RADTTS(
|
2 |
+
(speaker_embedding): Embedding(3, 16)
|
3 |
+
(embedding): Embedding(185, 512)
|
4 |
+
(flows): ModuleList(
|
5 |
+
(0-1): 2 x FlowStep(
|
6 |
+
(invtbl_conv): Invertible1x1ConvLUS()
|
7 |
+
(affine_tfn): AffineTransformationLayer(
|
8 |
+
(affine_param_predictor): WN(
|
9 |
+
(in_layers): ModuleList(
|
10 |
+
(0): ConvNorm(
|
11 |
+
(conv): ParametrizedPartialConv1d(
|
12 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
|
13 |
+
(parametrizations): ModuleDict(
|
14 |
+
(weight): ParametrizationList(
|
15 |
+
(0): _WeightNorm()
|
16 |
+
)
|
17 |
+
)
|
18 |
+
)
|
19 |
+
)
|
20 |
+
(1): ConvNorm(
|
21 |
+
(conv): ParametrizedPartialConv1d(
|
22 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
|
23 |
+
(parametrizations): ModuleDict(
|
24 |
+
(weight): ParametrizationList(
|
25 |
+
(0): _WeightNorm()
|
26 |
+
)
|
27 |
+
)
|
28 |
+
)
|
29 |
+
)
|
30 |
+
(2): ConvNorm(
|
31 |
+
(conv): ParametrizedPartialConv1d(
|
32 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
|
33 |
+
(parametrizations): ModuleDict(
|
34 |
+
(weight): ParametrizationList(
|
35 |
+
(0): _WeightNorm()
|
36 |
+
)
|
37 |
+
)
|
38 |
+
)
|
39 |
+
)
|
40 |
+
(3): ConvNorm(
|
41 |
+
(conv): ParametrizedPartialConv1d(
|
42 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
|
43 |
+
(parametrizations): ModuleDict(
|
44 |
+
(weight): ParametrizationList(
|
45 |
+
(0): _WeightNorm()
|
46 |
+
)
|
47 |
+
)
|
48 |
+
)
|
49 |
+
)
|
50 |
+
)
|
51 |
+
(res_skip_layers): ModuleList(
|
52 |
+
(0-3): 4 x ParametrizedConv1d(
|
53 |
+
1024, 1024, kernel_size=(1,), stride=(1,)
|
54 |
+
(parametrizations): ModuleDict(
|
55 |
+
(weight): ParametrizationList(
|
56 |
+
(0): _WeightNorm()
|
57 |
+
)
|
58 |
+
)
|
59 |
+
)
|
60 |
+
)
|
61 |
+
(start): ParametrizedConv1d(
|
62 |
+
1120, 1024, kernel_size=(1,), stride=(1,)
|
63 |
+
(parametrizations): ModuleDict(
|
64 |
+
(weight): ParametrizationList(
|
65 |
+
(0): _WeightNorm()
|
66 |
+
)
|
67 |
+
)
|
68 |
+
)
|
69 |
+
(softplus): Softplus(beta=1.0, threshold=20.0)
|
70 |
+
(end): Conv1d(1024, 160, kernel_size=(1,), stride=(1,))
|
71 |
+
)
|
72 |
+
)
|
73 |
+
)
|
74 |
+
(2-3): 2 x FlowStep(
|
75 |
+
(invtbl_conv): Invertible1x1ConvLUS()
|
76 |
+
(affine_tfn): AffineTransformationLayer(
|
77 |
+
(affine_param_predictor): WN(
|
78 |
+
(in_layers): ModuleList(
|
79 |
+
(0): ConvNorm(
|
80 |
+
(conv): ParametrizedPartialConv1d(
|
81 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
|
82 |
+
(parametrizations): ModuleDict(
|
83 |
+
(weight): ParametrizationList(
|
84 |
+
(0): _WeightNorm()
|
85 |
+
)
|
86 |
+
)
|
87 |
+
)
|
88 |
+
)
|
89 |
+
(1): ConvNorm(
|
90 |
+
(conv): ParametrizedPartialConv1d(
|
91 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
|
92 |
+
(parametrizations): ModuleDict(
|
93 |
+
(weight): ParametrizationList(
|
94 |
+
(0): _WeightNorm()
|
95 |
+
)
|
96 |
+
)
|
97 |
+
)
|
98 |
+
)
|
99 |
+
(2): ConvNorm(
|
100 |
+
(conv): ParametrizedPartialConv1d(
|
101 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
|
102 |
+
(parametrizations): ModuleDict(
|
103 |
+
(weight): ParametrizationList(
|
104 |
+
(0): _WeightNorm()
|
105 |
+
)
|
106 |
+
)
|
107 |
+
)
|
108 |
+
)
|
109 |
+
(3): ConvNorm(
|
110 |
+
(conv): ParametrizedPartialConv1d(
|
111 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
|
112 |
+
(parametrizations): ModuleDict(
|
113 |
+
(weight): ParametrizationList(
|
114 |
+
(0): _WeightNorm()
|
115 |
+
)
|
116 |
+
)
|
117 |
+
)
|
118 |
+
)
|
119 |
+
)
|
120 |
+
(res_skip_layers): ModuleList(
|
121 |
+
(0-3): 4 x ParametrizedConv1d(
|
122 |
+
1024, 1024, kernel_size=(1,), stride=(1,)
|
123 |
+
(parametrizations): ModuleDict(
|
124 |
+
(weight): ParametrizationList(
|
125 |
+
(0): _WeightNorm()
|
126 |
+
)
|
127 |
+
)
|
128 |
+
)
|
129 |
+
)
|
130 |
+
(start): ParametrizedConv1d(
|
131 |
+
1119, 1024, kernel_size=(1,), stride=(1,)
|
132 |
+
(parametrizations): ModuleDict(
|
133 |
+
(weight): ParametrizationList(
|
134 |
+
(0): _WeightNorm()
|
135 |
+
)
|
136 |
+
)
|
137 |
+
)
|
138 |
+
(softplus): Softplus(beta=1.0, threshold=20.0)
|
139 |
+
(end): Conv1d(1024, 158, kernel_size=(1,), stride=(1,))
|
140 |
+
)
|
141 |
+
)
|
142 |
+
)
|
143 |
+
(4-5): 2 x FlowStep(
|
144 |
+
(invtbl_conv): Invertible1x1ConvLUS()
|
145 |
+
(affine_tfn): AffineTransformationLayer(
|
146 |
+
(affine_param_predictor): WN(
|
147 |
+
(in_layers): ModuleList(
|
148 |
+
(0): ConvNorm(
|
149 |
+
(conv): ParametrizedPartialConv1d(
|
150 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
|
151 |
+
(parametrizations): ModuleDict(
|
152 |
+
(weight): ParametrizationList(
|
153 |
+
(0): _WeightNorm()
|
154 |
+
)
|
155 |
+
)
|
156 |
+
)
|
157 |
+
)
|
158 |
+
(1): ConvNorm(
|
159 |
+
(conv): ParametrizedPartialConv1d(
|
160 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
|
161 |
+
(parametrizations): ModuleDict(
|
162 |
+
(weight): ParametrizationList(
|
163 |
+
(0): _WeightNorm()
|
164 |
+
)
|
165 |
+
)
|
166 |
+
)
|
167 |
+
)
|
168 |
+
(2): ConvNorm(
|
169 |
+
(conv): ParametrizedPartialConv1d(
|
170 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
|
171 |
+
(parametrizations): ModuleDict(
|
172 |
+
(weight): ParametrizationList(
|
173 |
+
(0): _WeightNorm()
|
174 |
+
)
|
175 |
+
)
|
176 |
+
)
|
177 |
+
)
|
178 |
+
(3): ConvNorm(
|
179 |
+
(conv): ParametrizedPartialConv1d(
|
180 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
|
181 |
+
(parametrizations): ModuleDict(
|
182 |
+
(weight): ParametrizationList(
|
183 |
+
(0): _WeightNorm()
|
184 |
+
)
|
185 |
+
)
|
186 |
+
)
|
187 |
+
)
|
188 |
+
)
|
189 |
+
(res_skip_layers): ModuleList(
|
190 |
+
(0-3): 4 x ParametrizedConv1d(
|
191 |
+
1024, 1024, kernel_size=(1,), stride=(1,)
|
192 |
+
(parametrizations): ModuleDict(
|
193 |
+
(weight): ParametrizationList(
|
194 |
+
(0): _WeightNorm()
|
195 |
+
)
|
196 |
+
)
|
197 |
+
)
|
198 |
+
)
|
199 |
+
(start): ParametrizedConv1d(
|
200 |
+
1118, 1024, kernel_size=(1,), stride=(1,)
|
201 |
+
(parametrizations): ModuleDict(
|
202 |
+
(weight): ParametrizationList(
|
203 |
+
(0): _WeightNorm()
|
204 |
+
)
|
205 |
+
)
|
206 |
+
)
|
207 |
+
(softplus): Softplus(beta=1.0, threshold=20.0)
|
208 |
+
(end): Conv1d(1024, 156, kernel_size=(1,), stride=(1,))
|
209 |
+
)
|
210 |
+
)
|
211 |
+
)
|
212 |
+
(6-7): 2 x FlowStep(
|
213 |
+
(invtbl_conv): Invertible1x1ConvLUS()
|
214 |
+
(affine_tfn): AffineTransformationLayer(
|
215 |
+
(affine_param_predictor): WN(
|
216 |
+
(in_layers): ModuleList(
|
217 |
+
(0): ConvNorm(
|
218 |
+
(conv): ParametrizedPartialConv1d(
|
219 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
|
220 |
+
(parametrizations): ModuleDict(
|
221 |
+
(weight): ParametrizationList(
|
222 |
+
(0): _WeightNorm()
|
223 |
+
)
|
224 |
+
)
|
225 |
+
)
|
226 |
+
)
|
227 |
+
(1): ConvNorm(
|
228 |
+
(conv): ParametrizedPartialConv1d(
|
229 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
|
230 |
+
(parametrizations): ModuleDict(
|
231 |
+
(weight): ParametrizationList(
|
232 |
+
(0): _WeightNorm()
|
233 |
+
)
|
234 |
+
)
|
235 |
+
)
|
236 |
+
)
|
237 |
+
(2): ConvNorm(
|
238 |
+
(conv): ParametrizedPartialConv1d(
|
239 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
|
240 |
+
(parametrizations): ModuleDict(
|
241 |
+
(weight): ParametrizationList(
|
242 |
+
(0): _WeightNorm()
|
243 |
+
)
|
244 |
+
)
|
245 |
+
)
|
246 |
+
)
|
247 |
+
(3): ConvNorm(
|
248 |
+
(conv): ParametrizedPartialConv1d(
|
249 |
+
1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
|
250 |
+
(parametrizations): ModuleDict(
|
251 |
+
(weight): ParametrizationList(
|
252 |
+
(0): _WeightNorm()
|
253 |
+
)
|
254 |
+
)
|
255 |
+
)
|
256 |
+
)
|
257 |
+
)
|
258 |
+
(res_skip_layers): ModuleList(
|
259 |
+
(0-3): 4 x ParametrizedConv1d(
|
260 |
+
1024, 1024, kernel_size=(1,), stride=(1,)
|
261 |
+
(parametrizations): ModuleDict(
|
262 |
+
(weight): ParametrizationList(
|
263 |
+
(0): _WeightNorm()
|
264 |
+
)
|
265 |
+
)
|
266 |
+
)
|
267 |
+
)
|
268 |
+
(start): ParametrizedConv1d(
|
269 |
+
1117, 1024, kernel_size=(1,), stride=(1,)
|
270 |
+
(parametrizations): ModuleDict(
|
271 |
+
(weight): ParametrizationList(
|
272 |
+
(0): _WeightNorm()
|
273 |
+
)
|
274 |
+
)
|
275 |
+
)
|
276 |
+
(softplus): Softplus(beta=1.0, threshold=20.0)
|
277 |
+
(end): Conv1d(1024, 154, kernel_size=(1,), stride=(1,))
|
278 |
+
)
|
279 |
+
)
|
280 |
+
)
|
281 |
+
)
|
282 |
+
(encoder): Encoder(
|
283 |
+
(convolutions): ModuleList(
|
284 |
+
(0-2): 3 x Sequential(
|
285 |
+
(0): ConvNorm(
|
286 |
+
(conv): PartialConv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
|
287 |
+
)
|
288 |
+
(1): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
|
289 |
+
)
|
290 |
+
)
|
291 |
+
(lstm): ParametrizedLSTM(
|
292 |
+
512, 256, batch_first=True, bidirectional=True
|
293 |
+
(parametrizations): ModuleDict(
|
294 |
+
(weight_hh_l0): ParametrizationList(
|
295 |
+
(0): _SpectralNorm()
|
296 |
+
)
|
297 |
+
(weight_hh_l0_reverse): ParametrizationList(
|
298 |
+
(0): _SpectralNorm()
|
299 |
+
)
|
300 |
+
)
|
301 |
+
)
|
302 |
+
)
|
303 |
+
(length_regulator): LengthRegulator()
|
304 |
+
(attention): ConvAttention(
|
305 |
+
(softmax): Softmax(dim=3)
|
306 |
+
(log_softmax): LogSoftmax(dim=3)
|
307 |
+
(key_proj): Sequential(
|
308 |
+
(0): ConvNorm(
|
309 |
+
(conv): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
|
310 |
+
)
|
311 |
+
(1): ReLU()
|
312 |
+
(2): ConvNorm(
|
313 |
+
(conv): Conv1d(1024, 80, kernel_size=(1,), stride=(1,))
|
314 |
+
)
|
315 |
+
)
|
316 |
+
(query_proj): Sequential(
|
317 |
+
(0): ConvNorm(
|
318 |
+
(conv): Conv1d(80, 160, kernel_size=(3,), stride=(1,), padding=(1,))
|
319 |
+
)
|
320 |
+
(1): ReLU()
|
321 |
+
(2): ConvNorm(
|
322 |
+
(conv): Conv1d(160, 80, kernel_size=(1,), stride=(1,))
|
323 |
+
)
|
324 |
+
(3): ReLU()
|
325 |
+
(4): ConvNorm(
|
326 |
+
(conv): Conv1d(80, 80, kernel_size=(1,), stride=(1,))
|
327 |
+
)
|
328 |
+
)
|
329 |
+
)
|
330 |
+
(context_lstm): ParametrizedLSTM(
|
331 |
+
1044, 520, batch_first=True, bidirectional=True
|
332 |
+
(parametrizations): ModuleDict(
|
333 |
+
(weight_hh_l0): ParametrizationList(
|
334 |
+
(0): _SpectralNorm()
|
335 |
+
)
|
336 |
+
(weight_hh_l0_reverse): ParametrizationList(
|
337 |
+
(0): _SpectralNorm()
|
338 |
+
)
|
339 |
+
)
|
340 |
+
)
|
341 |
+
(unfold): Unfold(kernel_size=(2, 1), dilation=1, padding=0, stride=2)
|
342 |
+
(dur_pred_layer): DAP(
|
343 |
+
(bottleneck_layer): BottleneckLayerLayer(
|
344 |
+
(projection_fn): ConvNorm(
|
345 |
+
(conv): ParametrizedConv1d(
|
346 |
+
512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
|
347 |
+
(parametrizations): ModuleDict(
|
348 |
+
(weight): ParametrizationList(
|
349 |
+
(0): _WeightNorm()
|
350 |
+
)
|
351 |
+
)
|
352 |
+
)
|
353 |
+
)
|
354 |
+
(non_linearity): ReLU()
|
355 |
+
)
|
356 |
+
(feat_pred_fn): ConvLSTMLinear(
|
357 |
+
(dropout): Dropout(p=0.25, inplace=False)
|
358 |
+
(convolutions): ModuleList(
|
359 |
+
(0): ParametrizedConv1d(
|
360 |
+
48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
|
361 |
+
(parametrizations): ModuleDict(
|
362 |
+
(weight): ParametrizationList(
|
363 |
+
(0): _WeightNorm()
|
364 |
+
)
|
365 |
+
)
|
366 |
+
)
|
367 |
+
(1): ParametrizedConv1d(
|
368 |
+
256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
|
369 |
+
(parametrizations): ModuleDict(
|
370 |
+
(weight): ParametrizationList(
|
371 |
+
(0): _WeightNorm()
|
372 |
+
)
|
373 |
+
)
|
374 |
+
)
|
375 |
+
)
|
376 |
+
(bilstm): ParametrizedLSTM(
|
377 |
+
256, 128, batch_first=True, bidirectional=True
|
378 |
+
(parametrizations): ModuleDict(
|
379 |
+
(weight_hh_l0): ParametrizationList(
|
380 |
+
(0): _SpectralNorm()
|
381 |
+
)
|
382 |
+
(weight_hh_l0_reverse): ParametrizationList(
|
383 |
+
(0): _SpectralNorm()
|
384 |
+
)
|
385 |
+
)
|
386 |
+
)
|
387 |
+
(dense): Linear(in_features=256, out_features=1, bias=True)
|
388 |
+
)
|
389 |
+
)
|
390 |
+
(unvoiced_bias_module): Sequential(
|
391 |
+
(0): LinearNorm(
|
392 |
+
(linear_layer): Linear(in_features=512, out_features=1, bias=True)
|
393 |
+
)
|
394 |
+
(1): ReLU()
|
395 |
+
)
|
396 |
+
(v_pred_module): DAP(
|
397 |
+
(bottleneck_layer): BottleneckLayerLayer(
|
398 |
+
(projection_fn): ConvNorm(
|
399 |
+
(conv): ParametrizedConv1d(
|
400 |
+
512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
|
401 |
+
(parametrizations): ModuleDict(
|
402 |
+
(weight): ParametrizationList(
|
403 |
+
(0): _WeightNorm()
|
404 |
+
)
|
405 |
+
)
|
406 |
+
)
|
407 |
+
)
|
408 |
+
(non_linearity): ReLU()
|
409 |
+
)
|
410 |
+
(feat_pred_fn): ConvLSTMLinear(
|
411 |
+
(dropout): Dropout(p=0.5, inplace=False)
|
412 |
+
(convolutions): ModuleList(
|
413 |
+
(0): ParametrizedConv1d(
|
414 |
+
48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
|
415 |
+
(parametrizations): ModuleDict(
|
416 |
+
(weight): ParametrizationList(
|
417 |
+
(0): _WeightNorm()
|
418 |
+
)
|
419 |
+
)
|
420 |
+
)
|
421 |
+
(1): ParametrizedConv1d(
|
422 |
+
256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
|
423 |
+
(parametrizations): ModuleDict(
|
424 |
+
(weight): ParametrizationList(
|
425 |
+
(0): _WeightNorm()
|
426 |
+
)
|
427 |
+
)
|
428 |
+
)
|
429 |
+
)
|
430 |
+
(dense): Linear(in_features=256, out_features=1, bias=True)
|
431 |
+
)
|
432 |
+
)
|
433 |
+
(v_embeddings): Embedding(4, 512)
|
434 |
+
(f0_pred_module): DAP(
|
435 |
+
(bottleneck_layer): BottleneckLayerLayer(
|
436 |
+
(projection_fn): ConvNorm(
|
437 |
+
(conv): ParametrizedConv1d(
|
438 |
+
512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
|
439 |
+
(parametrizations): ModuleDict(
|
440 |
+
(weight): ParametrizationList(
|
441 |
+
(0): _WeightNorm()
|
442 |
+
)
|
443 |
+
)
|
444 |
+
)
|
445 |
+
)
|
446 |
+
(non_linearity): ReLU()
|
447 |
+
)
|
448 |
+
(feat_pred_fn): ConvLSTMLinear(
|
449 |
+
(dropout): Dropout(p=0.5, inplace=False)
|
450 |
+
(convolutions): ModuleList(
|
451 |
+
(0): ParametrizedConv1d(
|
452 |
+
48, 256, kernel_size=(11,), stride=(1,), padding=(5,)
|
453 |
+
(parametrizations): ModuleDict(
|
454 |
+
(weight): ParametrizationList(
|
455 |
+
(0): _WeightNorm()
|
456 |
+
)
|
457 |
+
)
|
458 |
+
)
|
459 |
+
(1): ParametrizedConv1d(
|
460 |
+
256, 256, kernel_size=(11,), stride=(1,), padding=(5,)
|
461 |
+
(parametrizations): ModuleDict(
|
462 |
+
(weight): ParametrizationList(
|
463 |
+
(0): _WeightNorm()
|
464 |
+
)
|
465 |
+
)
|
466 |
+
)
|
467 |
+
)
|
468 |
+
(bilstm): ParametrizedLSTM(
|
469 |
+
256, 128, batch_first=True, bidirectional=True
|
470 |
+
(parametrizations): ModuleDict(
|
471 |
+
(weight_hh_l0): ParametrizationList(
|
472 |
+
(0): _SpectralNorm()
|
473 |
+
)
|
474 |
+
(weight_hh_l0_reverse): ParametrizationList(
|
475 |
+
(0): _SpectralNorm()
|
476 |
+
)
|
477 |
+
)
|
478 |
+
)
|
479 |
+
(dense): Linear(in_features=256, out_features=1, bias=True)
|
480 |
+
)
|
481 |
+
)
|
482 |
+
(energy_pred_module): DAP(
|
483 |
+
(bottleneck_layer): BottleneckLayerLayer(
|
484 |
+
(projection_fn): ConvNorm(
|
485 |
+
(conv): ParametrizedConv1d(
|
486 |
+
512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
|
487 |
+
(parametrizations): ModuleDict(
|
488 |
+
(weight): ParametrizationList(
|
489 |
+
(0): _WeightNorm()
|
490 |
+
)
|
491 |
+
)
|
492 |
+
)
|
493 |
+
)
|
494 |
+
(non_linearity): ReLU()
|
495 |
+
)
|
496 |
+
(feat_pred_fn): ConvLSTMLinear(
|
497 |
+
(dropout): Dropout(p=0.25, inplace=False)
|
498 |
+
(convolutions): ModuleList(
|
499 |
+
(0): ParametrizedConv1d(
|
500 |
+
48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
|
501 |
+
(parametrizations): ModuleDict(
|
502 |
+
(weight): ParametrizationList(
|
503 |
+
(0): _WeightNorm()
|
504 |
+
)
|
505 |
+
)
|
506 |
+
)
|
507 |
+
(1): ParametrizedConv1d(
|
508 |
+
256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
|
509 |
+
(parametrizations): ModuleDict(
|
510 |
+
(weight): ParametrizationList(
|
511 |
+
(0): _WeightNorm()
|
512 |
+
)
|
513 |
+
)
|
514 |
+
)
|
515 |
+
)
|
516 |
+
(bilstm): ParametrizedLSTM(
|
517 |
+
256, 128, batch_first=True, bidirectional=True
|
518 |
+
(parametrizations): ModuleDict(
|
519 |
+
(weight_hh_l0): ParametrizationList(
|
520 |
+
(0): _SpectralNorm()
|
521 |
+
)
|
522 |
+
(weight_hh_l0_reverse): ParametrizationList(
|
523 |
+
(0): _SpectralNorm()
|
524 |
+
)
|
525 |
+
)
|
526 |
+
)
|
527 |
+
(dense): Linear(in_features=256, out_features=1, bias=True)
|
528 |
+
)
|
529 |
+
)
|
530 |
+
)
|
archs/vocos.txt
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Vocos(
|
2 |
+
(feature_extractor): MelSpectrogramFeatures(
|
3 |
+
(mel_spec): MelSpectrogram(
|
4 |
+
(spectrogram): Spectrogram()
|
5 |
+
(mel_scale): MelScale()
|
6 |
+
)
|
7 |
+
)
|
8 |
+
(backbone): VocosBackbone(
|
9 |
+
(embed): Conv1d(80, 512, kernel_size=(7,), stride=(1,), padding=(3,))
|
10 |
+
(norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
|
11 |
+
(convnext): ModuleList(
|
12 |
+
(0-7): 8 x ConvNeXtBlock(
|
13 |
+
(dwconv): Conv1d(512, 512, kernel_size=(7,), stride=(1,), padding=(3,), groups=512)
|
14 |
+
(norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
|
15 |
+
(pwconv1): Linear(in_features=512, out_features=1536, bias=True)
|
16 |
+
(act): GELU(approximate='none')
|
17 |
+
(pwconv2): Linear(in_features=1536, out_features=512, bias=True)
|
18 |
+
)
|
19 |
+
)
|
20 |
+
(final_layer_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
|
21 |
+
)
|
22 |
+
(head): ISTFTHead(
|
23 |
+
(out): Linear(in_features=512, out_features=2050, bias=True)
|
24 |
+
(istft): ISTFT()
|
25 |
+
)
|
26 |
+
)
|