Yehor committed
Commit 6f17241 · 1 Parent(s): 5564ad9

Add advanced options

Files changed (3):
  1. app.py +171 -55
  2. archs/radtts.txt +530 -0
  3. archs/vocos.txt +26 -0
app.py CHANGED
@@ -73,10 +73,7 @@ download_file_from_repo(
 params = []
 
 # Load the config
-with open("config.json") as f:
-    data = f.read()
-
-config = json.loads(data)
+config = json.loads(Path("config.json").read_text())
 update_params(config, params)
 
 data_config = config["data_config"]
@@ -115,10 +112,13 @@ vocos_params = f"{sum(param.numel() for param in vocos.parameters()):,}"
 print(f"Loaded checkpoint (RAD-TTS++), number of parameters: {radtts_params}")
 print(f"Loaded checkpoint (Vocos), number of parameters: {vocos_params}")
 
-ignore_keys = ["training_files", "validation_files"]
 text_processor = TextProcessor(
     data_config["training_files"],
-    **dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
+    **dict(
+        (k, v)
+        for k, v in data_config.items()
+        if k not in ["training_files", "validation_files"]
+    ),
 )
 
 # Config
@@ -210,58 +210,70 @@ examples = [
 ]
 
 
-def get_speaker_id(speaker):
-    return torch.LongTensor([voices[speaker]])
-
-
-def get_text(text):
-    return torch.LongTensor(text_processor.tp.encode_text(text))
-
-
-def inference(text, voice):
+def inference(
+    text,
+    voice,
+    n_takes,
+    use_latest_take,
+    token_dur_scaling,
+    f0_mean,
+    f0_std,
+    energy_mean,
+    energy_std,
+    sigma_decoder,
+    sigma_token_duration,
+    sigma_f0,
+    sigma_energy,
+):
     if not text:
         raise gr.Error("Please paste your text.")
 
-    gr.Info("Starting...", duration=0.5)
+    request = {
+        "text": text,
+        "voice": voice,
+        "n_takes": n_takes,
+        "use_latest_take": use_latest_take,
+        "token_dur_scaling": token_dur_scaling,
+        "f0_mean": f0_mean,
+        "f0_std": f0_std,
+        "energy_mean": energy_mean,
+        "energy_std": energy_std,
+        "sigma_decoder": sigma_decoder,
+        "sigma_token_duration": sigma_token_duration,
+        "sigma_f0": sigma_f0,
+        "sigma_energy": sigma_energy,
+    }
+
+    print(json.dumps(request, indent=2))
 
     speaker = speaker_text = speaker_attributes = voice.lower()
 
-    n_takes = 1
-
-    sigma = 0.8  # sampling sigma for decoder
-    sigma_tkndur = 0.666  # sampling sigma for duration
-    sigma_f0 = 1.0  # sampling sigma for f0
-    sigma_energy = 1.0  # sampling sigma for energy avg
-
-    token_dur_scaling = 1.0
-
-    f0_mean = 0
-    f0_std = 0
-    energy_mean = 0
-    energy_std = 0
+    tensor_text = torch.LongTensor(text_processor.tp.encode_text(text)).to(device)
+    speaker_tensor = torch.LongTensor([voices[speaker]]).to(device)
 
-    tensor_text = get_text(text).to(device)
-
-    speaker_id = speaker_id_text = speaker_id_attributes = get_speaker_id(speaker).to(
-        device
-    )
+    speaker_id = speaker_id_text = speaker_id_attributes = speaker_tensor
 
     if speaker_text is not None:
-        speaker_id_text = get_speaker_id(speaker_text).to(device)
+        speaker_id_text = torch.LongTensor([voices[speaker_text]]).to(device)
 
     if speaker_attributes is not None:
-        speaker_id_attributes = get_speaker_id(speaker_attributes).to(device)
+        speaker_id_attributes = torch.LongTensor([voices[speaker_attributes]]).to(
+            device
+        )
 
     inference_start = time.time()
 
-    for take in range(n_takes):
+    mels = []
+    for n_take in range(n_takes):
+        gr.Info(f"Inferencing take {n_take + 1}", duration=1)
+
        with torch.autocast(device, enabled=False):
            with torch.inference_mode():
                outputs = radtts.infer(
                    speaker_id,
                    tensor_text[None],
-                   sigma,
-                   sigma_tkndur,
+                   sigma_decoder,
+                   sigma_token_duration,
                    sigma_f0,
                    sigma_energy,
                    token_dur_scaling,
@@ -274,18 +286,21 @@ def inference(text, voice):
                    energy_std=energy_std,
                )
 
-                mel = outputs["mel"]
+               mels.append(outputs["mel"])
 
-                gr.Info(
-                    "Synthesized MEL spectrogram, converting to WAVE.", duration=0.5
-                )
+    gr.Info("Synthesized MEL spectrograms, converting to WAVE.", duration=0.5)
 
-                wav_gen = vocos.decode(mel)
-                wav_gen_float = wav_gen.cpu()
+    wav_gen_all = []
+    for mel in mels:
+        wav_gen_all.append(vocos.decode(mel))
 
-                torchaudio.save("audio.wav", wav_gen_float, 44_100, encoding="PCM_S")
+    if use_latest_take:
+        wav_gen = wav_gen_all[-1]  # Get the latest generated wav
+    else:
+        wav_gen = torch.cat(wav_gen_all, dim=1)  # Concatenate all the generated wavs
 
-                duration = len(wav_gen_float[0]) / 44_100
+    duration = len(wav_gen[0]) / 44_100
+    torchaudio.save("audio.wav", wav_gen.cpu(), 44_100, encoding="PCM_S")
 
    elapsed_time = time.time() - inference_start
    rtf = elapsed_time / duration
@@ -303,14 +318,12 @@ def inference(text, voice):
 try:
 
    @spaces.GPU
-    def inference_zerogpu(text, voice):
-        return inference(text, voice)
+    def inference_zerogpu(*args):
+        return inference(*args)
 except NameError:
-    print("ZeroGPU is not available, skipping...")
 
-
-def inference_cpu(text, voice):
-    return inference(text, voice)
+    def inference_cpu(*args):
+        return inference(*args)
 
 
 demo = gr.Blocks(
@@ -344,17 +357,120 @@ with demo:
            value="Tetiana",
        )
 
+        with gr.Accordion("Advanced options", open=False):
+            gr.Markdown("You can change the voice, speed, and other parameters.")
+
+            with gr.Column():
+                n_takes = gr.Number(
+                    label="Number of takes",
+                    value=1,
+                    minimum=1,
+                    maximum=10,
+                    step=1,
+                )
+
+                use_latest_take = gr.Checkbox(
+                    label="Use the latest take",
+                    value=False,
+                )
+
+                token_dur_scaling = gr.Number(
+                    label="Token duration scaling",
+                    value=1.0,
+                    minimum=0.0,
+                    maximum=10,
+                    step=0.1,
+                )
+
+                with gr.Row():
+                    f0_mean = gr.Number(
+                        label="F0 mean",
+                        value=0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    f0_std = gr.Number(
+                        label="F0 std",
+                        value=0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+
+                    energy_mean = gr.Number(
+                        label="Energy mean",
+                        value=0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    energy_std = gr.Number(
+                        label="Energy std",
+                        value=0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+
+                with gr.Row():
+                    sigma_decoder = gr.Number(
+                        label="Sampling sigma for decoder",
+                        value=0.8,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    sigma_token_duration = gr.Number(
+                        label="Sampling sigma for duration",
+                        value=0.666,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    sigma_f0 = gr.Number(
+                        label="Sampling sigma for F0",
+                        value=1.0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+                    sigma_energy = gr.Number(
+                        label="Sampling sigma for energy avg",
+                        value=1.0,
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                    )
+
        gr.Button("Run").click(
            inference_zerogpu if use_zerogpu else inference_cpu,
            concurrency_limit=concurrency_limit,
-            inputs=[text, voice],
+            inputs=[
+                text,
+                voice,
+                n_takes,
+                use_latest_take,
+                token_dur_scaling,
+                f0_mean,
+                f0_std,
+                energy_mean,
+                energy_std,
+                sigma_decoder,
+                sigma_token_duration,
+                sigma_f0,
+                sigma_energy,
+            ],
            outputs=[audio, rtf],
        )
 
        with gr.Row():
            gr.Examples(
                label="Choose an example",
-                inputs=[text, voice],
+                inputs=[
+                    text,
+                    voice,
+                ],
                examples=examples,
            )
 
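
For reference, the reworked inference() now takes all of the advanced options positionally, in the same order as the inputs=[...] list wired to the Run button. Below is a minimal sketch of a direct call using the accordion's default values; it assumes app.py has been imported with its models loaded, and that the function returns the pair wired to outputs=[audio, rtf].

    # Hypothetical direct call; the argument order mirrors the Run button's
    # inputs=[...] list, and the values are the accordion defaults.
    audio_path, rtf = inference(
        "Text to synthesize",  # text
        "Tetiana",             # voice
        1,                     # n_takes
        False,                 # use_latest_take
        1.0,                   # token_dur_scaling
        0,                     # f0_mean
        0,                     # f0_std
        0,                     # energy_mean
        0,                     # energy_std
        0.8,                   # sigma_decoder
        0.666,                 # sigma_token_duration
        1.0,                   # sigma_f0
        1.0,                   # sigma_energy
    )

The rtf value is the real-time factor computed inside inference(): wall-clock synthesis time divided by the duration of the generated audio, so values below 1.0 mean faster-than-real-time synthesis.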
 
archs/radtts.txt ADDED
@@ -0,0 +1,530 @@
+RADTTS(
+  (speaker_embedding): Embedding(3, 16)
+  (embedding): Embedding(185, 512)
+  (flows): ModuleList(
+    (0-1): 2 x FlowStep(
+      (invtbl_conv): Invertible1x1ConvLUS()
+      (affine_tfn): AffineTransformationLayer(
+        (affine_param_predictor): WN(
+          (in_layers): ModuleList(
+            (0): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (1): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (2): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (3): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+          )
+          (res_skip_layers): ModuleList(
+            (0-3): 4 x ParametrizedConv1d(
+              1024, 1024, kernel_size=(1,), stride=(1,)
+              (parametrizations): ModuleDict(
+                (weight): ParametrizationList(
+                  (0): _WeightNorm()
+                )
+              )
+            )
+          )
+          (start): ParametrizedConv1d(
+            1120, 1024, kernel_size=(1,), stride=(1,)
+            (parametrizations): ModuleDict(
+              (weight): ParametrizationList(
+                (0): _WeightNorm()
+              )
+            )
+          )
+          (softplus): Softplus(beta=1.0, threshold=20.0)
+          (end): Conv1d(1024, 160, kernel_size=(1,), stride=(1,))
+        )
+      )
+    )
+    (2-3): 2 x FlowStep(
+      (invtbl_conv): Invertible1x1ConvLUS()
+      (affine_tfn): AffineTransformationLayer(
+        (affine_param_predictor): WN(
+          (in_layers): ModuleList(
+            (0): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (1): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (2): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (3): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+          )
+          (res_skip_layers): ModuleList(
+            (0-3): 4 x ParametrizedConv1d(
+              1024, 1024, kernel_size=(1,), stride=(1,)
+              (parametrizations): ModuleDict(
+                (weight): ParametrizationList(
+                  (0): _WeightNorm()
+                )
+              )
+            )
+          )
+          (start): ParametrizedConv1d(
+            1119, 1024, kernel_size=(1,), stride=(1,)
+            (parametrizations): ModuleDict(
+              (weight): ParametrizationList(
+                (0): _WeightNorm()
+              )
+            )
+          )
+          (softplus): Softplus(beta=1.0, threshold=20.0)
+          (end): Conv1d(1024, 158, kernel_size=(1,), stride=(1,))
+        )
+      )
+    )
+    (4-5): 2 x FlowStep(
+      (invtbl_conv): Invertible1x1ConvLUS()
+      (affine_tfn): AffineTransformationLayer(
+        (affine_param_predictor): WN(
+          (in_layers): ModuleList(
+            (0): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (1): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (2): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (3): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+          )
+          (res_skip_layers): ModuleList(
+            (0-3): 4 x ParametrizedConv1d(
+              1024, 1024, kernel_size=(1,), stride=(1,)
+              (parametrizations): ModuleDict(
+                (weight): ParametrizationList(
+                  (0): _WeightNorm()
+                )
+              )
+            )
+          )
+          (start): ParametrizedConv1d(
+            1118, 1024, kernel_size=(1,), stride=(1,)
+            (parametrizations): ModuleDict(
+              (weight): ParametrizationList(
+                (0): _WeightNorm()
+              )
+            )
+          )
+          (softplus): Softplus(beta=1.0, threshold=20.0)
+          (end): Conv1d(1024, 156, kernel_size=(1,), stride=(1,))
+        )
+      )
+    )
+    (6-7): 2 x FlowStep(
+      (invtbl_conv): Invertible1x1ConvLUS()
+      (affine_tfn): AffineTransformationLayer(
+        (affine_param_predictor): WN(
+          (in_layers): ModuleList(
+            (0): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (1): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,), dilation=(2,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (2): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(4,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+            (3): ConvNorm(
+              (conv): ParametrizedPartialConv1d(
+                1024, 1024, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(8,)
+                (parametrizations): ModuleDict(
+                  (weight): ParametrizationList(
+                    (0): _WeightNorm()
+                  )
+                )
+              )
+            )
+          )
+          (res_skip_layers): ModuleList(
+            (0-3): 4 x ParametrizedConv1d(
+              1024, 1024, kernel_size=(1,), stride=(1,)
+              (parametrizations): ModuleDict(
+                (weight): ParametrizationList(
+                  (0): _WeightNorm()
+                )
+              )
+            )
+          )
+          (start): ParametrizedConv1d(
+            1117, 1024, kernel_size=(1,), stride=(1,)
+            (parametrizations): ModuleDict(
+              (weight): ParametrizationList(
+                (0): _WeightNorm()
+              )
+            )
+          )
+          (softplus): Softplus(beta=1.0, threshold=20.0)
+          (end): Conv1d(1024, 154, kernel_size=(1,), stride=(1,))
+        )
+      )
+    )
+  )
+  (encoder): Encoder(
+    (convolutions): ModuleList(
+      (0-2): 3 x Sequential(
+        (0): ConvNorm(
+          (conv): PartialConv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
+        )
+        (1): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
+      )
+    )
+    (lstm): ParametrizedLSTM(
+      512, 256, batch_first=True, bidirectional=True
+      (parametrizations): ModuleDict(
+        (weight_hh_l0): ParametrizationList(
+          (0): _SpectralNorm()
+        )
+        (weight_hh_l0_reverse): ParametrizationList(
+          (0): _SpectralNorm()
+        )
+      )
+    )
+  )
+  (length_regulator): LengthRegulator()
+  (attention): ConvAttention(
+    (softmax): Softmax(dim=3)
+    (log_softmax): LogSoftmax(dim=3)
+    (key_proj): Sequential(
+      (0): ConvNorm(
+        (conv): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
+      )
+      (1): ReLU()
+      (2): ConvNorm(
+        (conv): Conv1d(1024, 80, kernel_size=(1,), stride=(1,))
+      )
+    )
+    (query_proj): Sequential(
+      (0): ConvNorm(
+        (conv): Conv1d(80, 160, kernel_size=(3,), stride=(1,), padding=(1,))
+      )
+      (1): ReLU()
+      (2): ConvNorm(
+        (conv): Conv1d(160, 80, kernel_size=(1,), stride=(1,))
+      )
+      (3): ReLU()
+      (4): ConvNorm(
+        (conv): Conv1d(80, 80, kernel_size=(1,), stride=(1,))
+      )
+    )
+  )
+  (context_lstm): ParametrizedLSTM(
+    1044, 520, batch_first=True, bidirectional=True
+    (parametrizations): ModuleDict(
+      (weight_hh_l0): ParametrizationList(
+        (0): _SpectralNorm()
+      )
+      (weight_hh_l0_reverse): ParametrizationList(
+        (0): _SpectralNorm()
+      )
+    )
+  )
+  (unfold): Unfold(kernel_size=(2, 1), dilation=1, padding=0, stride=2)
+  (dur_pred_layer): DAP(
+    (bottleneck_layer): BottleneckLayerLayer(
+      (projection_fn): ConvNorm(
+        (conv): ParametrizedConv1d(
+          512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (non_linearity): ReLU()
+    )
+    (feat_pred_fn): ConvLSTMLinear(
+      (dropout): Dropout(p=0.25, inplace=False)
+      (convolutions): ModuleList(
+        (0): ParametrizedConv1d(
+          48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (1): ParametrizedConv1d(
+          256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (bilstm): ParametrizedLSTM(
+        256, 128, batch_first=True, bidirectional=True
+        (parametrizations): ModuleDict(
+          (weight_hh_l0): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+          (weight_hh_l0_reverse): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+        )
+      )
+      (dense): Linear(in_features=256, out_features=1, bias=True)
+    )
+  )
+  (unvoiced_bias_module): Sequential(
+    (0): LinearNorm(
+      (linear_layer): Linear(in_features=512, out_features=1, bias=True)
+    )
+    (1): ReLU()
+  )
+  (v_pred_module): DAP(
+    (bottleneck_layer): BottleneckLayerLayer(
+      (projection_fn): ConvNorm(
+        (conv): ParametrizedConv1d(
+          512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (non_linearity): ReLU()
+    )
+    (feat_pred_fn): ConvLSTMLinear(
+      (dropout): Dropout(p=0.5, inplace=False)
+      (convolutions): ModuleList(
+        (0): ParametrizedConv1d(
+          48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (1): ParametrizedConv1d(
+          256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (dense): Linear(in_features=256, out_features=1, bias=True)
+    )
+  )
+  (v_embeddings): Embedding(4, 512)
+  (f0_pred_module): DAP(
+    (bottleneck_layer): BottleneckLayerLayer(
+      (projection_fn): ConvNorm(
+        (conv): ParametrizedConv1d(
+          512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (non_linearity): ReLU()
+    )
+    (feat_pred_fn): ConvLSTMLinear(
+      (dropout): Dropout(p=0.5, inplace=False)
+      (convolutions): ModuleList(
+        (0): ParametrizedConv1d(
+          48, 256, kernel_size=(11,), stride=(1,), padding=(5,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (1): ParametrizedConv1d(
+          256, 256, kernel_size=(11,), stride=(1,), padding=(5,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (bilstm): ParametrizedLSTM(
+        256, 128, batch_first=True, bidirectional=True
+        (parametrizations): ModuleDict(
+          (weight_hh_l0): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+          (weight_hh_l0_reverse): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+        )
+      )
+      (dense): Linear(in_features=256, out_features=1, bias=True)
+    )
+  )
+  (energy_pred_module): DAP(
+    (bottleneck_layer): BottleneckLayerLayer(
+      (projection_fn): ConvNorm(
+        (conv): ParametrizedConv1d(
+          512, 32, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (non_linearity): ReLU()
+    )
+    (feat_pred_fn): ConvLSTMLinear(
+      (dropout): Dropout(p=0.25, inplace=False)
+      (convolutions): ModuleList(
+        (0): ParametrizedConv1d(
+          48, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+        (1): ParametrizedConv1d(
+          256, 256, kernel_size=(3,), stride=(1,), padding=(1,)
+          (parametrizations): ModuleDict(
+            (weight): ParametrizationList(
+              (0): _WeightNorm()
+            )
+          )
+        )
+      )
+      (bilstm): ParametrizedLSTM(
+        256, 128, batch_first=True, bidirectional=True
+        (parametrizations): ModuleDict(
+          (weight_hh_l0): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+          (weight_hh_l0_reverse): ParametrizationList(
+            (0): _SpectralNorm()
+          )
+        )
+      )
+      (dense): Linear(in_features=256, out_features=1, bias=True)
+    )
+  )
+)
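
The two archs/*.txt files appear to be plain repr() dumps of the instantiated modules. A small sketch of how they could be regenerated from the objects app.py loads (the parameter-count expression is the same one app.py already uses for its "Loaded checkpoint" messages):

    from pathlib import Path

    # str(module) yields the nested repr tree shown above
    Path("archs/radtts.txt").write_text(str(radtts))

    # Same counting expression as the radtts_params / vocos_params lines in app.py
    print(f"RAD-TTS++ parameters: {sum(p.numel() for p in radtts.parameters()):,}")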
archs/vocos.txt ADDED
@@ -0,0 +1,26 @@
+Vocos(
+  (feature_extractor): MelSpectrogramFeatures(
+    (mel_spec): MelSpectrogram(
+      (spectrogram): Spectrogram()
+      (mel_scale): MelScale()
+    )
+  )
+  (backbone): VocosBackbone(
+    (embed): Conv1d(80, 512, kernel_size=(7,), stride=(1,), padding=(3,))
+    (norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
+    (convnext): ModuleList(
+      (0-7): 8 x ConvNeXtBlock(
+        (dwconv): Conv1d(512, 512, kernel_size=(7,), stride=(1,), padding=(3,), groups=512)
+        (norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
+        (pwconv1): Linear(in_features=512, out_features=1536, bias=True)
+        (act): GELU(approximate='none')
+        (pwconv2): Linear(in_features=1536, out_features=512, bias=True)
+      )
+    )
+    (final_layer_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
+  )
+  (head): ISTFTHead(
+    (out): Linear(in_features=512, out_features=2050, bias=True)
+    (istft): ISTFT()
+  )
+)
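
One detail worth noting in this dump: the ISTFTHead projects 512 backbone channels to 2050 outputs per frame, consistent with packing magnitude and phase for 1025 frequency bins (i.e. n_fft = 2048; this is inferred from the layer shape, not stated in the commit). A sketch of the decode path app.py relies on, using a dummy mel batch shaped to match the Conv1d(80, 512, ...) embed layer above:

    import torch

    mel = torch.randn(1, 80, 200)  # (batch, mel_bins, frames); 80 matches the embed layer

    with torch.inference_mode():
        wav = vocos.decode(mel)  # (batch, samples) waveform; app.py saves it at 44.1 kHz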