Text-to-Speech
ONNX
English
hexgrad committed on
Commit
96b9a7b
·
verified ·
1 Parent(s): 3095858

Upload 3 files

Browse files
Files changed (2) hide show
  1. README.md +1 -0
  2. models.py +2 -219
README.md CHANGED
@@ -31,6 +31,7 @@ You can find a hosted demo at [hf.co/spaces/hexgrad/Kokoro-TTS](https://huggingf
31
  The following can be run in a single cell on [Google Colab](https://colab.research.google.com/).
32
  ```py
33
  # 1️⃣ Install dependencies silently
 
34
  !git clone https://huggingface.co/hexgrad/Kokoro-82M
35
  %cd Kokoro-82M
36
  !apt-get -qq -y install espeak-ng > /dev/null 2>&1
 
31
  The following can be run in a single cell on [Google Colab](https://colab.research.google.com/).
32
  ```py
33
  # 1️⃣ Install dependencies silently
34
+ !git lfs install
35
  !git clone https://huggingface.co/hexgrad/Kokoro-82M
36
  %cd Kokoro-82M
37
  !apt-get -qq -y install espeak-ng > /dev/null 2>&1
models.py CHANGED
@@ -1,5 +1,5 @@
1
  # https://github.com/yl4579/StyleTTS2/blob/main/models.py
2
- from istftnet import Decoder
3
  from munch import Munch
4
  from pathlib import Path
5
  from plbert import load_plbert
@@ -12,118 +12,6 @@ import torch
12
  import torch.nn as nn
13
  import torch.nn.functional as F
14
 
15
- class LearnedDownSample(nn.Module):
16
- def __init__(self, layer_type, dim_in):
17
- super().__init__()
18
- self.layer_type = layer_type
19
-
20
- if self.layer_type == 'none':
21
- self.conv = nn.Identity()
22
- elif self.layer_type == 'timepreserve':
23
- self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
24
- elif self.layer_type == 'half':
25
- self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
26
- else:
27
- raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
28
-
29
- def forward(self, x):
30
- return self.conv(x)
31
-
32
- class LearnedUpSample(nn.Module):
33
- def __init__(self, layer_type, dim_in):
34
- super().__init__()
35
- self.layer_type = layer_type
36
-
37
- if self.layer_type == 'none':
38
- self.conv = nn.Identity()
39
- elif self.layer_type == 'timepreserve':
40
- self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
41
- elif self.layer_type == 'half':
42
- self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
43
- else:
44
- raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
45
-
46
-
47
- def forward(self, x):
48
- return self.conv(x)
49
-
50
- class DownSample(nn.Module):
51
- def __init__(self, layer_type):
52
- super().__init__()
53
- self.layer_type = layer_type
54
-
55
- def forward(self, x):
56
- if self.layer_type == 'none':
57
- return x
58
- elif self.layer_type == 'timepreserve':
59
- return F.avg_pool2d(x, (2, 1))
60
- elif self.layer_type == 'half':
61
- if x.shape[-1] % 2 != 0:
62
- x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
63
- return F.avg_pool2d(x, 2)
64
- else:
65
- raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
66
-
67
-
68
- class UpSample(nn.Module):
69
- def __init__(self, layer_type):
70
- super().__init__()
71
- self.layer_type = layer_type
72
-
73
- def forward(self, x):
74
- if self.layer_type == 'none':
75
- return x
76
- elif self.layer_type == 'timepreserve':
77
- return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
78
- elif self.layer_type == 'half':
79
- return F.interpolate(x, scale_factor=2, mode='nearest')
80
- else:
81
- raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
82
-
83
-
84
- class ResBlk(nn.Module):
85
- def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
86
- normalize=False, downsample='none'):
87
- super().__init__()
88
- self.actv = actv
89
- self.normalize = normalize
90
- self.downsample = DownSample(downsample)
91
- self.downsample_res = LearnedDownSample(downsample, dim_in)
92
- self.learned_sc = dim_in != dim_out
93
- self._build_weights(dim_in, dim_out)
94
-
95
- def _build_weights(self, dim_in, dim_out):
96
- self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
97
- self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
98
- if self.normalize:
99
- self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
100
- self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
101
- if self.learned_sc:
102
- self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
103
-
104
- def _shortcut(self, x):
105
- if self.learned_sc:
106
- x = self.conv1x1(x)
107
- if self.downsample:
108
- x = self.downsample(x)
109
- return x
110
-
111
- def _residual(self, x):
112
- if self.normalize:
113
- x = self.norm1(x)
114
- x = self.actv(x)
115
- x = self.conv1(x)
116
- x = self.downsample_res(x)
117
- if self.normalize:
118
- x = self.norm2(x)
119
- x = self.actv(x)
120
- x = self.conv2(x)
121
- return x
122
-
123
- def forward(self, x):
124
- x = self._shortcut(x) + self._residual(x)
125
- return x / np.sqrt(2) # unit variance
126
-
127
  class LinearNorm(torch.nn.Module):
128
  def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
129
  super(LinearNorm, self).__init__()
@@ -136,98 +24,6 @@ class LinearNorm(torch.nn.Module):
136
  def forward(self, x):
137
  return self.linear_layer(x)
138
 
139
- class Discriminator2d(nn.Module):
140
- def __init__(self, dim_in=48, num_domains=1, max_conv_dim=384, repeat_num=4):
141
- super().__init__()
142
- blocks = []
143
- blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
144
-
145
- for lid in range(repeat_num):
146
- dim_out = min(dim_in*2, max_conv_dim)
147
- blocks += [ResBlk(dim_in, dim_out, downsample='half')]
148
- dim_in = dim_out
149
-
150
- blocks += [nn.LeakyReLU(0.2)]
151
- blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
152
- blocks += [nn.LeakyReLU(0.2)]
153
- blocks += [nn.AdaptiveAvgPool2d(1)]
154
- blocks += [spectral_norm(nn.Conv2d(dim_out, num_domains, 1, 1, 0))]
155
- self.main = nn.Sequential(*blocks)
156
-
157
- def get_feature(self, x):
158
- features = []
159
- for l in self.main:
160
- x = l(x)
161
- features.append(x)
162
- out = features[-1]
163
- out = out.view(out.size(0), -1) # (batch, num_domains)
164
- return out, features
165
-
166
- def forward(self, x):
167
- out, features = self.get_feature(x)
168
- out = out.squeeze() # (batch)
169
- return out, features
170
-
171
- class ResBlk1d(nn.Module):
172
- def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
173
- normalize=False, downsample='none', dropout_p=0.2):
174
- super().__init__()
175
- self.actv = actv
176
- self.normalize = normalize
177
- self.downsample_type = downsample
178
- self.learned_sc = dim_in != dim_out
179
- self._build_weights(dim_in, dim_out)
180
- self.dropout_p = dropout_p
181
-
182
- if self.downsample_type == 'none':
183
- self.pool = nn.Identity()
184
- else:
185
- self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
186
-
187
- def _build_weights(self, dim_in, dim_out):
188
- self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
189
- self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
190
- if self.normalize:
191
- self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
192
- self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
193
- if self.learned_sc:
194
- self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
195
-
196
- def downsample(self, x):
197
- if self.downsample_type == 'none':
198
- return x
199
- else:
200
- if x.shape[-1] % 2 != 0:
201
- x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
202
- return F.avg_pool1d(x, 2)
203
-
204
- def _shortcut(self, x):
205
- if self.learned_sc:
206
- x = self.conv1x1(x)
207
- x = self.downsample(x)
208
- return x
209
-
210
- def _residual(self, x):
211
- if self.normalize:
212
- x = self.norm1(x)
213
- x = self.actv(x)
214
- x = F.dropout(x, p=self.dropout_p, training=self.training)
215
-
216
- x = self.conv1(x)
217
- x = self.pool(x)
218
- if self.normalize:
219
- x = self.norm2(x)
220
-
221
- x = self.actv(x)
222
- x = F.dropout(x, p=self.dropout_p, training=self.training)
223
-
224
- x = self.conv2(x)
225
- return x
226
-
227
- def forward(self, x):
228
- x = self._shortcut(x) + self._residual(x)
229
- return x / np.sqrt(2) # unit variance
230
-
231
  class LayerNorm(nn.Module):
232
  def __init__(self, channels, eps=1e-5):
233
  super().__init__()
@@ -306,19 +102,6 @@ class TextEncoder(nn.Module):
306
  return mask
307
 
308
 
309
-
310
- class AdaIN1d(nn.Module):
311
- def __init__(self, style_dim, num_features):
312
- super().__init__()
313
- self.norm = nn.InstanceNorm1d(num_features, affine=False)
314
- self.fc = nn.Linear(style_dim, num_features*2)
315
-
316
- def forward(self, x, s):
317
- h = self.fc(s)
318
- h = h.view(h.size(0), h.size(1), 1)
319
- gamma, beta = torch.chunk(h, chunks=2, dim=1)
320
- return (1 + gamma) * self.norm(x) + beta
321
-
322
  class UpSample1d(nn.Module):
323
  def __init__(self, layer_type):
324
  super().__init__()
@@ -474,7 +257,7 @@ class ProsodyPredictor(nn.Module):
474
  mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
475
  mask = torch.gt(mask+1, lengths.unsqueeze(1))
476
  return mask
477
-
478
  class DurationEncoder(nn.Module):
479
 
480
  def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
 
1
  # https://github.com/yl4579/StyleTTS2/blob/main/models.py
2
+ from istftnet import AdaIN1d, Decoder
3
  from munch import Munch
4
  from pathlib import Path
5
  from plbert import load_plbert
 
12
  import torch.nn as nn
13
  import torch.nn.functional as F
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  class LinearNorm(torch.nn.Module):
16
  def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
17
  super(LinearNorm, self).__init__()
 
24
  def forward(self, x):
25
  return self.linear_layer(x)
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  class LayerNorm(nn.Module):
28
  def __init__(self, channels, eps=1e-5):
29
  super().__init__()
 
102
  return mask
103
 
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  class UpSample1d(nn.Module):
106
  def __init__(self, layer_type):
107
  super().__init__()
 
257
  mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
258
  mask = torch.gt(mask+1, lengths.unsqueeze(1))
259
  return mask
260
+
261
  class DurationEncoder(nn.Module):
262
 
263
  def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):