Yehor committed on
Commit
71a422e
·
1 Parent(s): a800743
Files changed (2) hide show
  1. cleaners.py +0 -52
  2. data.py +49 -2
cleaners.py DELETED
@@ -1,52 +0,0 @@
1
- """adapted from https://github.com/keithito/tacotron"""
2
-
3
- import re
4
- from string import punctuation
5
- from functools import reduce
6
-
7
-
8
- # Regular expression matching whitespace:
9
- _whitespace_re = re.compile(r"\s+")
10
-
11
- # Regular expression separating words enclosed in curly braces for cleaning
12
- _arpa_re = re.compile(r"{[^}]+}|\S+")
13
-
14
-
15
- def lowercase(text):
16
- return text.lower()
17
-
18
-
19
- def collapse_whitespace(text):
20
- return re.sub(_whitespace_re, " ", text)
21
-
22
-
23
- def remove_space_before_punctuation(text):
24
- return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r"\1", text)
25
-
26
-
27
- class Cleaner(object):
28
- def __init__(self, cleaner_names, phonemedict):
29
- self.cleaner_names = cleaner_names
30
- self.phonemedict = phonemedict
31
-
32
- def __call__(self, text):
33
- for cleaner_name in self.cleaner_names:
34
- sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
35
- for fn in sequence_fns:
36
- text = fn(text)
37
-
38
- text = [
39
- reduce(lambda x, y: y(x), word_fns, split) if split[0] != "{" else split
40
- for split in _arpa_re.findall(text)
41
- ]
42
- text = " ".join(text)
43
-
44
- text = remove_space_before_punctuation(text)
45
-
46
- return text
47
-
48
- def get_cleaner_fns(self, cleaner_name):
49
- sequence_fns = [lowercase, collapse_whitespace]
50
- word_fns = []
51
-
52
- return sequence_fns, word_fns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data.py CHANGED
@@ -41,12 +41,13 @@
41
  """adapted from https://github.com/keithito/tacotron"""
42
 
43
  import re
 
 
 
44
 
45
  import torch
46
  import torch.utils.data
47
 
48
- from .cleaners import Cleaner
49
-
50
  #########
51
  # REGEX #
52
  #########
@@ -54,6 +55,52 @@ from .cleaners import Cleaner
54
  # Regular expression matching text enclosed in curly braces for encoding
55
  _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  def get_symbols():
59
  _punctuation = "'.,?! "
 
41
  """adapted from https://github.com/keithito/tacotron"""
42
 
43
  import re
44
+ from string import punctuation
45
+ from functools import reduce
46
+
47
 
48
  import torch
49
  import torch.utils.data
50
 
 
 
51
  #########
52
  # REGEX #
53
  #########
 
55
  # Regular expression matching text enclosed in curly braces for encoding
56
  _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
57
 
58
+ # Regular expression matching whitespace:
59
+ _whitespace_re = re.compile(r"\s+")
60
+
61
+ # Regular expression separating words enclosed in curly braces for cleaning
62
+ _arpa_re = re.compile(r"{[^}]+}|\S+")
63
+
64
+
65
+ def lowercase(text):
66
+ return text.lower()
67
+
68
+
69
+ def collapse_whitespace(text):
70
+ return re.sub(_whitespace_re, " ", text)
71
+
72
+
73
+ def remove_space_before_punctuation(text):
74
+ return re.sub(r"\s([{}](?:\s|$))".format(punctuation), r"\1", text)
75
+
76
+
77
+ class Cleaner:
78
+ def __init__(self, cleaner_names, phonemedict):
79
+ self.cleaner_names = cleaner_names
80
+ self.phonemedict = phonemedict
81
+
82
+ def __call__(self, text):
83
+ for cleaner_name in self.cleaner_names:
84
+ sequence_fns, word_fns = self.get_cleaner_fns(cleaner_name)
85
+ for fn in sequence_fns:
86
+ text = fn(text)
87
+
88
+ text = [
89
+ reduce(lambda x, y: y(x), word_fns, split) if split[0] != "{" else split
90
+ for split in _arpa_re.findall(text)
91
+ ]
92
+ text = " ".join(text)
93
+
94
+ text = remove_space_before_punctuation(text)
95
+
96
+ return text
97
+
98
+ def get_cleaner_fns(self, cleaner_name):
99
+ sequence_fns = [lowercase, collapse_whitespace]
100
+ word_fns = []
101
+
102
+ return sequence_fns, word_fns
103
+
104
 
105
  def get_symbols():
106
  _punctuation = "'.,?! "