File size: 2,520 Bytes
274e641
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
{
    "backbone": {
        "d_model": 2048,
        "d_intermediate": 0,
        "attn_mlp_d_intermediate": 8192,
        "n_layer": 26,
        "ssm_cfg": {},
        "attn_layer_idx": [
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20,
            21,
            22,
            23,
            24,
            25
        ],
        "attn_cfg": {
            "causal": true,
            "num_heads": 16,
            "num_heads_kv": 4,
            "rotary_emb_dim": 128,
            "rotary_emb_interleaved": true,
            "qkv_proj_bias": false,
            "out_proj_bias": false
        },
        "rms_norm": false,
        "residual_in_fp32": false,
        "norm_epsilon": 1e-05
    },
    "prefix_conditioner": {
        "conditioners": [
            {
                "type": "EspeakPhonemeConditioner",
                "name": "espeak"
            },
            {
                "cond_dim": 128,
                "uncond_type": "learned",
                "projection": "linear",
                "type": "PassthroughConditioner",
                "name": "speaker"
            },
            {
                "input_dim": 8,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "emotion"
            },
            {
                "min_val": 0,
                "max_val": 24000,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "fmax"
            },
            {
                "min_val": 0,
                "max_val": 400,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "pitch_std"
            },
            {
                "min_val": 0,
                "max_val": 40,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "speaking_rate"
            },
            {
                "min_val": -1,
                "max_val": 126,
                "uncond_type": "learned",
                "type": "IntegerConditioner",
                "name": "language_id"
            }
        ],
        "projection": "linear"
    },
    "eos_token_id": 1024,
    "masked_token_id": 1025
}