pretrain core model
scripts/prepare_core_datasets.py (changed)
@@ -8,6 +8,9 @@ from utils import tokenize_fn
 from core_base_datasets import core_base_datasets
 from core_instruct_datasets import core_instruct_datasets
 
+
+tokenizer_path = '../tokenizer'
+
 seqs = [
     (0, 1073741824, 4097, 4000),
 ]
@@ -24,8 +27,8 @@ for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
         tokenize_fn,
         min_len=min_len,
         max_len=max_len,
-        hf_tokenizer=AutoTokenizer.from_pretrained(
-        tokenizer=Tokenizer(
+        hf_tokenizer=AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, use_fast=True),
+        tokenizer=Tokenizer(tokenizer_path),
     ),
     inputs=core_base_datasets + core_instruct_datasets,
     output_dir=output_dir,
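For context, a minimal standalone sketch of the tokenizer wiring this commit adds. It assumes '../tokenizer' holds a Hugging Face-compatible tokenizer directory; Tokenizer(tokenizer_path) refers to the project's own class imported elsewhere in the script, and tokenize_fn's internals are not part of this diff, so both are left out here.

# Sketch only (not part of the commit): load the tokenizer the same way the
# updated script does and unpack the seqs tuples from the loop shown in the
# second hunk header.
from transformers import AutoTokenizer

tokenizer_path = '../tokenizer'  # assumed local tokenizer directory

hf_tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path,
    trust_remote_code=True,
    use_fast=True,
)

seqs = [
    # (min_len, max_len, block_size, subchunk_size); max_len of 1073741824
    # (2**30) is presumably meant as an effectively unlimited upper bound.
    (0, 1073741824, 4097, 4000),
]

for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
    ids = hf_tokenizer('hello world')['input_ids']
    print(i, min_len, max_len, block_size, subchunk_size, len(ids))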