mtasic85 committed
Commit dab3f73 · 1 Parent(s): 8500ccf

pretrain core model

Files changed (1)
  1. scripts/prepare_core_datasets.py +5 -2
scripts/prepare_core_datasets.py CHANGED
@@ -8,6 +8,9 @@ from utils import tokenize_fn
 from core_base_datasets import core_base_datasets
 from core_instruct_datasets import core_instruct_datasets
 
+
+tokenizer_path = '../tokenizer'
+
 seqs = [
     (0, 1073741824, 4097, 4000),
 ]
@@ -24,8 +27,8 @@ for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
             tokenize_fn,
             min_len=min_len,
             max_len=max_len,
-            hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
-            tokenizer=Tokenizer('..'),
+            hf_tokenizer=AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, use_fast=True),
+            tokenizer=Tokenizer(tokenizer_path),
         ),
     inputs=core_base_datasets + core_instruct_datasets,
     output_dir=output_dir,
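
For context, the sketch below shows how the changed portion of scripts/prepare_core_datasets.py might read after this commit. Only the lines visible in the two hunks above come from the diff; the functools.partial wrapper, the litdata.optimize call and its chunk_size/num_workers arguments, the litgpt Tokenizer import path, and the output_dir value are assumptions added only to make the sketch self-contained.

# Sketch of the changed section after this commit.
# Lines not shown in the diff hunks are assumptions (see note above).
from functools import partial

from transformers import AutoTokenizer
from litgpt.tokenizer import Tokenizer   # assumed import path; the diff only shows the call site
from litdata import optimize             # assumed; the diff shows an inputs/output_dir call pattern

from utils import tokenize_fn
from core_base_datasets import core_base_datasets
from core_instruct_datasets import core_instruct_datasets

# New in this commit: both tokenizers are loaded from a shared path instead of '..'
tokenizer_path = '../tokenizer'

# (min_len, max_len, block_size, subchunk_size)
seqs = [
    (0, 1073741824, 4097, 4000),
]

for i, (min_len, max_len, block_size, subchunk_size) in enumerate(seqs):
    # hypothetical output directory name; not visible in the hunks
    output_dir = f'../core-data-{i}-{min_len}-{max_len}-{block_size}-{subchunk_size}'

    optimize(
        fn=partial(
            tokenize_fn,
            min_len=min_len,
            max_len=max_len,
            hf_tokenizer=AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, use_fast=True),
            tokenizer=Tokenizer(tokenizer_path),
        ),
        inputs=core_base_datasets + core_instruct_datasets,
        output_dir=output_dir,
        chunk_size=block_size * subchunk_size,  # hypothetical; not visible in the hunks
        num_workers=32,                         # hypothetical
    )

The substantive change is small: both the Hugging Face AutoTokenizer and the plain Tokenizer are now constructed from the shared tokenizer_path variable ('../tokenizer') rather than the repository root '..'.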