pretrain core 0
Browse files
scripts/pretrain-core-model-0.yaml
CHANGED
@@ -64,7 +64,7 @@ train:
   # global_batch_size: 64

   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size:
+  micro_batch_size: 5
   # micro_batch_size: 4
   # micro_batch_size: 2
   # micro_batch_size: 1
@@ -113,8 +113,8 @@ eval:
 # Optimizer-related arguments

 optimizer:
-
-  class_path: torchao.prototype.low_bit_optim.AdamW8bit
+  class_path: torch.optim.AdamW
+  # class_path: torchao.prototype.low_bit_optim.AdamW8bit
   # class_path: torchao.prototype.low_bit_optim.AdamW4bit
   # class_path: bitsandbytes.optim.AdamW8bit
   # class_path: bitsandbytes.optim.PagedAdamW8bit