# sam2 / config_resolved.yaml
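# Fully resolved training configuration (all Hydra/OmegaConf ${...}
# interpolations expanded to literal values) for fine-tuning SAM 2.1 with the
# Hiera-L image encoder on a custom video object segmentation dataset.
# A rough sketch of a local launch, assuming the upstream SAM 2 training
# entrypoint (training/train.py) and its documented flags:
#   python training/train.py -c <path/to/this/config>.yaml --use-cluster 0 --num-gpus 4
# The `scratch` block below collects the user-tunable hyperparameters; in this
# resolved file the same values also appear inline deeper in the tree.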
scratch:
  resolution: 1024
  train_batch_size: 1
  num_train_workers: 3
  num_frames: 8
  max_num_objects: 4
  base_lr: 5.0e-06
  vision_lr: 3.0e-06
  phases_per_epoch: 1
  num_epochs: 40
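# Dataset paths in DAVIS/MOSE-style layout: one folder of JPEG frames per video
# under JPEGImages and matching PNG masks under Annotations. file_list_txt is an
# optional subset list of videos; with null, every video found in the folders is used.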
dataset:
  img_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/JPEGImages
  gt_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/Annotations
  file_list_txt: null
  multiplier: 2
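# Per-clip augmentations. consistent_transform: true applies one sampled set of
# parameters to every frame of a clip (keeping frames aligned for tracking);
# the second ColorJitter uses consistent_transform: false to add small
# per-frame photometric jitter on top.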
vos:
  train_transforms:
  - _target_: training.dataset.transforms.ComposeAPI
    transforms:
    - _target_: training.dataset.transforms.RandomHorizontalFlip
      consistent_transform: true
    - _target_: training.dataset.transforms.RandomAffine
      degrees: 25
      shear: 20
      image_interpolation: bilinear
      consistent_transform: true
    - _target_: training.dataset.transforms.RandomResizeAPI
      sizes: 1024
      square: true
      consistent_transform: true
    - _target_: training.dataset.transforms.ColorJitter
      consistent_transform: true
      brightness: 0.1
      contrast: 0.03
      saturation: 0.03
      hue: null
    - _target_: training.dataset.transforms.RandomGrayscale
      p: 0.05
      consistent_transform: true
    - _target_: training.dataset.transforms.ColorJitter
      consistent_transform: false
      brightness: 0.1
      contrast: 0.05
      saturation: 0.05
      hue: null
    - _target_: training.dataset.transforms.ToTensorAPI
    - _target_: training.dataset.transforms.NormalizeAPI
      mean:
      - 0.485
      - 0.456
      - 0.406
      std:
      - 0.229
      - 0.224
      - 0.225
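# Trainer: wires together model, data, optimizer, loss, logging and
# checkpointing; 40 epochs on CUDA with a fixed seed.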
trainer:
  _target_: training.trainer.Trainer
  mode: train_only
  max_epochs: 40
  accelerator: cuda
  seed_value: 123
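  # SAM2Train wraps the SAM 2 architecture with training-time prompt/correction
  # sampling. The trunk hyperparameters below (embed_dim 144, stages [2, 6, 36, 4])
  # correspond to the Hiera-L backbone, matching the sam2.1_hiera_large.pt
  # checkpoint loaded at the end of this file.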
  model:
    _target_: training.model.sam2.SAM2Train
    image_encoder:
      _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
      scalp: 1
      trunk:
        _target_: sam2.modeling.backbones.hieradet.Hiera
        embed_dim: 144
        num_heads: 2
        stages:
        - 2
        - 6
        - 36
        - 4
        global_att_blocks:
        - 23
        - 33
        - 43
        window_pos_embed_bkg_spatial_size:
        - 7
        - 7
        window_spec:
        - 8
        - 4
        - 16
        - 8
      neck:
        _target_: sam2.modeling.backbones.image_encoder.FpnNeck
        position_encoding:
          _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
          num_pos_feats: 256
          normalize: true
          scale: null
          temperature: 10000
        d_model: 256
        backbone_channel_list:
        - 1152
        - 576
        - 288
        - 144
        fpn_top_down_levels:
        - 2
        - 3
        fpn_interp_model: nearest
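    # Memory attention: 4 transformer layers in which the current frame's
    # features self-attend and cross-attend (RoPE attention) to the memory
    # features and object pointers of previous frames.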
    memory_attention:
      _target_: sam2.modeling.memory_attention.MemoryAttention
      d_model: 256
      pos_enc_at_input: true
      layer:
        _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
        activation: relu
        dim_feedforward: 2048
        dropout: 0.1
        pos_enc_at_attn: false
        self_attention:
          _target_: sam2.modeling.sam.transformer.RoPEAttention
          rope_theta: 10000.0
          feat_sizes:
          - 64
          - 64
          embedding_dim: 256
          num_heads: 1
          downsample_rate: 1
          dropout: 0.1
        d_model: 256
        pos_enc_at_cross_attn_keys: true
        pos_enc_at_cross_attn_queries: false
        cross_attention:
          _target_: sam2.modeling.sam.transformer.RoPEAttention
          rope_theta: 10000.0
          feat_sizes:
          - 64
          - 64
          rope_k_repeat: true
          embedding_dim: 256
          num_heads: 1
          downsample_rate: 1
          dropout: 0.1
          kv_in_dim: 64
      num_layers: 4
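    # Memory encoder: downsamples the predicted mask, fuses it with the image
    # features (ConvNeXt-style CXBlock fuser) and projects the result to a
    # 64-dim per-frame memory.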
    memory_encoder:
      _target_: sam2.modeling.memory_encoder.MemoryEncoder
      out_dim: 64
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 64
        normalize: true
        scale: null
        temperature: 10000
      mask_downsampler:
        _target_: sam2.modeling.memory_encoder.MaskDownSampler
        kernel_size: 3
        stride: 2
        padding: 1
      fuser:
        _target_: sam2.modeling.memory_encoder.Fuser
        layer:
          _target_: sam2.modeling.memory_encoder.CXBlock
          dim: 256
          kernel_size: 7
          padding: 3
          layer_scale_init_value: 1.0e-06
          use_dwconv: true
        num_layers: 2
    num_maskmem: 7
    image_size: 1024
    sigmoid_scale_for_mem_enc: 20.0
    sigmoid_bias_for_mem_enc: -10.0
    use_mask_input_as_output_without_sam: true
    directly_add_no_mem_embed: true
    no_obj_embed_spatial: true
    use_high_res_features_in_sam: true
    multimask_output_in_sam: true
    iou_prediction_use_sigmoid: true
    use_obj_ptrs_in_encoder: true
    add_tpos_enc_to_obj_ptrs: true
    proj_tpos_enc_in_obj_ptrs: true
    use_signed_tpos_enc_to_obj_ptrs: true
    only_obj_ptrs_in_the_past_for_eval: true
    pred_obj_scores: true
    pred_obj_scores_mlp: true
    fixed_no_obj_ptr: true
    multimask_output_for_tracking: true
    use_multimask_token_for_obj_ptr: true
    multimask_min_pt_num: 0
    multimask_max_pt_num: 1
    use_mlp_for_obj_ptr_proj: true
    compile_image_encoder: false
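    # Interactive-training simulation: half of the training clips use point/box
    # prompts (half of those as boxes), up to 2 initial conditioning frames,
    # 1-2 corrected frames per clip, and up to 7 correction clicks per corrected frame.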
    prob_to_use_pt_input_for_train: 0.5
    prob_to_use_pt_input_for_eval: 0.0
    prob_to_use_box_input_for_train: 0.5
    prob_to_use_box_input_for_eval: 0.0
    prob_to_sample_from_gt_for_train: 0.1
    num_frames_to_correct_for_train: 2
    num_frames_to_correct_for_eval: 1
    rand_frames_to_correct_for_train: true
    add_all_frames_to_correct_as_cond: true
    num_init_cond_frames_for_train: 2
    rand_init_cond_frames_for_train: true
    num_correction_pt_per_frame: 7
    use_act_ckpt_iterative_pt_sampling: false
    num_init_cond_frames_for_eval: 1
    forward_backbone_per_frame_for_eval: true
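  # Train dataloader: batches of 1 clip, 8 frames per clip, at most 4 objects
  # per clip, 3 worker processes. The transforms below duplicate
  # vos.train_transforms because interpolations are expanded in this file.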
  data:
    train:
      _target_: training.dataset.sam2_datasets.TorchTrainMixedDataset
      phases_per_epoch: 1
      batch_sizes:
      - 1
      datasets:
      - _target_: training.dataset.utils.RepeatFactorWrapper
        dataset:
          _target_: training.dataset.utils.ConcatDataset
          datasets:
          - _target_: training.dataset.vos_dataset.VOSDataset
            transforms:
            - _target_: training.dataset.transforms.ComposeAPI
              transforms:
              - _target_: training.dataset.transforms.RandomHorizontalFlip
                consistent_transform: true
              - _target_: training.dataset.transforms.RandomAffine
                degrees: 25
                shear: 20
                image_interpolation: bilinear
                consistent_transform: true
              - _target_: training.dataset.transforms.RandomResizeAPI
                sizes: 1024
                square: true
                consistent_transform: true
              - _target_: training.dataset.transforms.ColorJitter
                consistent_transform: true
                brightness: 0.1
                contrast: 0.03
                saturation: 0.03
                hue: null
              - _target_: training.dataset.transforms.RandomGrayscale
                p: 0.05
                consistent_transform: true
              - _target_: training.dataset.transforms.ColorJitter
                consistent_transform: false
                brightness: 0.1
                contrast: 0.05
                saturation: 0.05
                hue: null
              - _target_: training.dataset.transforms.ToTensorAPI
              - _target_: training.dataset.transforms.NormalizeAPI
                mean:
                - 0.485
                - 0.456
                - 0.406
                std:
                - 0.229
                - 0.224
                - 0.225
            training: true
            video_dataset:
              _target_: training.dataset.vos_raw_dataset.PNGRawDataset
              img_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/JPEGImages
              gt_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/Annotations
              file_list_txt: null
            sampler:
              _target_: training.dataset.vos_sampler.RandomUniformSampler
              num_frames: 8
              max_num_objects: 4
            multiplier: 2
      shuffle: true
      num_workers: 3
      pin_memory: true
      drop_last: true
      collate_fn:
        _target_: training.utils.data_utils.collate_fn
        _partial_: true
        dict_key: all
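  # Optimization: AdamW under bfloat16 autocast, gradient-norm clipping at 0.1,
  # and layer-wise lr decay (0.9) over the image-encoder trunk. Cosine schedules
  # decay the base lr from 5e-06 to 5e-07 and the image-encoder lr from 3e-06 to
  # 3e-07; weight decay is 0.1 except for biases and LayerNorm weights (0.0).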
  optim:
    amp:
      enabled: true
      amp_dtype: bfloat16
    optimizer:
      _target_: torch.optim.AdamW
    gradient_clip:
      _target_: training.optimizer.GradientClipper
      max_norm: 0.1
      norm_type: 2
    param_group_modifiers:
    - _target_: training.optimizer.layer_decay_param_modifier
      _partial_: true
      layer_decay_value: 0.9
      apply_to: image_encoder.trunk
      overrides:
      - pattern: '*pos_embed*'
        value: 1.0
    options:
      lr:
      - scheduler:
          _target_: fvcore.common.param_scheduler.CosineParamScheduler
          start_value: 5.0e-06
          end_value: 5.000000000000001e-07
      - scheduler:
          _target_: fvcore.common.param_scheduler.CosineParamScheduler
          start_value: 3.0e-06
          end_value: 3.0e-07
        param_names:
        - image_encoder.*
      weight_decay:
      - scheduler:
          _target_: fvcore.common.param_scheduler.ConstantParamScheduler
          value: 0.1
      - scheduler:
          _target_: fvcore.common.param_scheduler.ConstantParamScheduler
          value: 0.0
        param_names:
        - '*bias*'
        module_cls_names:
        - torch.nn.LayerNorm
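  # Multi-step loss over the per-frame mask predictions: the mask term is
  # weighted 20x relative to the dice, IoU and object-score terms.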
  loss:
    all:
      _target_: training.loss_fns.MultiStepMultiMasksAndIous
      weight_dict:
        loss_mask: 20
        loss_dice: 1
        loss_iou: 1
        loss_class: 1
      supervise_all_iou: true
      iou_use_l1_loss: true
      pred_obj_scores: true
      focal_gamma_obj_score: 0.0
      focal_alpha_obj_score: -1.0
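  # DDP over NCCL; find_unused_parameters tolerates parameters that receive no
  # gradient on a given iteration. TensorBoard and text logs go under the
  # experiment directory, written every log_freq iterations.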
  distributed:
    backend: nccl
    find_unused_parameters: true
  logging:
    tensorboard_writer:
      _target_: training.utils.logger.make_tensorboard_logger
      log_dir: /ephemeral/hossein/output/sam2/tensorboard
      flush_secs: 120
      should_log: true
    log_dir: /ephemeral/hossein/output/sam2/logs
    log_freq: 10
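  # Checkpointing: save_freq of 1 keeps a checkpoint per epoch; weights are
  # initialized from the released SAM 2.1 Hiera-L checkpoint with strict key matching.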
  checkpoint:
    save_dir: /ephemeral/hossein/output/sam2/checkpoints
    save_freq: 1
    model_weight_initializer:
      _partial_: true
      _target_: training.utils.checkpoint_utils.load_state_dict_into_model
      strict: true
      ignore_unexpected_keys: null
      ignore_missing_keys: null
      state_dict:
        _target_: training.utils.checkpoint_utils.load_checkpoint_and_apply_kernels
        checkpoint_path: /home/hossein/hossein/projects/sam2/checkpoints/sam2.1_hiera_large.pt
        ckpt_state_dict_keys:
        - model
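# Local single-node launch on 4 GPUs (use_cluster: false); the submitit/SLURM
# fields are unused and left at null.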
launcher:
  num_nodes: 1
  gpus_per_node: 4
  experiment_log_dir: /ephemeral/hossein/output/sam2
submitit:
  partition: null
  account: null
  qos: null
  cpus_per_task: 10
  use_cluster: false
  timeout_hour: 24
  name: null
  port_range:
  - 10000
  - 65000