scratch:
  resolution: 1024
  train_batch_size: 1
  num_train_workers: 3
  num_frames: 8
  max_num_objects: 4
  base_lr: 5.0e-06
  vision_lr: 3.0e-06
  phases_per_epoch: 1
  num_epochs: 40

dataset:
  img_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/JPEGImages
  gt_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/Annotations
  file_list_txt: null
  multiplier: 2

vos:
  train_transforms:
    - _target_: training.dataset.transforms.ComposeAPI
      transforms:
        - _target_: training.dataset.transforms.RandomHorizontalFlip
          consistent_transform: true
        - _target_: training.dataset.transforms.RandomAffine
          degrees: 25
          shear: 20
          image_interpolation: bilinear
          consistent_transform: true
        - _target_: training.dataset.transforms.RandomResizeAPI
          sizes: 1024
          square: true
          consistent_transform: true
        - _target_: training.dataset.transforms.ColorJitter
          consistent_transform: true
          brightness: 0.1
          contrast: 0.03
          saturation: 0.03
          hue: null
        - _target_: training.dataset.transforms.RandomGrayscale
          p: 0.05
          consistent_transform: true
        - _target_: training.dataset.transforms.ColorJitter
          consistent_transform: false
          brightness: 0.1
          contrast: 0.05
          saturation: 0.05
          hue: null
        - _target_: training.dataset.transforms.ToTensorAPI
        - _target_: training.dataset.transforms.NormalizeAPI
          mean:
            - 0.485
            - 0.456
            - 0.406
          std:
            - 0.229
            - 0.224
            - 0.225

trainer:
  _target_: training.trainer.Trainer
  mode: train_only
  max_epochs: 40
  accelerator: cuda
  seed_value: 123

  model:
    _target_: training.model.sam2.SAM2Train
    image_encoder:
      _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
      scalp: 1
      trunk:
        _target_: sam2.modeling.backbones.hieradet.Hiera
        embed_dim: 144
        num_heads: 2
        stages:
          - 2
          - 6
          - 36
          - 4
        global_att_blocks:
          - 23
          - 33
          - 43
        window_pos_embed_bkg_spatial_size:
          - 7
          - 7
        window_spec:
          - 8
          - 4
          - 16
          - 8
      neck:
        _target_: sam2.modeling.backbones.image_encoder.FpnNeck
        position_encoding:
          _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
          num_pos_feats: 256
          normalize: true
          scale: null
          temperature: 10000
        d_model: 256
        backbone_channel_list:
          - 1152
          - 576
          - 288
          - 144
        fpn_top_down_levels:
          - 2
          - 3
        fpn_interp_model: nearest

    memory_attention:
      _target_: sam2.modeling.memory_attention.MemoryAttention
      d_model: 256
      pos_enc_at_input: true
      layer:
        _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
        activation: relu
        dim_feedforward: 2048
        dropout: 0.1
        pos_enc_at_attn: false
        self_attention:
          _target_: sam2.modeling.sam.transformer.RoPEAttention
          rope_theta: 10000.0
          feat_sizes:
            - 64
            - 64
          embedding_dim: 256
          num_heads: 1
          downsample_rate: 1
          dropout: 0.1
        d_model: 256
        pos_enc_at_cross_attn_keys: true
        pos_enc_at_cross_attn_queries: false
        cross_attention:
          _target_: sam2.modeling.sam.transformer.RoPEAttention
          rope_theta: 10000.0
          feat_sizes:
            - 64
            - 64
          rope_k_repeat: true
          embedding_dim: 256
          num_heads: 1
          downsample_rate: 1
          dropout: 0.1
          kv_in_dim: 64
      num_layers: 4

    memory_encoder:
      _target_: sam2.modeling.memory_encoder.MemoryEncoder
      out_dim: 64
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 64
        normalize: true
        scale: null
        temperature: 10000
      mask_downsampler:
        _target_: sam2.modeling.memory_encoder.MaskDownSampler
        kernel_size: 3
        stride: 2
        padding: 1
      fuser:
        _target_: sam2.modeling.memory_encoder.Fuser
        layer:
          _target_: sam2.modeling.memory_encoder.CXBlock
          dim: 256
          kernel_size: 7
          padding: 3
          layer_scale_init_value: 1.0e-06
          use_dwconv: true
        num_layers: 2

    num_maskmem: 7
    image_size: 1024
    sigmoid_scale_for_mem_enc: 20.0
    sigmoid_bias_for_mem_enc: -10.0
    use_mask_input_as_output_without_sam: true
    directly_add_no_mem_embed: true
    no_obj_embed_spatial: true
    use_high_res_features_in_sam: true
    multimask_output_in_sam: true
    iou_prediction_use_sigmoid: true
    use_obj_ptrs_in_encoder: true
    add_tpos_enc_to_obj_ptrs: true
    proj_tpos_enc_in_obj_ptrs: true
    use_signed_tpos_enc_to_obj_ptrs: true
    only_obj_ptrs_in_the_past_for_eval: true
    pred_obj_scores: true
    pred_obj_scores_mlp: true
    fixed_no_obj_ptr: true
    multimask_output_for_tracking: true
    use_multimask_token_for_obj_ptr: true
    multimask_min_pt_num: 0
    multimask_max_pt_num: 1
    use_mlp_for_obj_ptr_proj: true
    compile_image_encoder: false
    prob_to_use_pt_input_for_train: 0.5
    prob_to_use_pt_input_for_eval: 0.0
    prob_to_use_box_input_for_train: 0.5
    prob_to_use_box_input_for_eval: 0.0
    prob_to_sample_from_gt_for_train: 0.1
    num_frames_to_correct_for_train: 2
    num_frames_to_correct_for_eval: 1
    rand_frames_to_correct_for_train: true
    add_all_frames_to_correct_as_cond: true
    num_init_cond_frames_for_train: 2
    rand_init_cond_frames_for_train: true
    num_correction_pt_per_frame: 7
    use_act_ckpt_iterative_pt_sampling: false
    num_init_cond_frames_for_eval: 1
    forward_backbone_per_frame_for_eval: true

  data:
    train:
      _target_: training.dataset.sam2_datasets.TorchTrainMixedDataset
      phases_per_epoch: 1
      batch_sizes:
        - 1
      datasets:
        - _target_: training.dataset.utils.RepeatFactorWrapper
          dataset:
            _target_: training.dataset.utils.ConcatDataset
            datasets:
              - _target_: training.dataset.vos_dataset.VOSDataset
                transforms:
                  - _target_: training.dataset.transforms.ComposeAPI
                    transforms:
                      - _target_: training.dataset.transforms.RandomHorizontalFlip
                        consistent_transform: true
                      - _target_: training.dataset.transforms.RandomAffine
                        degrees: 25
                        shear: 20
                        image_interpolation: bilinear
                        consistent_transform: true
                      - _target_: training.dataset.transforms.RandomResizeAPI
                        sizes: 1024
                        square: true
                        consistent_transform: true
                      - _target_: training.dataset.transforms.ColorJitter
                        consistent_transform: true
                        brightness: 0.1
                        contrast: 0.03
                        saturation: 0.03
                        hue: null
                      - _target_: training.dataset.transforms.RandomGrayscale
                        p: 0.05
                        consistent_transform: true
                      - _target_: training.dataset.transforms.ColorJitter
                        consistent_transform: false
                        brightness: 0.1
                        contrast: 0.05
                        saturation: 0.05
                        hue: null
                      - _target_: training.dataset.transforms.ToTensorAPI
                      - _target_: training.dataset.transforms.NormalizeAPI
                        mean:
                          - 0.485
                          - 0.456
                          - 0.406
                        std:
                          - 0.229
                          - 0.224
                          - 0.225
                training: true
                video_dataset:
                  _target_: training.dataset.vos_raw_dataset.PNGRawDataset
                  img_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/JPEGImages
                  gt_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/Annotations
                  file_list_txt: null
                sampler:
                  _target_: training.dataset.vos_sampler.RandomUniformSampler
                  num_frames: 8
                  max_num_objects: 4
                multiplier: 2
      shuffle: true
      num_workers: 3
      pin_memory: true
      drop_last: true
      collate_fn:
        _target_: training.utils.data_utils.collate_fn
        _partial_: true
        dict_key: all

  optim:
    amp:
      enabled: true
      amp_dtype: bfloat16
    optimizer:
      _target_: torch.optim.AdamW
    gradient_clip:
      _target_: training.optimizer.GradientClipper
      max_norm: 0.1
      norm_type: 2
    param_group_modifiers:
      - _target_: training.optimizer.layer_decay_param_modifier
        _partial_: true
        layer_decay_value: 0.9
        apply_to: image_encoder.trunk
        overrides:
          - pattern: '*pos_embed*'
            value: 1.0
    options:
      lr:
        - scheduler:
            _target_: fvcore.common.param_scheduler.CosineParamScheduler
            start_value: 5.0e-06
            end_value: 5.000000000000001e-07
        - scheduler:
            _target_: fvcore.common.param_scheduler.CosineParamScheduler
            start_value: 3.0e-06
            end_value: 3.0e-07
          param_names:
            - image_encoder.*
      weight_decay:
        - scheduler:
            _target_: fvcore.common.param_scheduler.ConstantParamScheduler
            value: 0.1
        - scheduler:
            _target_: fvcore.common.param_scheduler.ConstantParamScheduler
            value: 0.0
          param_names:
            - '*bias*'
          module_cls_names:
            - torch.nn.LayerNorm

  loss:
    all:
      _target_: training.loss_fns.MultiStepMultiMasksAndIous
      weight_dict:
        loss_mask: 20
        loss_dice: 1
        loss_iou: 1
        loss_class: 1
      supervise_all_iou: true
      iou_use_l1_loss: true
      pred_obj_scores: true
      focal_gamma_obj_score: 0.0
      focal_alpha_obj_score: -1.0

  distributed:
    backend: nccl
    find_unused_parameters: true

  logging:
    tensorboard_writer:
      _target_: training.utils.logger.make_tensorboard_logger
      log_dir: /ephemeral/hossein/output/sam2/tensorboard
      flush_secs: 120
      should_log: true
    log_dir: /ephemeral/hossein/output/sam2/logs
    log_freq: 10

  checkpoint:
    save_dir: /ephemeral/hossein/output/sam2/checkpoints
    save_freq: 1
    model_weight_initializer:
      _partial_: true
      _target_: training.utils.checkpoint_utils.load_state_dict_into_model
      strict: true
      ignore_unexpected_keys: null
      ignore_missing_keys: null
      state_dict:
        _target_: training.utils.checkpoint_utils.load_checkpoint_and_apply_kernels
        checkpoint_path: /home/hossein/hossein/projects/sam2/checkpoints/sam2.1_hiera_large.pt
        ckpt_state_dict_keys:
          - model

launcher:
  num_nodes: 1
  gpus_per_node: 4
  experiment_log_dir: /ephemeral/hossein/output/sam2

submitit:
  partition: null
  account: null
  qos: null
  cpus_per_task: 10
  use_cluster: false
  timeout_hour: 24
  name: null
  port_range:
    - 10000
    - 65000
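
# Usage note (assumption, not part of the config above): a resolved config in this
# form is consumed by the SAM 2 training entry point, with each `_target_` entry
# instantiated by the trainer. Based on the public facebookresearch/sam2 training
# README, a local single-node launch looks roughly like the sketch below; the
# script path and flags come from that README and may differ in your checkout:
#
#   python training/train.py \
#       -c <path/to/this_config>.yaml \
#       --use-cluster 0 \
#       --num-gpus 4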