{
  "_name_or_path": "trained_models/t5_qanom-joint-23.03.22",
  "append_verb_form": true,
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "debug_mode": false,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "description": "optimal joint config from sweep, mainly for qanom",
  "dir_switch": "joint_optimal",
  "do_eval_on": "validation",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "eval_steps": 500,
  "evaluation_strategy": "steps",
  "feed_forward_proj": "relu",
  "fp16": true,
  "gradient_accumulation_steps": 14,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "learning_rate": 0.001,
  "load_best_model_at_end": true,
  "logging_steps": 500,
  "logging_strategy": "steps",
  "metric_for_best_model": "eval_loss",
  "model_type": "t5",
  "n_positions": 512,
  "num_beams": 5,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "overwrite_output_dir": true,
  "pad_token_id": 0,
  "per_device_eval_batch_size": 12,
  "per_device_train_batch_size": 12,
  "predicate_marker_type": "generic",
  "predict_with_generate": true,
  "preprocess_input_func": "input_predicate_marker",
  "preprocessing_kwargs": {
    "append_verb_form": true,
    "debug_mode": false,
    "description": "optimal joint config from sweep, mainly for qanom",
    "dir_switch": "joint_optimal",
    "do_eval_on": "validation",
    "dropout_rate": 0.1,
    "eval_steps": 500,
    "evaluation_strategy": "steps",
    "fp16": true,
    "gradient_accumulation_steps": 14,
    "learning_rate": 0.001,
    "load_best_model_at_end": true,
    "logging_steps": 500,
    "logging_strategy": "steps",
    "metric_for_best_model": "eval_loss",
    "model_type": "t5",
    "num_beams": 5,
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 12,
    "per_device_train_batch_size": 12,
    "predicate_marker_type": "generic",
    "predict_with_generate": true,
    "preprocess_input_func": "input_predicate_marker",
    "qanom_joint_factor": 14,
    "save_steps": 500,
    "save_strategy": "steps",
    "seed": 44,
    "source_prefix": "parse: ",
    "train_dataset": "joint_qanom",
    "train_epochs": 20,
    "use_bilateral_predicate_marker": true
  },
  "qanom_joint_factor": 14,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "save_steps": 500,
  "save_strategy": "steps",
  "seed": 44,
  "source_prefix": "parse: ",
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "torch_dtype": "float32",
  "train_dataset": "joint_qanom",
  "train_epochs": 20,
  "transformers_version": "4.26.1",
  "use_bilateral_predicate_marker": true,
  "use_cache": true,
  "vocab_size": 32101
}