File size: 4,320 Bytes
ff495b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
{
  "architectures": [
    "InternVideo2_CLIP_small"
  ],
  "auto_map": {
    "AutoConfig": "config.InternVideo2Config",
    "AutoModel": "modeling_internvideo2encoder.InternVideo2_CLIP_small"
  },
  "auto_resume": false,
  "batch_size": 64,
  "batch_size_test": 4,
  "best_key": [
    "msrvtt_1k_test_match",
    "t2v_r1"
  ],
  "compile_model": false,
  "criterion": {
    "clip_loss_ratio": [
      1.0,
      1.0
    ],
    "distill_final_features": true,
    "loss_weight": {
      "mlm": 1.0,
      "mvm": 0.0,
      "uta": 0.0,
      "vtc": 1.0,
      "vtm": 1.0
    },
    "mlm_masking_prob": 0.5,
    "vtm_hard_neg": true
  },
  "debug": false,
  "deep_fusion": false,
  "deepspeed": {
    "enable": true,
    "stage": 1
  },
  "delete_ds_optim_states": true,
  "device": "cuda",
  "dist_url": "env://",
  "evaluate": false,
  "evaluation": {
    "eval_frame_ensemble": "concat",
    "eval_offload": true,
    "eval_x_only": false,
    "k_test": 128
  },
  "gradient_checkpointing": true,
  "inputs": {
    "batch_size": {
      "image": 64,
      "video": 64
    },
    "batch_size_test": {
      "image": 4,
      "video": 4
    },
    "image_res": 224,
    "max_txt_l": {
      "image": 32,
      "video": 32
    },
    "video_input": {
      "num_frames": 8,
      "num_frames_test": 8,
      "random_aug": false,
      "sample_type": "middle",
      "sample_type_test": "middle"
    }
  },
  "jump_evaluate": false,
  "log_freq": 100,
  "max_txt_l": 32,
  "mode": "pt",
  "model": {
    "embed_dim": 1024,
    "find_unused_parameters": false,
    "freeze_text": true,
    "freeze_vision": true,
    "load_vision_ckpt_from_internvideo2_stage2": false,
    "model_cls": "InternVideo2_CLIP_small",
    "multimodal": {
      "enable": true
    },
    "open_text_projection": false,
    "open_vision_clip_projector": true,
    "temp": 0.01,
    "temp_min": 0.01,
    "text_encoder": {
      "embed_dim": 512,
      "image_cfg": {
        "image_size": 224,
        "model_name": "vit_b16"
      },
      "text_cfg": {
        "causal_masking": true,
        "context_length": 77,
        "dim": 512,
        "ffn_multiplier_per_layer": 4.0,
        "model_name": "base",
        "n_heads_per_layer": 8,
        "n_transformer_layers": 12,
        "norm_layer": "layer_norm_fp32",
        "vocab_size": 49408
      }
    },
    "vision_encoder": {
      "align_dim": 512,
      "attn_pool_num_heads": 16,
      "checkpoint_num": 0,
      "clip_embed_dim": 768,
      "depth": 24,
      "drop_cls_token": false,
      "drop_path_rate": 0.0,
      "embed_dim": 1024,
      "fused_mlp_heuristic": 1,
      "head_drop_path_rate": 0.0,
      "img_size": 224,
      "in_chans": 3,
      "init_values": 0.1,
      "layerscale_no_force_fp32": true,
      "mlp_ratio": 4,
      "name": "internvideo2_1B",
      "num_frames": 8,
      "num_heads": 16,
      "patch_size": 14,
      "qk_normalization": true,
      "qkv_bias": false,
      "sep_pos_embed": false,
      "tubelet_size": 1,
      "use_checkpoint": false,
      "use_flash_attn": false,
      "use_fused_mlp": false,
      "use_fused_rmsnorm": false
    }
  },
  "model_type": "internvideo2",
  "num_frames": 8,
  "num_frames_test": 8,
  "num_workers": 6,
  "optimizer": {
    "different_lr": {
      "enable": false,
      "lr": 0.001,
      "module_names": []
    },
    "lr": 5e-05,
    "max_grad_norm": 3.0,
    "opt": "adamW",
    "opt_betas": [
      0.9,
      0.98
    ],
    "weight_decay": 0.05
  },
  "output_dir": null,
  "pretrained_path": "",
  "resume": false,
  "save_ckpt_iter": null,
  "save_latest": true,
  "scheduler": {
    "epochs": 10,
    "min_lr_multi": 0.01,
    "sched": "cosine",
    "warmup_epochs": 1
  },
  "seed": 42,
  "test_file": {
    "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]",
    "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]"
  },
  "test_types": [
    "msrvtt_1k_test",
    "didemo_ret_test"
  ],
  "text_enc": "bert_large",
  "tokenizer": null,
  "torch_dtype": "float16",
  "train_file": "available_corpus[\"pretrain_example_data_1B\"]",
  "transformers_version": "4.51.3",
  "use_bf16": true,
  "use_flash_sdp": false,
  "use_half_precision": false,
  "use_mem_efficient_sdp": false,
  "wandb": {
    "enable": false,
    "entity": "opengvlab",
    "project": "InternVideo2-Stage2"
  }
}