{
  "architectures": [
    "InternVideo2_CLIP_small"
  ],
  "auto_map": {
    "AutoConfig": "config.InternVideo2Config",
    "AutoModel": "modeling_internvideo2encoder.InternVideo2_CLIP_small"
  },
  "auto_resume": false,
  "batch_size": 64,
  "batch_size_test": 4,
  "best_key": [
    "msrvtt_1k_test_match",
    "t2v_r1"
  ],
  "compile_model": false,
  "criterion": {
    "clip_loss_ratio": [
      1.0,
      1.0
    ],
    "distill_final_features": true,
    "loss_weight": {
      "mlm": 1.0,
      "mvm": 0.0,
      "uta": 0.0,
      "vtc": 1.0,
      "vtm": 1.0
    },
    "mlm_masking_prob": 0.5,
    "vtm_hard_neg": true
  },
  "debug": false,
  "deep_fusion": false,
  "deepspeed": {
    "enable": true,
    "stage": 1
  },
  "delete_ds_optim_states": true,
  "device": "cuda",
  "dist_url": "env://",
  "evaluate": false,
  "evaluation": {
    "eval_frame_ensemble": "concat",
    "eval_offload": true,
    "eval_x_only": false,
    "k_test": 128
  },
  "gradient_checkpointing": true,
  "inputs": {
    "batch_size": {
      "image": 64,
      "video": 64
    },
    "batch_size_test": {
      "image": 4,
      "video": 4
    },
    "image_res": 224,
    "max_txt_l": {
      "image": 32,
      "video": 32
    },
    "video_input": {
      "num_frames": 8,
      "num_frames_test": 8,
      "random_aug": false,
      "sample_type": "middle",
      "sample_type_test": "middle"
    }
  },
  "jump_evaluate": false,
  "log_freq": 100,
  "max_txt_l": 32,
  "mode": "pt",
  "model": {
    "embed_dim": 1024,
    "find_unused_parameters": false,
    "freeze_text": true,
    "freeze_vision": true,
    "load_vision_ckpt_from_internvideo2_stage2": false,
    "model_cls": "InternVideo2_CLIP_small",
    "multimodal": {
      "enable": true
    },
    "open_text_projection": false,
    "open_vision_clip_projector": true,
    "temp": 0.01,
    "temp_min": 0.01,
    "text_encoder": {
      "embed_dim": 512,
      "image_cfg": {
        "image_size": 224,
        "model_name": "vit_b16"
      },
      "text_cfg": {
        "causal_masking": true,
        "context_length": 77,
        "dim": 512,
        "ffn_multiplier_per_layer": 4.0,
        "model_name": "base",
        "n_heads_per_layer": 8,
        "n_transformer_layers": 12,
        "norm_layer": "layer_norm_fp32",
        "vocab_size": 49408
      }
    },
    "vision_encoder": {
      "align_dim": 512,
      "attn_pool_num_heads": 16,
      "checkpoint_num": 0,
      "clip_embed_dim": 768,
      "depth": 24,
      "drop_cls_token": false,
      "drop_path_rate": 0.0,
      "embed_dim": 1024,
      "fused_mlp_heuristic": 1,
      "head_drop_path_rate": 0.0,
      "img_size": 224,
      "in_chans": 3,
      "init_values": 0.1,
      "layerscale_no_force_fp32": true,
      "mlp_ratio": 4,
      "name": "internvideo2_1B",
      "num_frames": 8,
      "num_heads": 16,
      "patch_size": 14,
      "qk_normalization": true,
      "qkv_bias": false,
      "sep_pos_embed": false,
      "tubelet_size": 1,
      "use_checkpoint": false,
      "use_flash_attn": false,
      "use_fused_mlp": false,
      "use_fused_rmsnorm": false
    }
  },
  "model_type": "internvideo2",
  "num_frames": 8,
  "num_frames_test": 8,
  "num_workers": 6,
  "optimizer": {
    "different_lr": {
      "enable": false,
      "lr": 0.001,
      "module_names": []
    },
    "lr": 5e-05,
    "max_grad_norm": 3.0,
    "opt": "adamW",
    "opt_betas": [
      0.9,
      0.98
    ],
    "weight_decay": 0.05
  },
  "output_dir": null,
  "pretrained_path": "",
  "resume": false,
  "save_ckpt_iter": null,
  "save_latest": true,
  "scheduler": {
    "epochs": 10,
    "min_lr_multi": 0.01,
    "sched": "cosine",
    "warmup_epochs": 1
  },
  "seed": 42,
  "test_file": {
    "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]",
    "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]"
  },
  "test_types": [
    "msrvtt_1k_test",
    "didemo_ret_test"
  ],
  "text_enc": "bert_large",
  "tokenizer": null,
  "torch_dtype": "float16",
  "train_file": "available_corpus[\"pretrain_example_data_1B\"]",
  "transformers_version": "4.51.3",
  "use_bf16": true,
  "use_flash_sdp": false,
  "use_half_precision": false,
  "use_mem_efficient_sdp": false,
  "wandb": {
    "enable": false,
    "entity": "opengvlab",
    "project": "InternVideo2-Stage2"
  }
}