File size: 2,383 Bytes
d1ebe7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Run identity, data/output locations and checkpoint-loading flags.
config: configs/vq_8k_siglip_b_res_p02_pw15_enc.yaml
# Explicit null instead of a bare empty value: it loads identically but makes
# it clear that "unset" is intended rather than a forgotten value.
exp_index: null
# NOTE(review): absolute, machine-specific path — confirm it exists on the
# target host before reusing this file.
data_path: /mnt/bn/cloud-project-lq/code/liuyh/data/vq_data/train
cloud_save_path: experiments/tokenizer
no_local_save: true
vq_model: VQ-16
# Explicit null: no checkpoint path is given in this file.
vq_ckpt: null
finetune: false
finetune_decoder: false
model_weight_strict: true
ema: false
# Codebook shape and quantization / reconstruction loss weights.
codebook_size: 8192
codebook_embed_dim: 8
codebook_l2_norm: true
codebook_weight: 1.0
entropy_loss_ratio: 0.0
vq_loss_ratio: 1.0
commit_loss_beta: 0.25
reconstruction_weight: 1.0
reconstruction_loss: l2
# Written with a decimal point in the mantissa: YAML 1.1 loaders (e.g. PyYAML)
# only resolve exponent notation as a float when the mantissa contains a '.';
# the previous spelling `1e-06` is loaded as the *string* "1e-06" by them.
kl_loss_weight: 1.0e-06
tau: 0.07
num_codebooks: 1
# Perceptual-loss settings (all keys prefixed `perceptual_`).
# NOTE(review): `perceptual_dino_variants` is set although both
# `perceptual_loss` and `perceptual_model` are `vgg`; presumably it is only
# read when a DINO-based perceptual model is selected — confirm.
perceptual_weight: 1.0
perceptual_loss: vgg
perceptual_model: vgg
perceptual_dino_variants: depth12_no_train
perceptual_intermediate_loss: false
perceptual_logit_loss: false
perceptual_resize: false
# NOTE(review): unit not stated here (looks like a step count) — confirm.
perceptual_warmup: 10000
# Discriminator / adversarial-loss settings.
disc_weight: 0.2
# NOTE(review): unit not stated here (looks like a step count) — confirm.
disc_start: 40000
disc_dim: 64
disc_type: dino
disc_loss: hinge
gen_loss: hinge
lecam_loss_weight: 0.001
use_diff_aug: true
disc_cr_loss_weight: 4.0
disc_adaptive_weight: false
# General training-loop settings: output dir, dataset, schedule, optimizer,
# batching, logging/checkpoint cadence and numeric precision.
compile: false
dropout_p: 0.0
results_dir: ./logs/task/detailflow_demo_task_256token
dataset: imagenet
image_size: 256
epochs: 250
optimizer: adam
# Mantissa has a decimal point and the exponent is signed, so this resolves
# to a float under both YAML 1.1 and 1.2 loaders.
lr: 1.0e-4
lr_warmup_epochs: 1
lr_scheduler: cosine
weight_decay: 0.0001
beta1: 0.9
beta2: 0.95
max_grad_norm: 1.0
# NOTE(review): presumably the batch size summed over all processes
# (see `world_size`) — confirm against the training script.
global_batch_size: 256
global_seed: 42
num_workers: 16
# NOTE(review): units for the three cadence keys below are not stated here
# (look like step counts) — confirm.
log_every: 50
vis_every: 5000
ckpt_every: 5000
save_epochs: 1
gradient_accumulation_steps: 1
mixed_precision: bf16
# Encoder / decoder backbone selection. Both sides use the same type, model
# name, tuning method and patch size; only the `*_pretrained` flags differ
# (encoder: true, decoder: false).
enc_type: siglip2
dec_type: siglip2
num_latent_tokens: 256
encoder_model: siglip2_base
decoder_model: siglip2_base
encoder_tuning_method: full
decoder_tuning_method: full
encoder_pretrained: true
decoder_pretrained: false
encoder_patch_size: 16
decoder_patch_size: 16
# REPA settings. `repa` is false here.
# NOTE(review): presumably the remaining `repa_*` keys are ignored while
# `repa` is false — confirm against the training script.
repa: false
repa_model: siglip2
repa_patch_size: 16
repa_proj_dim: 1024
repa_loss_weight: 0.5
repa_align: global
repa_layer_indices: 1
# Resume, memory and debug switches.
resume_from_newest_ckpt: true
gradient_checkpointing_encoder: false
gradient_checkpointing_decoder: false
debug_mode: false
# Content-degradation settings.
content_degradation: resolution_power
degradation_prob: 0.2
degradation_loss_res: 224
degradation_power: 1.5
# Causality flags: enabled for the encoder, disabled for the decoder.
causal_encoder: true
causal_decoder: false
# Resolution settings. min and max are both 256 and both resolution
# probabilities are 0.
# NOTE(review): this looks like a fixed-resolution run — confirm how the
# training script interprets these keys.
max_image_size: 256
min_image_size: 256
# Explicit null instead of a bare empty value: it loads identically but makes
# it clear that "unset" is intended.
dynamic_max_image_size: null
dynamic_resolution_prob: 0
max_resolution_prob: 0
adjust_bs_by_resolution: false
group_size: 8
global_token_loss_weight: 1.0
correction_training: true
# Explicit null: no value is given in this file.
causal_num: null
# Distributed-launch values.
# NOTE(review): `rank: 0` / `gpu: 0` next to `world_size: 16` suggests these
# were recorded by a single process at runtime rather than set by hand —
# confirm before reusing this file as an input config.
rank: 0
world_size: 16
gpu: 0
dist_url: env://
distributed: true
dist_backend: nccl