|
|
import verifiers as vf |
|
|
|
|
|
""" |
|
|
# install |
|
|
vf-install complex-json-output (-p /path/to/environments) |
|
|
|
|
|
# quick eval |
|
|
vf-eval complex-json-output (-m model_name in endpoints.py) |
|
|
|
|
|
inference: |
|
|
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 vf-vllm --model Qwen/Qwen2.5-1.5B-Instruct \ |
|
|
--data-parallel-size 6 --enforce-eager --disable-log-requests |
|
|
|
|
|
training: |
|
|
CUDA_VISIBLE_DEVICES=6,7 accelerate launch --num-processes 2 \ |
|
|
--config-file configs/zero3.yaml examples/grpo/train_complex_json_output.py |
|
|
""" |
|
|
|
|
|
|
|
|
# Reference checklist of the hyperparameters this script overrides on
# `training_args` below. NOTE(review): not read anywhere in this file —
# presumably kept for documentation/sweep tooling; confirm before removing.
_BATCHING_HPARAMS = [
    "per_device_train_batch_size",
    "num_generations",
    "gradient_accumulation_steps",
]
_LENGTH_AND_SAMPLING_HPARAMS = [
    "max_tokens",
    "max_seq_len",
    "max_prompt_length",
    "max_completion_length",
    "temperature",
]
_SCHEDULE_HPARAMS = [
    "learning_rate",
    "max_steps",
    "warmup_steps",
    "eval_steps",
    "save_steps",
]
_GRPO_HPARAMS = [
    "beta",
    "loss_type",
]

HPARAMS = (
    _BATCHING_HPARAMS
    + _LENGTH_AND_SAMPLING_HPARAMS
    + _SCHEDULE_HPARAMS
    + _GRPO_HPARAMS
)
|
|
|
|
|
|
|
|
# Build the RL environment: 8k training prompts, 50 held-out eval prompts.
_env_kwargs = {
    "num_train_examples": 8000,
    "num_eval_examples": 50,
}
vf_env = vf.load_environment(env_id="complex-json-output", **_env_kwargs)
|
|
|
|
|
|
|
|
model_name = "/raid/workspace/Mango/verifiers/MS3.2-0.35-Beta" |
|
|
run_name = "complex-json-grpo_" + model_name.split("/")[-1].lower() |
|
|
|
|
|
|
|
|
# Load the policy model and its tokenizer from the local checkpoint.
model, tokenizer = vf.get_model_and_tokenizer(model_name)


# Start from the library's GRPO defaults; individual fields are
# overridden below.
training_args = vf.grpo_defaults(run_name=run_name)
|
|
|
|
|
|
|
|
# --- batching / rollouts -------------------------------------------------
# NOTE(review): with 2 training processes (see usage notes) the global
# batch is per_device_train_batch_size * 2 * gradient_accumulation_steps;
# confirm it is compatible with num_generations=16 completions per prompt.
training_args.per_device_train_batch_size = 2
training_args.num_generations = 16
training_args.gradient_accumulation_steps = 2

# --- sequence lengths & sampling ----------------------------------------
training_args.max_tokens = 2048  # presumably per-turn sampling cap — confirm
training_args.max_seq_len = 16000  # >= prompt (8192) + completion (4096) budget
training_args.max_prompt_length = 8192
training_args.max_completion_length = 4096
training_args.temperature = 0.1  # near-greedy sampling

# --- optimization schedule ----------------------------------------------
training_args.learning_rate = 5e-6
training_args.max_steps = 1000
training_args.warmup_steps = 15

# --- evaluation ----------------------------------------------------------
# FIX: HF IntervalStrategy members are "no" / "steps" / "epoch"; the
# previous value "none" is not a member. Assigned after construction it
# bypasses __post_init__ validation, and since "none" != IntervalStrategy.NO
# downstream checks would treat evaluation as enabled. Use "no" to actually
# disable eval.
training_args.eval_strategy = "no"
# Inert while eval is disabled; kept so flipping eval_strategy to "steps"
# needs no further edits.
training_args.eval_steps = 50
training_args.per_device_eval_batch_size = 8

# --- checkpointing -------------------------------------------------------
training_args.save_strategy = "steps"
training_args.save_steps = 100

# --- GRPO-specific -------------------------------------------------------
training_args.beta = 0.001  # presumably the KL regularization weight — confirm
training_args.loss_type = "dr_grpo"

# --- logging -------------------------------------------------------------
training_args.logging_steps = 1
training_args.log_completions = True
training_args.num_completions_to_print = 3
training_args.report_to = "wandb"
|
|
|
|
|
|
|
|
# Assemble the GRPO trainer with a small LoRA adapter and launch training.
_lora_config = vf.lora_defaults(r=8, alpha=16)
trainer = vf.GRPOTrainer(
    env=vf_env,
    model=model,
    processing_class=tokenizer,
    args=training_args,
    peft_config=_lora_config,
)

trainer.train()
|
|
|