HassanSamo committed
Commit c52239f · 1 Parent(s): 310d25b

initial commit
eval.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
mistral-finetune-own-data.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
train.jsonl ADDED
The diff for this file is too large to render. See raw diff
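Note: train.jsonl and eval.jsonl are too large to render here, so their schema is not visible on this page. A minimal sketch of how such a train/eval JSONL pair might be loaded with the `datasets` library pinned in requirements.txt below (datasets==2.15.0); the split names and the inspected record are illustrative, not taken from the notebook:

```python
# Hypothetical loading sketch: the actual field names inside the JSONL
# files are not shown in this commit, so inspect a record to confirm.
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files={"train": "train.jsonl", "eval": "eval.jsonl"},
)
print(dataset["train"][0])  # reveals the record schema
```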
 
verb-workspace/verb.yaml ADDED
@@ -0,0 +1,18 @@
+ build:
+   system_packages: []
+   python_version: '3.10'
+   cuda: 12.0.1
+   python_packages:
+     - jupyterlab
+   run:
+     - sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended
+ user:
+   shell: zsh
+   authorized_keys_path: /home/ubuntu/.ssh/authorized_keys
+ ports:
+   - '2222:22'
+ services:
+   - name: jupyter
+     entrypoint: jupyter-lab --ip=0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.password=''
+     ports:
+       - '8888'
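This workspace config declares a CUDA 12.0.1 image with Python 3.10, only JupyterLab preinstalled (the rest of the stack appears in requirements.txt below), SSH exposed on host port 2222, and a tokenless JupyterLab service on 8888. A small sketch that loads and sanity-checks the file with the pyyaml pinned later in requirements.txt; the nesting it asserts follows the indentation reconstructed above, which is an assumption:

```python
# Assumes build-time keys (python_version, cuda, run, ...) nest under
# "build:" as reconstructed above; adjust if the raw file differs.
import yaml

with open("verb-workspace/verb.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["build"]["python_version"] == "3.10"
print(cfg["services"][0]["entrypoint"])  # the JupyterLab service command
```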
wandb/debug-internal.log ADDED
@@ -0,0 +1 @@
+ run-20231119_155754-znnni89w/logs/debug-internal.log
wandb/debug.log ADDED
@@ -0,0 +1 @@
+ run-20231119_155754-znnni89w/logs/debug.log
wandb/latest-run ADDED
@@ -0,0 +1 @@
+ run-20231119_155754-znnni89w
wandb/run-20231119_155754-znnni89w/files/config.yaml ADDED
@@ -0,0 +1,663 @@
+ wandb_version: 1
+
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.10.12
+     cli_version: 0.16.0
+     framework: huggingface
+     huggingface_version: 4.36.0.dev0
+     is_jupyter_run: true
+     is_kaggle_kernel: false
+     start_time: 1700409474.807221
+     t:
+       1:
+       - 1
+       - 11
+       - 49
+       - 51
+       - 55
+       - 71
+       - 98
+       2:
+       - 1
+       - 11
+       - 49
+       - 51
+       - 55
+       - 71
+       - 98
+       3:
+       - 7
+       - 13
+       - 23
+       4: 3.10.12
+       5: 0.16.0
+       6: 4.36.0.dev0
+       8:
+       - 1
+       - 5
+       9:
+         1: transformers_trainer
+       13: linux-x86_64
+     m:
+     - 1: train/global_step
+       6:
+       - 3
+     - 1: train/loss
+       5: 1
+       6:
+       - 1
+     - 1: train/learning_rate
+       5: 1
+       6:
+       - 1
+     - 1: train/epoch
+       5: 1
+       6:
+       - 1
+     - 1: eval/loss
+       5: 1
+       6:
+       - 1
+     - 1: eval/runtime
+       5: 1
+       6:
+       - 1
+     - 1: eval/samples_per_second
+       5: 1
+       6:
+       - 1
+     - 1: eval/steps_per_second
+       5: 1
+       6:
+       - 1
+ vocab_size:
+   desc: null
+   value: 32000
+ max_position_embeddings:
+   desc: null
+   value: 32768
+ hidden_size:
+   desc: null
+   value: 4096
+ intermediate_size:
+   desc: null
+   value: 14336
+ num_hidden_layers:
+   desc: null
+   value: 32
+ num_attention_heads:
+   desc: null
+   value: 32
+ num_key_value_heads:
+   desc: null
+   value: 8
+ hidden_act:
+   desc: null
+   value: silu
+ initializer_range:
+   desc: null
+   value: 0.02
+ rms_norm_eps:
+   desc: null
+   value: 1.0e-05
+ pretraining_tp:
+   desc: null
+   value: 1
+ use_cache:
+   desc: null
+   value: false
+ rope_theta:
+   desc: null
+   value: 10000.0
+ rope_scaling:
+   desc: null
+   value: null
+ attention_bias:
+   desc: null
+   value: false
+ attention_dropout:
+   desc: null
+   value: 0.0
+ return_dict:
+   desc: null
+   value: true
+ output_hidden_states:
+   desc: null
+   value: false
+ output_attentions:
+   desc: null
+   value: false
+ torchscript:
+   desc: null
+   value: false
+ torch_dtype:
+   desc: null
+   value: float16
+ use_bfloat16:
+   desc: null
+   value: false
+ tf_legacy_loss:
+   desc: null
+   value: false
+ pruned_heads:
+   desc: null
+   value: {}
+ tie_word_embeddings:
+   desc: null
+   value: false
+ is_encoder_decoder:
+   desc: null
+   value: false
+ is_decoder:
+   desc: null
+   value: false
+ cross_attention_hidden_size:
+   desc: null
+   value: null
+ add_cross_attention:
+   desc: null
+   value: false
+ tie_encoder_decoder:
+   desc: null
+   value: false
+ max_length:
+   desc: null
+   value: 20
+ min_length:
+   desc: null
+   value: 0
+ do_sample:
+   desc: null
+   value: false
+ early_stopping:
+   desc: null
+   value: false
+ num_beams:
+   desc: null
+   value: 1
+ num_beam_groups:
+   desc: null
+   value: 1
+ diversity_penalty:
+   desc: null
+   value: 0.0
+ temperature:
+   desc: null
+   value: 1.0
+ top_k:
+   desc: null
+   value: 50
+ top_p:
+   desc: null
+   value: 1.0
+ typical_p:
+   desc: null
+   value: 1.0
+ repetition_penalty:
+   desc: null
+   value: 1.0
+ length_penalty:
+   desc: null
+   value: 1.0
+ no_repeat_ngram_size:
+   desc: null
+   value: 0
+ encoder_no_repeat_ngram_size:
+   desc: null
+   value: 0
+ bad_words_ids:
+   desc: null
+   value: null
+ num_return_sequences:
+   desc: null
+   value: 1
+ chunk_size_feed_forward:
+   desc: null
+   value: 0
+ output_scores:
+   desc: null
+   value: false
+ return_dict_in_generate:
+   desc: null
+   value: false
+ forced_bos_token_id:
+   desc: null
+   value: null
+ forced_eos_token_id:
+   desc: null
+   value: null
+ remove_invalid_values:
+   desc: null
+   value: false
+ exponential_decay_length_penalty:
+   desc: null
+   value: null
+ suppress_tokens:
+   desc: null
+   value: null
+ begin_suppress_tokens:
+   desc: null
+   value: null
+ architectures:
+   desc: null
+   value:
+   - LlamaForCausalLM
+ finetuning_task:
+   desc: null
+   value: null
+ id2label:
+   desc: null
+   value:
+     '0': LABEL_0
+     '1': LABEL_1
+ label2id:
+   desc: null
+   value:
+     LABEL_0: 0
+     LABEL_1: 1
+ tokenizer_class:
+   desc: null
+   value: null
+ prefix:
+   desc: null
+   value: null
+ bos_token_id:
+   desc: null
+   value: 1
+ pad_token_id:
+   desc: null
+   value: null
+ eos_token_id:
+   desc: null
+   value: 2
+ sep_token_id:
+   desc: null
+   value: null
+ decoder_start_token_id:
+   desc: null
+   value: null
+ task_specific_params:
+   desc: null
+   value: null
+ problem_type:
+   desc: null
+   value: null
+ _name_or_path:
+   desc: null
+   value: filipealmeida/Mistral-7B-v0.1-sharded
+ transformers_version:
+   desc: null
+   value: 4.36.0.dev0
+ model_type:
+   desc: null
+   value: llama
+ sliding_window:
+   desc: null
+   value: 4096
+ quantization_config:
+   desc: null
+   value:
+     quant_method: QuantizationMethod.BITS_AND_BYTES
+     load_in_8bit: false
+     load_in_4bit: true
+     llm_int8_threshold: 6.0
+     llm_int8_skip_modules: null
+     llm_int8_enable_fp32_cpu_offload: false
+     llm_int8_has_fp16_weight: false
+     bnb_4bit_quant_type: nf4
+     bnb_4bit_use_double_quant: true
+     bnb_4bit_compute_dtype: bfloat16
+ output_dir:
+   desc: null
+   value: ./mistral-finetune
+ overwrite_output_dir:
+   desc: null
+   value: false
+ do_train:
+   desc: null
+   value: false
+ do_eval:
+   desc: null
+   value: true
+ do_predict:
+   desc: null
+   value: false
+ evaluation_strategy:
+   desc: null
+   value: steps
+ prediction_loss_only:
+   desc: null
+   value: false
+ per_device_train_batch_size:
+   desc: null
+   value: 2
+ per_device_eval_batch_size:
+   desc: null
+   value: 8
+ per_gpu_train_batch_size:
+   desc: null
+   value: null
+ per_gpu_eval_batch_size:
+   desc: null
+   value: null
+ gradient_accumulation_steps:
+   desc: null
+   value: 1
+ eval_accumulation_steps:
+   desc: null
+   value: null
+ eval_delay:
+   desc: null
+   value: 0
+ learning_rate:
+   desc: null
+   value: 2.5e-05
+ weight_decay:
+   desc: null
+   value: 0.0
+ adam_beta1:
+   desc: null
+   value: 0.9
+ adam_beta2:
+   desc: null
+   value: 0.999
+ adam_epsilon:
+   desc: null
+   value: 1.0e-08
+ max_grad_norm:
+   desc: null
+   value: 1.0
+ num_train_epochs:
+   desc: null
+   value: 3.0
+ max_steps:
+   desc: null
+   value: 500
+ lr_scheduler_type:
+   desc: null
+   value: linear
+ lr_scheduler_kwargs:
+   desc: null
+   value: {}
+ warmup_ratio:
+   desc: null
+   value: 0.0
+ warmup_steps:
+   desc: null
+   value: 1
+ log_level:
+   desc: null
+   value: passive
+ log_level_replica:
+   desc: null
+   value: warning
+ log_on_each_node:
+   desc: null
+   value: true
+ logging_dir:
+   desc: null
+   value: ./logs
+ logging_strategy:
+   desc: null
+   value: steps
+ logging_first_step:
+   desc: null
+   value: false
+ logging_steps:
+   desc: null
+   value: 10
+ logging_nan_inf_filter:
+   desc: null
+   value: true
+ save_strategy:
+   desc: null
+   value: steps
+ save_steps:
+   desc: null
+   value: 10
+ save_total_limit:
+   desc: null
+   value: null
+ save_safetensors:
+   desc: null
+   value: true
+ save_on_each_node:
+   desc: null
+   value: false
+ no_cuda:
+   desc: null
+   value: false
+ use_cpu:
+   desc: null
+   value: false
+ use_mps_device:
+   desc: null
+   value: false
+ seed:
+   desc: null
+   value: 42
+ data_seed:
+   desc: null
+   value: null
+ jit_mode_eval:
+   desc: null
+   value: false
+ use_ipex:
+   desc: null
+   value: false
+ bf16:
+   desc: null
+   value: true
+ fp16:
+   desc: null
+   value: false
+ fp16_opt_level:
+   desc: null
+   value: O1
+ half_precision_backend:
+   desc: null
+   value: auto
+ bf16_full_eval:
+   desc: null
+   value: false
+ fp16_full_eval:
+   desc: null
+   value: false
+ tf32:
+   desc: null
+   value: null
+ local_rank:
+   desc: null
+   value: 0
+ ddp_backend:
+   desc: null
+   value: null
+ tpu_num_cores:
+   desc: null
+   value: null
+ tpu_metrics_debug:
+   desc: null
+   value: false
+ debug:
+   desc: null
+   value: []
+ dataloader_drop_last:
+   desc: null
+   value: false
+ eval_steps:
+   desc: null
+   value: 10
+ dataloader_num_workers:
+   desc: null
+   value: 0
+ past_index:
+   desc: null
+   value: -1
+ run_name:
+   desc: null
+   value: mistral-finetune-2023-11-19-16-01
+ disable_tqdm:
+   desc: null
+   value: false
+ remove_unused_columns:
+   desc: null
+   value: true
+ label_names:
+   desc: null
+   value: null
+ load_best_model_at_end:
+   desc: null
+   value: false
+ metric_for_best_model:
+   desc: null
+   value: null
+ greater_is_better:
+   desc: null
+   value: null
+ ignore_data_skip:
+   desc: null
+   value: false
+ fsdp:
+   desc: null
+   value: []
+ fsdp_min_num_params:
+   desc: null
+   value: 0
+ fsdp_config:
+   desc: null
+   value:
+     min_num_params: 0
+     xla: false
+     xla_fsdp_grad_ckpt: false
+ fsdp_transformer_layer_cls_to_wrap:
+   desc: null
+   value: null
+ deepspeed:
+   desc: null
+   value: null
+ label_smoothing_factor:
+   desc: null
+   value: 0.0
+ optim:
+   desc: null
+   value: paged_adamw_8bit
+ optim_args:
+   desc: null
+   value: null
+ adafactor:
+   desc: null
+   value: false
+ group_by_length:
+   desc: null
+   value: false
+ length_column_name:
+   desc: null
+   value: length
+ report_to:
+   desc: null
+   value:
+   - wandb
+ ddp_find_unused_parameters:
+   desc: null
+   value: null
+ ddp_bucket_cap_mb:
+   desc: null
+   value: null
+ ddp_broadcast_buffers:
+   desc: null
+   value: null
+ dataloader_pin_memory:
+   desc: null
+   value: true
+ skip_memory_metrics:
+   desc: null
+   value: true
+ use_legacy_prediction_loop:
+   desc: null
+   value: false
+ push_to_hub:
+   desc: null
+   value: false
+ resume_from_checkpoint:
+   desc: null
+   value: null
+ hub_model_id:
+   desc: null
+   value: null
+ hub_strategy:
+   desc: null
+   value: every_save
+ hub_token:
+   desc: null
+   value: <HUB_TOKEN>
+ hub_private_repo:
+   desc: null
+   value: false
+ hub_always_push:
+   desc: null
+   value: false
+ gradient_checkpointing:
+   desc: null
+   value: true
+ gradient_checkpointing_kwargs:
+   desc: null
+   value: null
+ include_inputs_for_metrics:
+   desc: null
+   value: false
+ fp16_backend:
+   desc: null
+   value: auto
+ push_to_hub_model_id:
+   desc: null
+   value: null
+ push_to_hub_organization:
+   desc: null
+   value: null
+ push_to_hub_token:
+   desc: null
+   value: <PUSH_TO_HUB_TOKEN>
+ mp_parameters:
+   desc: null
+   value: ''
+ auto_find_batch_size:
+   desc: null
+   value: false
+ full_determinism:
+   desc: null
+   value: false
+ torchdynamo:
+   desc: null
+   value: null
+ ray_scope:
+   desc: null
+   value: last
+ ddp_timeout:
+   desc: null
+   value: 1800
+ torch_compile:
+   desc: null
+   value: false
+ torch_compile_backend:
+   desc: null
+   value: null
+ torch_compile_mode:
+   desc: null
+   value: null
+ dispatch_batches:
+   desc: null
+   value: null
+ split_batches:
+   desc: null
+   value: false
+ include_tokens_per_second:
+   desc: null
+   value: false
+ include_num_input_tokens_seen:
+   desc: null
+   value: false
+ neftune_noise_alpha:
+   desc: null
+   value: null
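The quantization_config logged above (load_in_4bit, nf4, double quantization, bfloat16 compute) corresponds to the BitsAndBytesConfig options in transformers, and _name_or_path records the sharded checkpoint that was loaded. A minimal sketch reconstructing those logged values; this mirrors the recorded config rather than the notebook's exact cell:

```python
# Each kwarg below maps to a field recorded in quantization_config above.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load_in_4bit: true
    bnb_4bit_quant_type="nf4",              # bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant=True,         # bnb_4bit_use_double_quant: true
    bnb_4bit_compute_dtype=torch.bfloat16,  # bnb_4bit_compute_dtype: bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    "filipealmeida/Mistral-7B-v0.1-sharded",  # _name_or_path above
    quantization_config=bnb_config,
    device_map="auto",  # assumption: typical for 4-bit loading on one GPU
)
```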
wandb/run-20231119_155754-znnni89w/files/output.log ADDED
@@ -0,0 +1,30 @@
+
+ You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
+ warnings.warn(
+ /home/ubuntu/.pyenv/versions/3.10.13/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:472: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.
+ warnings.warn(
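The repeated UserWarning above comes from gradient checkpointing calling torch.utils.checkpoint without an explicit use_reentrant flag; the run's config records gradient_checkpointing_kwargs: null, so the default was used. A hedged sketch of silencing it via the TrainingArguments field this transformers version exposes (the output_dir value mirrors the logged config; other arguments are omitted):

```python
# Passing use_reentrant explicitly, as the warning requests.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./mistral-finetune",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
```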
wandb/run-20231119_155754-znnni89w/files/requirements.txt ADDED
@@ -0,0 +1,156 @@
+ accelerate==0.25.0.dev0
+ aiohttp==3.9.0
+ aiosignal==1.3.1
+ anyio==4.0.0
+ appdirs==1.4.4
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ async-timeout==4.0.3
+ attrs==23.1.0
+ babel==2.13.1
+ beautifulsoup4==4.12.2
+ bitsandbytes==0.41.2.post2
+ bleach==6.1.0
+ certifi==2023.11.17
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ comm==0.2.0
+ contourpy==1.2.0
+ cycler==0.12.1
+ datasets==2.15.0
+ debugpy==1.8.0
+ decorator==5.1.1
+ defusedxml==0.7.1
+ dill==0.3.7
+ docker-pycreds==0.4.0
+ exceptiongroup==1.1.3
+ executing==2.0.1
+ fastjsonschema==2.19.0
+ filelock==3.13.1
+ fonttools==4.44.3
+ fqdn==1.5.1
+ frozenlist==1.4.0
+ fsspec==2023.10.0
+ gitdb==4.0.11
+ gitpython==3.1.40
+ huggingface-hub==0.19.4
+ idna==3.4
+ ipykernel==6.26.0
+ ipython==8.17.2
+ ipywidgets==8.1.1
+ isoduration==20.11.0
+ jedi==0.19.1
+ jinja2==3.1.2
+ json5==0.9.14
+ jsonpointer==2.4
+ jsonschema-specifications==2023.11.1
+ jsonschema==4.20.0
+ jupyter-client==8.6.0
+ jupyter-core==5.5.0
+ jupyter-events==0.9.0
+ jupyter-lsp==2.2.0
+ jupyter-server-terminals==0.4.4
+ jupyter-server==2.10.1
+ jupyterlab-pygments==0.2.2
+ jupyterlab-server==2.25.2
+ jupyterlab-widgets==3.0.9
+ jupyterlab==4.0.9
+ kiwisolver==1.4.5
+ markupsafe==2.1.3
+ matplotlib-inline==0.1.6
+ matplotlib==3.8.2
+ mistune==3.0.2
+ mpmath==1.3.0
+ multidict==6.0.4
+ multiprocess==0.70.15
+ nbclient==0.9.0
+ nbconvert==7.11.0
+ nbformat==5.9.2
+ nest-asyncio==1.5.8
+ networkx==3.2.1
+ notebook-shim==0.2.3
+ numpy==1.26.2
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.18.1
+ nvidia-nvjitlink-cu12==12.3.101
+ nvidia-nvtx-cu12==12.1.105
+ overrides==7.4.0
+ packaging==23.2
+ pandas==2.1.3
+ pandocfilters==1.5.0
+ parso==0.8.3
+ peft==0.6.3.dev0
+ pexpect==4.8.0
+ pillow==10.1.0
+ pip==23.0.1
+ platformdirs==4.0.0
+ prometheus-client==0.18.0
+ prompt-toolkit==3.0.41
+ protobuf==4.25.1
+ psutil==5.9.6
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow-hotfix==0.5
+ pyarrow==14.0.1
+ pycparser==2.21
+ pygments==2.17.1
+ pyparsing==3.1.1
+ python-dateutil==2.8.2
+ python-json-logger==2.0.7
+ pytz==2023.3.post1
+ pyyaml==6.0.1
+ pyzmq==25.1.1
+ referencing==0.31.0
+ regex==2023.10.3
+ requests==2.31.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rpds-py==0.13.0
+ safetensors==0.4.0
+ scipy==1.11.4
+ send2trash==1.8.2
+ sentry-sdk==1.35.0
+ setproctitle==1.3.3
+ setuptools==65.5.0
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.0
+ soupsieve==2.5
+ stack-data==0.6.3
+ sympy==1.12
+ terminado==0.18.0
+ tinycss2==1.2.1
+ tokenizers==0.15.0
+ tomli==2.0.1
+ torch==2.1.1
+ tornado==6.3.3
+ tqdm==4.66.1
+ traitlets==5.13.0
+ transformers==4.36.0.dev0
+ triton==2.1.0
+ types-python-dateutil==2.8.19.14
+ typing-extensions==4.8.0
+ tzdata==2023.3
+ uri-template==1.3.0
+ urllib3==2.1.0
+ wandb==0.16.0
+ wcwidth==0.2.10
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.6.4
+ wheel==0.41.3
+ widgetsnbextension==4.0.9
+ xxhash==3.4.1
+ yarl==1.9.2
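The pins fit together: torch==2.1.1 is the CUDA 12 build that pulls in the nvidia-*-cu12 runtime wheels listed above, matching the cuda: 12.0.1 image declared in verb.yaml. A small hedged sanity check one could run in the workspace to confirm the environment matches what the requirements imply:

```python
# Expected on this workspace: torch 2.1.1, a 12.x CUDA runtime, GPU visible.
import torch

print(torch.__version__)          # 2.1.1 per requirements.txt
print(torch.version.cuda)         # a 12.x runtime from the cu12 wheels
print(torch.cuda.is_available())  # True on the A10G recorded below
```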
wandb/run-20231119_155754-znnni89w/files/wandb-metadata.json ADDED
@@ -0,0 +1,62 @@
+ {
+   "os": "Linux-5.15.0-1021-aws-x86_64-with-glibc2.35",
+   "python": "3.10.12",
+   "heartbeatAt": "2023-11-19T15:57:56.237031",
+   "startedAt": "2023-11-19T15:57:54.802501",
+   "docker": null,
+   "cuda": null,
+   "args": [],
+   "state": "running",
+   "program": "mistral-finetune-own-data.ipynb",
+   "codePathLocal": "mistral-finetune-own-data.ipynb",
+   "root": "/home/ubuntu",
+   "host": "verb-workspace",
+   "username": "ubuntu",
+   "executable": "/home/ubuntu/.pyenv/versions/3.10.13/bin/python3.10",
+   "cpu_count": 2,
+   "cpu_count_logical": 4,
+   "cpu_freq": {
+     "current": 2921.4885,
+     "min": 0.0,
+     "max": 0.0
+   },
+   "cpu_freq_per_core": [
+     {
+       "current": 3285.96,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2799.998,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2799.998,
+       "min": 0.0,
+       "max": 0.0
+     },
+     {
+       "current": 2799.998,
+       "min": 0.0,
+       "max": 0.0
+     }
+   ],
+   "disk": {
+     "/": {
+       "total": 116.25236511230469,
+       "used": 56.017494201660156
+     }
+   },
+   "gpu": "NVIDIA A10G",
+   "gpu_count": 1,
+   "gpu_devices": [
+     {
+       "name": "NVIDIA A10G",
+       "memory_total": 24146608128
+     }
+   ],
+   "memory": {
+     "total": 15.44485092163086
+   }
+ }
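The single 24 GB A10G recorded here is what makes this run feasible: a 7B-parameter model held in NF4 stores its base weights in roughly half a byte per parameter, leaving headroom for LoRA adapters, paged optimizer state, and activations. A back-of-the-envelope check using the memory_total value above:

```python
# Rough arithmetic only; adapter, optimizer, and activation memory are
# workload-dependent and not estimated here.
gpu_bytes = 24_146_608_128       # "memory_total" from the metadata above
weights_4bit = 7e9 * 0.5         # ~7B params at 4 bits (0.5 bytes) each
print(f"GPU: {gpu_bytes / 2**30:.1f} GiB, "
      f"4-bit base weights: {weights_4bit / 2**30:.1f} GiB")
```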
wandb/run-20231119_155754-znnni89w/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"train/loss": 1.3415, "train/learning_rate": 1.8537074148296592e-05, "train/epoch": 0.33, "train/global_step": 130, "_timestamp": 1700413375.3028648, "_runtime": 3900.4956438541412, "_step": 24, "eval/loss": 1.4165884256362915, "eval/runtime": 221.7346, "eval/samples_per_second": 0.902, "eval/steps_per_second": 0.113, "_wandb": {"runtime": 3816}}
wandb/run-20231119_155754-znnni89w/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20231119_155754-znnni89w/logs/debug.log ADDED
@@ -0,0 +1,43 @@
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_setup.py:_flush():76] Current SDK version is 0.16.0
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_setup.py:_flush():76] Configure stats pid to 248
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_setup.py:_flush():76] Loading settings from /home/ubuntu/.config/wandb/settings
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_setup.py:_flush():76] Loading settings from /home/ubuntu/wandb/settings
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program': '<python with no main file>'}
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_setup.py:_flush():76] Applying login settings: {'api_key': '***REDACTED***'}
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_init.py:_log_setup():524] Logging user logs to /home/ubuntu/wandb/run-20231119_155754-znnni89w/logs/debug.log
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_init.py:_log_setup():525] Logging internal logs to /home/ubuntu/wandb/run-20231119_155754-znnni89w/logs/debug-internal.log
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_init.py:_jupyter_setup():470] configuring jupyter hooks <wandb.sdk.wandb_init._WandbInit object at 0x7fcc0082a710>
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_init.py:init():564] calling init triggers
+ 2023-11-19 15:57:54,804 INFO MainThread:248 [wandb_init.py:init():571] wandb.init called with sweep_config: {}
+ config: {}
+ 2023-11-19 15:57:54,805 INFO MainThread:248 [wandb_init.py:init():614] starting backend
+ 2023-11-19 15:57:54,805 INFO MainThread:248 [wandb_init.py:init():618] setting up manager
+ 2023-11-19 15:57:54,806 INFO MainThread:248 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2023-11-19 15:57:54,807 INFO MainThread:248 [wandb_init.py:init():624] backend started and connected
+ 2023-11-19 15:57:54,812 INFO MainThread:248 [wandb_run.py:_label_probe_notebook():1294] probe notebook
+ 2023-11-19 15:57:54,813 INFO MainThread:248 [wandb_run.py:_label_probe_notebook():1304] Unable to probe notebook: 'ascii' codec can't decode byte 0xe2 in position 141509: ordinal not in range(128)
+ 2023-11-19 15:57:54,813 INFO MainThread:248 [wandb_init.py:init():716] updated telemetry
+ 2023-11-19 15:57:54,819 INFO MainThread:248 [wandb_init.py:init():749] communicating run to backend with 90.0 second timeout
+ 2023-11-19 15:57:56,070 INFO MainThread:248 [wandb_run.py:_on_init():2254] communicating current version
+ 2023-11-19 15:57:56,179 INFO MainThread:248 [wandb_run.py:_on_init():2263] got version response
+ 2023-11-19 15:57:56,179 INFO MainThread:248 [wandb_init.py:init():800] starting run threads in backend
+ 2023-11-19 15:57:56,253 INFO MainThread:248 [wandb_run.py:_console_start():2233] atexit reg
+ 2023-11-19 15:57:56,253 INFO MainThread:248 [wandb_run.py:_redirect():2088] redirect: wrap_raw
+ 2023-11-19 15:57:56,253 INFO MainThread:248 [wandb_run.py:_redirect():2153] Wrapping output streams.
+ 2023-11-19 15:57:56,253 INFO MainThread:248 [wandb_run.py:_redirect():2178] Redirects installed.
+ 2023-11-19 15:57:56,254 INFO MainThread:248 [wandb_init.py:init():841] run started, returning control to user process
+ 2023-11-19 15:57:56,259 INFO MainThread:248 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'filipealmeida/Mistral-7B-v0.1-sharded', 'transformers_version': '4.36.0.dev0', 'model_type': 'llama', 'sliding_window': 4096, 'quantization_config': {'quant_method': 'QuantizationMethod.BITS_AND_BYTES', 'load_in_8bit': False, 'load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16'}, 'output_dir': 'https://drive.google.com/drive/folders/1br7ACyOyuwqaV12bJdCCqHBN0VPh25Go?usp=drive_linkmistral-finetune', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2.5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 500, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'https://drive.google.com/drive/folders/1Vbwwv3NUPw2DbUwobSlQl7wS6MSmAc-f?usp=drive_link', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 10, 'save_total_limit': None, 'save_safetensors': True, 
'save_on_each_node': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': 'mistral-finetune-2023-11-19-15-57', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None}
+ 2023-11-19 15:57:57,310 INFO MainThread:248 [jupyter.py:save_ipynb():373] not saving jupyter notebook
+ 2023-11-19 15:57:57,310 INFO MainThread:248 [wandb_init.py:_pause_backend():435] pausing backend
+ 2023-11-19 15:59:54,138 INFO MainThread:248 [wandb_init.py:_resume_backend():440] resuming backend
+ 2023-11-19 15:59:54,433 INFO MainThread:248 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'filipealmeida/Mistral-7B-v0.1-sharded', 'transformers_version': '4.36.0.dev0', 'model_type': 'llama', 'sliding_window': 4096, 'quantization_config': {'quant_method': 'QuantizationMethod.BITS_AND_BYTES', 'load_in_8bit': False, 'load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16'}, 'output_dir': 'https://drive.google.com/drive/folders/1br7ACyOyuwqaV12bJdCCqHBN0VPh25Go?usp=drive_linkmistral-finetune', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2.5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 500, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'https://drive.google.com/drive/folders/1Vbwwv3NUPw2DbUwobSlQl7wS6MSmAc-f?usp=drive_link', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 10, 'save_total_limit': None, 'save_safetensors': True, 
'save_on_each_node': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': 'mistral-finetune-2023-11-19-15-59', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None}
+ 2023-11-19 16:00:50,048 INFO MainThread:248 [jupyter.py:save_ipynb():373] not saving jupyter notebook
+ 2023-11-19 16:00:50,048 INFO MainThread:248 [wandb_init.py:_pause_backend():435] pausing backend
+ 2023-11-19 16:01:15,664 INFO MainThread:248 [wandb_init.py:_resume_backend():440] resuming backend
+ 2023-11-19 16:01:15,985 INFO MainThread:248 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 32000, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 14336, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'filipealmeida/Mistral-7B-v0.1-sharded', 'transformers_version': '4.36.0.dev0', 'model_type': 'llama', 'sliding_window': 4096, 'quantization_config': {'quant_method': 'QuantizationMethod.BITS_AND_BYTES', 'load_in_8bit': False, 'load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16'}, 'output_dir': './mistral-finetune', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2.5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 500, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 10, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 10, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': 
False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': 'mistral-finetune-2023-11-19-16-01', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_8bit', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': False, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None}
+ 2023-11-19 17:03:53,112 INFO MainThread:248 [jupyter.py:save_ipynb():373] not saving jupyter notebook
+ 2023-11-19 17:03:53,112 INFO MainThread:248 [wandb_init.py:_pause_backend():435] pausing backend
+ 2023-11-19 17:45:28,693 INFO MainThread:248 [wandb_init.py:_resume_backend():440] resuming backend
+ 2023-11-19 17:45:30,948 INFO MainThread:248 [jupyter.py:save_ipynb():373] not saving jupyter notebook
+ 2023-11-19 17:45:30,948 INFO MainThread:248 [wandb_init.py:_pause_backend():435] pausing backend
wandb/run-20231119_155754-znnni89w/run-znnni89w.wandb ADDED
Binary file (116 kB).