"""
Fine-Tuning Script for SQL Generation Model
Uses LoRA for efficient fine-tuning.
"""
import os
import json
import torch
from datetime import datetime
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
# =============================================================================
# CONFIGURATION
# =============================================================================
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = "outputs/finetuning"
CHECKPOINT_DIR = f"{OUTPUT_DIR}/checkpoints"
LOGS_DIR = f"{OUTPUT_DIR}/logs"
# Training config (optimized for RTX 4070)
TRAINING_CONFIG = {
    'num_epochs': 3,
    'batch_size': 8,
    'learning_rate': 2e-4,
    'max_length': 256,
    'warmup_steps': 100,
    'logging_steps': 50,
    'save_steps': 500,
    'gradient_accumulation_steps': 2,
}
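# Effective batch size = batch_size * gradient_accumulation_steps = 8 * 2 = 16.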
# LoRA config
LORA_CONFIG = {
    'r': 16,
    'lora_alpha': 32,
    'lora_dropout': 0.1,
    'target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj']
}
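# peft scales each adapter update by lora_alpha / r (32 / 16 = 2.0 here);
# r sets the adapter rank (and parameter count), alpha the update strength.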
def setup_directories():
    """Create output directories if they don't already exist."""
    for d in [OUTPUT_DIR, CHECKPOINT_DIR, LOGS_DIR]:
        os.makedirs(d, exist_ok=True)
# =============================================================================
# TRAINING FUNCTIONS
# =============================================================================
def load_data():
    """Load prepared training data."""
    train_file = f"{OUTPUT_DIR}/train.jsonl"
    val_file = f"{OUTPUT_DIR}/val.jsonl"
    if not os.path.exists(train_file) or not os.path.exists(val_file):
        raise FileNotFoundError("Run prepare_data.py first!")
    return load_dataset('json', data_files={
        'train': train_file,
        'validation': val_file
    })
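# Each JSONL record must carry a 'text' field, since tokenize() reads
# examples['text']. The exact prompt template inside 'text' comes from
# prepare_data.py; something like {"text": "Question: ...\nSQL: SELECT ..."}
# is assumed here for illustration only.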
def setup_model():
    """Load model and tokenizer, then attach LoRA adapters."""
    print(f"Loading: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token  # Llama-family tokenizers ship without a pad token
    tokenizer.padding_side = "right"
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=LORA_CONFIG['r'],
        lora_alpha=LORA_CONFIG['lora_alpha'],
        lora_dropout=LORA_CONFIG['lora_dropout'],
        target_modules=LORA_CONFIG['target_modules']
    )
    # Only the adapter weights are trainable after this; the fp16 base stays frozen.
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model, tokenizer
def tokenize(examples, tokenizer):
    """Tokenize examples."""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=TRAINING_CONFIG['max_length']
    )
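# Note: padding='max_length' pads every example to 256 tokens; the
# DataCollatorForLanguageModeling used below masks those pad positions to -100
# in the labels, so they are excluded from the loss. Dynamic per-batch padding
# (dropping the padding here and letting the collator pad) would also work.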
def train(model, tokenizer, dataset):
    """Train the model."""
    # Tokenize
    print("Tokenizing...")
    tokenized_train = dataset['train'].map(
        lambda x: tokenize(x, tokenizer),
        batched=True,
        remove_columns=dataset['train'].column_names
    )
    tokenized_val = dataset['validation'].map(
        lambda x: tokenize(x, tokenizer),
        batched=True,
        remove_columns=dataset['validation'].column_names
    )
    # Training args
    training_args = TrainingArguments(
        output_dir=CHECKPOINT_DIR,
        num_train_epochs=TRAINING_CONFIG['num_epochs'],
        per_device_train_batch_size=TRAINING_CONFIG['batch_size'],
        per_device_eval_batch_size=TRAINING_CONFIG['batch_size'],
        learning_rate=TRAINING_CONFIG['learning_rate'],
        warmup_steps=TRAINING_CONFIG['warmup_steps'],
        logging_steps=TRAINING_CONFIG['logging_steps'],
        save_steps=TRAINING_CONFIG['save_steps'],
        gradient_accumulation_steps=TRAINING_CONFIG['gradient_accumulation_steps'],
        eval_strategy="steps",  # named 'evaluation_strategy' on transformers < 4.41
        eval_steps=TRAINING_CONFIG['save_steps'],
        save_total_limit=2,
        fp16=True,
        report_to="none",
        logging_dir=LOGS_DIR,
        dataloader_pin_memory=False,
    )
    # Trainer; mlm=False gives causal-LM labels (labels = input_ids, with pad
    # positions masked to -100). Because pad_token == eos_token above, EOS
    # positions are masked out of the loss as well.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    )
    # Train
    print(f"\nTraining: {len(tokenized_train)} samples, {TRAINING_CONFIG['num_epochs']} epochs")
    result = trainer.train()
    # Save
    print("\nSaving model...")
    trainer.save_model(f"{CHECKPOINT_DIR}/final")
    tokenizer.save_pretrained(f"{CHECKPOINT_DIR}/final")
    # Stats
    stats = {
        'train_loss': result.training_loss,
        'runtime_seconds': result.metrics['train_runtime'],
        'samples_per_second': result.metrics['train_samples_per_second'],
        'epochs': TRAINING_CONFIG['num_epochs'],
        'total_steps': result.global_step,
        'gpu': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU',
        'completed_at': datetime.now().isoformat()
    }
    with open(f"{CHECKPOINT_DIR}/training_stats.json", 'w') as f:
        json.dump(stats, f, indent=2)
    return stats
# =============================================================================
# MAIN
# =============================================================================
def run_finetuning():
    """Main function."""
    print("=" * 60)
    print("FINE-TUNING SQL MODEL")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("GPU: Not available (using CPU)")
    print("=" * 60)
    setup_directories()
    # Load data
    print("\n[1/3] Loading data...")
    dataset = load_data()
    print(f"  Train: {len(dataset['train']):,}")
    print(f"  Val:   {len(dataset['validation']):,}")
    # Setup model
    print("\n[2/3] Setting up model...")
    model, tokenizer = setup_model()
    # Train
    print("\n[3/3] Training...")
    stats = train(model, tokenizer, dataset)
    # Done
    print("\n" + "=" * 60)
    print("TRAINING COMPLETE")
    print("=" * 60)
    print(f"  Loss:  {stats['train_loss']:.4f}")
    print(f"  Time:  {stats['runtime_seconds']/60:.1f} min")
    print(f"  Model: {CHECKPOINT_DIR}/final")
    return stats
if __name__ == "__main__":
    run_finetuning()
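# A rough sketch of loading the result for inference (untested here; assumes
# peft's PeftModel API and enough memory for the fp16 base model):
#
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained(
#       MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
#   model = PeftModel.from_pretrained(base, f"{CHECKPOINT_DIR}/final")
#   model = model.merge_and_unload()  # optional: fold the adapters into the base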