Spaces:
Sleeping
Sleeping
"""
Data Preparation for Fine-Tuning

Reads train.csv, validation.csv, and test.csv and writes one JSONL file per split.
"""
| import os | |
| import pandas as pd | |
| import json | |
| from datetime import datetime | |
| # ============================================================================= | |
| # CONFIGURATION | |
| # ============================================================================= | |
# Root folder for everything this script writes (JSONL files, data_stats.json,
# and the results/ and logs/ subfolders).
OUTPUT_DIR = "outputs/finetuning"
# Folder the input CSV files (train/validation/test) are read from.
DATA_DIR = "data"
# Per-file row cap passed to load_csv_file; files with more rows are randomly
# down-sampled to this size. Change this for testing vs full training.
MAX_SAMPLES = 100 # Set to None for full data
def setup_directories():
    """Ensure the output folder and its results/ and logs/ subfolders exist.

    Folders that already exist are left untouched.
    """
    for suffix in ("", "/results", "/logs"):
        os.makedirs(f"{OUTPUT_DIR}{suffix}", exist_ok=True)
| # ============================================================================= | |
| # PROMPT TEMPLATE | |
| # ============================================================================= | |
def format_for_training(question, sql):
    """Format a single example for instruction fine-tuning.

    Args:
        question: Natural-language question; interpolated with str formatting.
        sql: Target SQL statement; interpolated with str formatting.

    Returns:
        A string with a "### Question:" section followed by a "### SQL:"
        section, one item per line.
    """
    # The template is spelled out with explicit "\n" separators so that no
    # surrounding markup or whitespace can leak into the training text.
    # NOTE(review): if the trainer expects a blank line between the two
    # sections, add it here -- confirm against the inference-time prompt.
    return f"### Question:\n{question}\n### SQL:\n{sql}"
| # ============================================================================= | |
| # DATA LOADING | |
| # ============================================================================= | |
def load_csv_file(filepath, max_samples=None):
    """Read one CSV file into a DataFrame, optionally down-sampling it.

    Args:
        filepath: Path of the CSV file to read.
        max_samples: Row cap. When truthy and the file holds more rows than
            this, a random subset of exactly this many rows is returned.

    Returns:
        The loaded (possibly sampled) DataFrame, or None when the file does
        not exist (a notice is printed in that case).
    """
    if not os.path.exists(filepath):
        print(f" File not found: {filepath}")
        return None

    frame = pd.read_csv(filepath)
    over_limit = bool(max_samples) and len(frame) > max_samples
    # Fixed seed: repeated runs pick the same subset of rows.
    return frame.sample(n=max_samples, random_state=42) if over_limit else frame
def format_dataframe(df, source_name):
    """Turn each DataFrame row into a training record.

    Args:
        df: DataFrame providing 'question' and 'sql' columns.
        source_name: Label stored under the 'source' key of every record.

    Returns:
        A list with one dict per row holding the formatted prompt ('text'),
        the stringified 'question' and 'sql', and the 'source' label.
    """
    def build_record(row):
        return {
            'text': format_for_training(row['question'], row['sql']),
            'question': str(row['question']),
            'sql': str(row['sql']),
            'source': source_name,
        }

    return [build_record(row) for _, row in df.iterrows()]
def save_jsonl(data, filepath):
    """Write an iterable of JSON-serializable items as a JSON Lines file.

    Each item becomes one line of UTF-8 encoded JSON; an existing file at
    filepath is overwritten. A confirmation line is printed afterwards.
    """
    with open(filepath, 'w', encoding='utf-8') as out:
        out.writelines(json.dumps(record) + '\n' for record in data)
    print(f" Saved: {filepath}")
| # ============================================================================= | |
| # MAIN FUNCTION | |
| # ============================================================================= | |
def _load_required_split(filename, max_samples):
    """Load one required CSV split from DATA_DIR, failing clearly if it is missing."""
    path = f"{DATA_DIR}/{filename}"
    df = load_csv_file(path, max_samples)
    if df is None:
        # load_csv_file signals a missing file with None; without this check
        # the len() call below would die with an unhelpful TypeError.
        raise FileNotFoundError(f"Required data file is missing: {path}")
    print(f" {filename}: {len(df):,} rows")
    return df


def prepare_finetuning_data():
    """Prepare data for fine-tuning.

    Loads train.csv, validation.csv and test.csv from DATA_DIR (each capped
    at MAX_SAMPLES rows), converts every row to the training format, and
    writes train.jsonl, val.jsonl, test.jsonl and data_stats.json into
    OUTPUT_DIR.

    Returns:
        dict with the keys 'train_samples', 'val_samples', 'test_samples',
        'max_samples' and 'created_at' (ISO timestamp of this run).

    Raises:
        FileNotFoundError: if any of the three CSV files does not exist.
    """
    print("=" * 50)
    print("PREPARING FINE-TUNING DATA")
    print(f"Max samples per file: {MAX_SAMPLES if MAX_SAMPLES else 'ALL'}")
    print("=" * 50)
    setup_directories()

    # Load train data
    print("\n[1/5] Loading training data...")
    train_df = _load_required_split("train.csv", MAX_SAMPLES)

    # Load synthetic and combine with train (currently disabled)
    # synthetic_df = load_csv_file(f"{DATA_DIR}/synthetic.csv", MAX_SAMPLES)
    # if synthetic_df is not None:
    #     print(f" synthetic.csv: {len(synthetic_df):,} rows")
    #     train_df = pd.concat([train_df, synthetic_df], ignore_index=True)
    #     print(f" Combined training: {len(train_df):,} rows")

    # Load validation data
    print("\n[2/5] Loading validation data...")
    val_df = _load_required_split("validation.csv", MAX_SAMPLES)

    # Load test data
    print("\n[3/5] Loading test data...")
    test_df = _load_required_split("test.csv", MAX_SAMPLES)

    # Format data
    print("\n[4/5] Formatting data...")
    train_data = format_dataframe(train_df, 'train')
    val_data = format_dataframe(val_df, 'validation')
    test_data = format_dataframe(test_df, 'test')

    # Save files
    print("\n[5/5] Saving files...")
    save_jsonl(train_data, f"{OUTPUT_DIR}/train.jsonl")
    save_jsonl(val_data, f"{OUTPUT_DIR}/val.jsonl")
    save_jsonl(test_data, f"{OUTPUT_DIR}/test.jsonl")

    # Save stats
    stats = {
        'train_samples': len(train_data),
        'val_samples': len(val_data),
        'test_samples': len(test_data),
        'max_samples': MAX_SAMPLES,
        'created_at': datetime.now().isoformat(),
    }
    # Explicit encoding keeps the stats file consistent with the JSONL output
    # regardless of the platform's default locale.
    with open(f"{OUTPUT_DIR}/data_stats.json", 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)

    # Summary
    print("\n" + "=" * 50)
    print("COMPLETE")
    print("=" * 50)
    print(f" Train: {len(train_data):,}")
    print(f" Val: {len(val_data):,}")
    print(f" Test: {len(test_data):,}")
    return stats
| # ============================================================================= | |
| # ENTRY POINT | |
| # ============================================================================= | |
# Run the preparation pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    prepare_finetuning_data()