moheesh committed
Commit f29ea6c · 1 Parent(s): e9aa12a

got all my code

Files changed (44)
  1. Dockerfile +1 -1
  2. src/.env.example +32 -0
  3. src/.gitignore +100 -0
  4. src/README.md +209 -0
  5. src/app.py +497 -0
  6. src/config.py +98 -0
  7. src/finetuning/__init__.py +0 -0
  8. src/finetuning/evaluate.py +293 -0
  9. src/finetuning/inference.py +168 -0
  10. src/finetuning/prepare_data.py +149 -0
  11. src/finetuning/train.py +218 -0
  12. src/outputs/finetuning/data_stats.json +7 -0
  13. src/outputs/finetuning/results/evaluation_report.md +26 -0
  14. src/outputs/finetuning/results/evaluation_results.json +7 -0
  15. src/outputs/finetuning/test.jsonl +100 -0
  16. src/outputs/finetuning/train.jsonl +100 -0
  17. src/outputs/finetuning/val.jsonl +100 -0
  18. src/outputs/finetuning/visualizations/01_metrics_overview.png +0 -0
  19. src/outputs/finetuning/visualizations/02_token_accuracy_dist.png +0 -0
  20. src/outputs/finetuning/visualizations/03_keyword_accuracy_dist.png +0 -0
  21. src/outputs/finetuning/visualizations/04_training_loss.png +0 -0
  22. src/outputs/rag/reports/knowledge_base_report.md +46 -0
  23. src/outputs/rag/stats/knowledge_base_stats.json +22 -0
  24. src/outputs/synthetic/reports/synthetic_report.md +47 -0
  25. src/outputs/synthetic/stats/statistics.json +24 -0
  26. src/outputs/synthetic/visualizations/01_size_comparison.png +0 -0
  27. src/outputs/synthetic/visualizations/02_length_distribution.png +0 -0
  28. src/outputs/synthetic/visualizations/03_diversity_distribution.png +0 -0
  29. src/pipeline/integrated.py +584 -0
  30. src/prompts/__init__.py +0 -0
  31. src/prompts/prompt_builder.py +440 -0
  32. src/prompts/system_prompts.py +162 -0
  33. src/rag/__init__.py +0 -0
  34. src/rag/embeddings.py +87 -0
  35. src/rag/knowledge_base.py +415 -0
  36. src/rag/retriever.py +234 -0
  37. src/requirements.txt +23 -0
  38. src/streamlit_app.py +0 -40
  39. src/synthetic/__init__.py +0 -0
  40. src/synthetic/generate_data.py +401 -0
  41. src/synthetic/synonyms.py +149 -0
  42. src/tests/test_finetuned.py +0 -0
  43. src/tests/test_rag.py +0 -0
  44. src/tests/test_synthetic.py +0 -0
Dockerfile CHANGED
@@ -17,4 +17,4 @@ EXPOSE 8501
 
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
src/.env.example ADDED
@@ -0,0 +1,32 @@
1
+ # =============================================================================
2
+ # GEMINI API KEYS (Required)
3
+ # =============================================================================
4
+ GEMINI_API_KEY=your-primary-gemini-key
5
+ GEMINI_API_KEY_FALLBACK_1=your-fallback-key-1
6
+ GEMINI_API_KEY_FALLBACK_2=your-fallback-key-2
7
+
8
+ # =============================================================================
9
+ # GEMINI MODELS
10
+ # =============================================================================
11
+ GEMINI_MODEL=gemini-2.5-flash
12
+ GEMINI_MODEL_FALLBACK_1=gemini-2.5-flash-lite
13
+
14
+ # =============================================================================
15
+ # HUGGINGFACE (Required for cloud deployment, optional for local)
16
+ # =============================================================================
17
+ HF_TOKEN=your-huggingface-token
18
+ HF_MODEL_ID=your-username/sql-tinyllama-lora
19
+ HF_CHROMADB_ID=your-username/sql-chromadb
20
+
21
+ # =============================================================================
22
+ # HOW IT WORKS:
23
+ # =============================================================================
24
+ # LOCAL RUN:
25
+ # - If outputs/finetuning/checkpoints/final exists → uses local model
26
+ # - If chromadb_data exists → uses local ChromaDB
27
+ #
28
+ # CLOUD RUN (Streamlit):
29
+ # - If HF_MODEL_ID set → downloads model from HuggingFace
30
+ # - If HF_CHROMADB_ID set → downloads ChromaDB from HuggingFace
31
+ # - Falls back to building ChromaDB from data/ folder if needed
32
+ # =============================================================================
src/.gitignore ADDED
@@ -0,0 +1,100 @@
1
+ # =============================================================================
2
+ # ENVIRONMENT & SECRETS
3
+ # =============================================================================
4
+ .env
5
+ .env.local
6
+ .env.production
7
+
8
+ # =============================================================================
9
+ # VIRTUAL ENVIRONMENT
10
+ # =============================================================================
11
+ .venv/
12
+ venv/
13
+ env/
14
+ ENV/
15
+ data/
16
+ data/synthetic.csv
17
+ # =============================================================================
18
+ # PYTHON
19
+ # =============================================================================
20
+ __pycache__/
21
+ *.py[cod]
22
+ *$py.class
23
+ *.so
24
+ .Python
25
+ build/
26
+ develop-eggs/
27
+ dist/
28
+ downloads/
29
+ eggs/
30
+ .eggs/
31
+ lib/
32
+ lib64/
33
+ parts/
34
+ sdist/
35
+ var/
36
+ wheels/
37
+ *.egg-info/
38
+ .installed.cfg
39
+ *.egg
40
+
41
+ # =============================================================================
42
+ # MODEL FILES (Upload to HuggingFace instead)
43
+ # =============================================================================
44
+ outputs/finetuning/checkpoints/
45
+ *.bin
46
+ *.pt
47
+ *.pth
48
+ *.safetensors
49
+ *.ckpt
50
+
51
+ # =============================================================================
52
+ # CHROMADB (Upload to HuggingFace instead)
53
+ # =============================================================================
54
+ chromadb_data/
55
+
56
+ # =============================================================================
57
+ # LOGS & OUTPUTS
58
+ # =============================================================================
59
+ *.log
60
+ outputs/*/logs/
61
+ outputs/pipeline/logs/
62
+ outputs/prompts/logs/
63
+
64
+ # =============================================================================
65
+ # IDE
66
+ # =============================================================================
67
+ .vscode/
68
+ .idea/
69
+ *.swp
70
+ *.swo
71
+ *~
72
+
73
+ # =============================================================================
74
+ # OS
75
+ # =============================================================================
76
+ .DS_Store
77
+ .DS_Store?
78
+ ._*
79
+ .Spotlight-V100
80
+ .Trashes
81
+ ehthumbs.db
82
+ Thumbs.db
83
+
84
+ # =============================================================================
85
+ # JUPYTER
86
+ # =============================================================================
87
+ .ipynb_checkpoints/
88
+ *.ipynb
89
+
90
+ # =============================================================================
91
+ # KEEP THESE (don't ignore)
92
+ # =============================================================================
93
+ # !data/
94
+ # !data/*.csv
95
+ # !docs/
96
+ # !docs/index.html
97
+ # !*.py
98
+ # !requirements.txt
99
+ # !README.md
100
+ # !.env.example
src/README.md ADDED
@@ -0,0 +1,209 @@
1
+ # ⚡ Prompt to SQL using RAG + LLM
2
+
3
+ AI-powered Natural Language to SQL conversion using RAG, Fine-tuned LLM, and Gemini Enhancement.
4
+
5
+ ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
6
+ ![Streamlit](https://img.shields.io/badge/Streamlit-1.28+-red.svg)
7
+ ![License](https://img.shields.io/badge/License-MIT-green.svg)
8
+
9
+ ## 🌐 Live Demo
10
+
11
+ - **🚀 Web App:** [Streamlit App](https://your-app.streamlit.app)
12
+ - **📄 Project Page:** [GitHub Pages](https://moheesh.github.io/Prompt_to_SQL_using_RAG_LLM)
13
+
14
+ ## ✨ Features
15
+
16
+ | Feature | Description |
17
+ |---------|-------------|
18
+ | 🔍 **RAG Retrieval** | 80,000+ SQL examples in ChromaDB vector store |
19
+ | 🤖 **Fine-tuned LLM** | TinyLlama with LoRA for SQL generation |
20
+ | ✨ **Gemini Enhancement** | Query refinement, validation & explanation |
21
+ | 📝 **Prompt Engineering** | Context management, edge cases, query analysis |
22
+ | 📦 **Synthetic Data** | Data augmentation with 5 techniques |
23
+ | 🔄 **Auto Fallback** | Multiple API keys & models for reliability (see the sketch below) |
24
+
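The "Auto Fallback" feature above is implemented in `pipeline/integrated.py`, which is not reproduced in this view. As a rough illustration only — assuming the `google-generativeai` package and the `GEMINI_KEYS` / `GEMINI_MODELS` lists that `src/config.py` builds from the environment variables, and run from the `src/` directory — key/model rotation can look like this:

```python
import google.generativeai as genai
from config import GEMINI_KEYS, GEMINI_MODELS

def generate_with_fallback(prompt: str) -> str | None:
    """Try every configured API key and model until one call succeeds."""
    for key in GEMINI_KEYS:
        genai.configure(api_key=key)
        for model_name in GEMINI_MODELS:
            try:
                model = genai.GenerativeModel(model_name)
                return model.generate_content(prompt).text
            except Exception:
                continue  # quota or availability error: try the next model/key
    return None  # every key/model combination failed
```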
25
+ ## 🔄 Pipeline Architecture
26
+
27
+ ```
28
+ ┌─────────────────────┐
29
+ │ Synthetic Data │ (Training augmentation)
30
+ └─────────┬───────────┘
31
+
32
+ ┌─────────────────────┐
33
+ │ Fine-tuned Model │ (LoRA training on TinyLlama)
34
+ └─────────┬───────────┘
35
+
36
+ ┌─────────────────────┐
37
+ │ User Question │ (Natural language input)
38
+ └─────────┬───────────┘
39
+
40
+ ┌─────────────────────┐
41
+ │ RAG Retrieval │ (Similar examples from ChromaDB)
42
+ └─────────┬───────────┘
43
+
44
+ ┌─────────────────────┐
45
+ │ Prompt Engineering │ (Context + query formatting)
46
+ └─────────┬───────────┘
47
+
48
+ ┌─────────────────────┐
49
+ │ Fine-tuned Model │ (SQL generation)
50
+ └─────────┬───────────┘
51
+
52
+ ┌─────────────────────┐
53
+ │ Gemini Enhancement │ (Refine + explain)
54
+ └─────────┬───────────┘
55
+
56
+ ┌─────────────────────┐
57
+ │ Final SQL │ (Optimized output)
58
+ └─────────────────────┘
59
+ ```
60
+
61
+ ## 📁 Project Structure
62
+
63
+ ```
64
+ Prompt_to_SQL_using_RAG_LLM/
65
+ ├── app.py # Streamlit UI
66
+ ├── config.py # Central configuration
67
+ ├── requirements.txt # Dependencies
68
+
69
+ ├── pipeline/
70
+ │ └── integrated.py # Main pipeline (RAG + Model + Gemini)
71
+
72
+ ├── finetuning/
73
+ │ ├── prepare_data.py # Data preparation
74
+ │ ├── train.py # LoRA fine-tuning
75
+ │ ├── evaluate.py # Model evaluation
76
+ │ └── inference.py # SQL generation
77
+
78
+ ├── rag/
79
+ │ ├── embeddings.py # Sentence transformers
80
+ │ ├── knowledge_base.py # ChromaDB builder
81
+ │ └── retriever.py # LangChain retriever
82
+
83
+ ├── prompts/
84
+ │ ├── prompt_builder.py # Context management
85
+ │ └── system_prompts.py # Prompt templates
86
+
87
+ ├── synthetic/
88
+ │ ├── generate_data.py # Data augmentation
89
+ │ └── synonyms.py # Synonym dictionary
90
+
91
+ ├── data/
92
+ │ ├── train.csv
93
+ │ ├── validation.csv
94
+ │ └── test.csv
95
+
96
+ └── docs/
97
+ └── index.html # GitHub Pages
98
+ ```
99
+
100
+ ## 🛠️ Setup
101
+
102
+ ### 1. Clone the Repository
103
+
104
+ ```bash
105
+ git clone https://github.com/moheesh/Prompt_to_SQL_using_RAG_LLM.git
106
+ cd Prompt_to_SQL_using_RAG_LLM
107
+ ```
108
+
109
+ ### 2. Create Virtual Environment
110
+
111
+ ```bash
112
+ python -m venv .venv
113
+
114
+ # Windows
115
+ .venv\Scripts\activate
116
+
117
+ # Mac/Linux
118
+ source .venv/bin/activate
119
+ ```
120
+
121
+ ### 3. Install Dependencies
122
+
123
+ ```bash
124
+ pip install -r requirements.txt
125
+ ```
126
+
127
+ ### 4. Configure Environment
128
+
129
+ Create a `.env` file:
130
+
131
+ ```env
132
+ # Gemini API
133
+ GEMINI_API_KEY=your-primary-key
134
+ GEMINI_MODEL=gemini-2.5-flash
135
+
136
+ # HuggingFace (for cloud deployment)
137
+ HF_TOKEN=your-hf-token
138
+ HF_MODEL_ID=your-username/sql-tinyllama-lora
139
+ HF_CHROMADB_ID=your-username/sql-chromadb
140
+ ```
141
+
142
+ ### 5. Build Knowledge Base (First Time)
143
+
144
+ ```bash
145
+ python rag/knowledge_base.py
146
+ ```
147
+
148
+ ### 6. Run the App
149
+
150
+ ```bash
151
+ streamlit run app.py
152
+ ```
153
+
154
+ ## 🚀 Deployment
155
+
156
+ ### Upload to HuggingFace
157
+
158
+ ```bash
159
+ # Login
160
+ huggingface-cli login
161
+
162
+ # Upload model
163
+ python -c "from huggingface_hub import HfApi; api = HfApi(); api.upload_folder(folder_path='outputs/finetuning/checkpoints/final', repo_id='moheesh/sql-tinyllama-lora', repo_type='model', create_repo=True)"
164
+
165
+ # Upload ChromaDB
166
+ python -c "from huggingface_hub import HfApi; api = HfApi(); api.upload_folder(folder_path='chromadb_data', repo_id='moheesh/sql-chromadb', repo_type='dataset', create_repo=True)"
167
+ ```
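The two `python -c` one-liners above are equivalent to the short script below, with repo creation done as a separate explicit step (`create_repo(..., exist_ok=True)`) so reruns do not fail:

```python
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or HF_TOKEN

# Fine-tuned LoRA checkpoint -> model repo
api.create_repo("moheesh/sql-tinyllama-lora", repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path="outputs/finetuning/checkpoints/final",
    repo_id="moheesh/sql-tinyllama-lora",
    repo_type="model",
)

# ChromaDB vector store -> dataset repo
api.create_repo("moheesh/sql-chromadb", repo_type="dataset", exist_ok=True)
api.upload_folder(
    folder_path="chromadb_data",
    repo_id="moheesh/sql-chromadb",
    repo_type="dataset",
)
```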
168
+
169
+ ### Deploy to Streamlit Cloud
170
+
171
+ 1. Push code to GitHub
172
+ 2. Go to [share.streamlit.io](https://share.streamlit.io)
173
+ 3. Connect your repo
174
+ 4. Add secrets (same as `.env`)
175
+ 5. Deploy!
176
+
177
+ ## 🛠️ Tech Stack
178
+
179
+ | Component | Technology |
180
+ |-----------|------------|
181
+ | LLM | TinyLlama + LoRA |
182
+ | Vector DB | ChromaDB |
183
+ | Embeddings | all-MiniLM-L6-v2 |
184
+ | Enhancement | Gemini API |
185
+ | Framework | LangChain |
186
+ | UI | Streamlit |
187
+
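The retrieval side of this stack lives in `rag/embeddings.py` and `rag/retriever.py`. The sketch below is not that code, only an illustration of how the pieces in the table typically connect; it assumes the `chromadb_data` store and the `sql_knowledge` collection named in `config.py`:

```python
import chromadb
from sentence_transformers import SentenceTransformer

# Embed the user question with the same model used to build the store
embedder = SentenceTransformer("all-MiniLM-L6-v2")
query_vec = embedder.encode("Find all employees with salary above 50000").tolist()

# Query the persisted ChromaDB collection for the most similar examples
client = chromadb.PersistentClient(path="chromadb_data")
collection = client.get_collection("sql_knowledge")
hits = collection.query(query_embeddings=[query_vec], n_results=3)

for doc, meta in zip(hits["documents"][0], hits["metadatas"][0]):
    print(doc, meta)
```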
188
+ ## 📊 Evaluation Metrics
189
+
190
+ | Metric | Score |
191
+ |--------|-------|
192
+ | Exact Match | XX% |
193
+ | Token Accuracy | XX% |
194
+ | Keyword Accuracy | XX% |
195
+ | Structure Similarity | XX% |
196
+
197
+ ## 🎓 Course
198
+
199
+ **INFO7375** - Northeastern University
200
+
201
+ ## 👤 Author
202
+
203
+ **Your Name**
204
+ - GitHub: [@moheesh](https://github.com/moheesh)
205
+ - LinkedIn: [LinkedIn](https://linkedin.com/in/moheesh-k-a-a95306169)
206
+
207
+ ## 📝 License
208
+
209
+ MIT License - see [LICENSE](LICENSE) for details.
src/app.py ADDED
@@ -0,0 +1,497 @@
1
+ """
2
+ Streamlit App for SQL Learning Assistant
3
+ Integrates: RAG + Fine-tuned Model + Gemini Enhancement
4
+ """
5
+
6
+ import streamlit as st
7
+ import os
8
+ import sys
9
+ from dotenv import load_dotenv
10
+
11
+ # Load environment variables FIRST
12
+ load_dotenv()
13
+
14
+ # Add this file's directory to the import path
15
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
16
+
17
+ # =============================================================================
18
+ # PAGE CONFIG - MUST BE FIRST STREAMLIT COMMAND
19
+ # =============================================================================
20
+
21
+ st.set_page_config(
22
+ page_title="SQL Learning Assistant",
23
+ page_icon="⚡",
24
+ layout="wide",
25
+ initial_sidebar_state="expanded"
26
+ )
27
+
28
+ # =============================================================================
29
+ # CACHED LOADERS - Load on-demand, cache forever
30
+ # =============================================================================
31
+
32
+ @st.cache_resource(show_spinner=False)
33
+ def load_chromadb():
34
+ """Download ChromaDB from HuggingFace if needed."""
35
+ chromadb_path = "chromadb_data"
36
+ hf_chromadb_id = os.getenv("HF_CHROMADB_ID", None)
37
+
38
+ has_files = False
39
+ if os.path.exists(chromadb_path):
40
+ local_files = os.listdir(chromadb_path) if os.path.isdir(chromadb_path) else []
41
+ has_files = any('chroma' in f.lower() or 'sqlite' in f.lower() for f in local_files) or len(local_files) > 2
42
+
43
+ if not has_files and hf_chromadb_id:
44
+ from huggingface_hub import snapshot_download
45
+ os.makedirs(chromadb_path, exist_ok=True)
46
+ snapshot_download(repo_id=hf_chromadb_id, repo_type="dataset", local_dir=chromadb_path)
47
+
48
+ return chromadb_path
49
+
50
+ @st.cache_resource(show_spinner=False)
51
+ def load_retriever():
52
+ """Load the RAG retriever."""
53
+ load_chromadb()
54
+ from rag.retriever import SQLRetriever
55
+ return SQLRetriever()
56
+
57
+ @st.cache_resource(show_spinner=False)
58
+ def load_model():
59
+ """Load the fine-tuned model."""
60
+ from finetuning.inference import SQLGenerator
61
+ return SQLGenerator()
62
+
63
+ @st.cache_resource(show_spinner=False)
64
+ def load_prompt_builder():
65
+ """Load prompt builder."""
66
+ from prompts.prompt_builder import PromptBuilder
67
+ return PromptBuilder()
68
+
69
+ @st.cache_resource(show_spinner=False)
70
+ def load_gemini():
71
+ """Load Gemini client."""
72
+ from pipeline.integrated import GeminiClient, GEMINI_KEYS
73
+ if GEMINI_KEYS:
74
+ return GeminiClient()
75
+ return None
76
+
77
+ # =============================================================================
78
+ # HELPER FUNCTION TO RUN PIPELINE
79
+ # =============================================================================
80
+
81
+ def run_pipeline(question, num_examples=3):
82
+ """Run the full pipeline - loads components on first use."""
83
+ result = {
84
+ 'question': question,
85
+ 'success': False,
86
+ 'steps': {}
87
+ }
88
+
89
+ # Step 1: RAG
90
+ rag_context = ""
91
+ examples = []
92
+ try:
93
+ with st.spinner("🔍 Loading RAG system..."):
94
+ retriever = load_retriever()
95
+ if retriever:
96
+ examples = retriever.retrieve(question, top_k=num_examples)
97
+ rag_context = "Similar SQL examples:\n\n"
98
+ for i, r in enumerate(examples, 1):
99
+ rag_context += f"Example {i}:\nQuestion: {r['question']}\nSQL: {r['sql']}\n\n"
100
+ except Exception as e:
101
+ st.warning(f"RAG error: {e}")
102
+
103
+ result['steps']['rag'] = {'examples': examples, 'num_examples': len(examples), 'context': rag_context}
104
+
105
+ # Step 2: Prompt
106
+ prompt = ""
107
+ try:
108
+ prompt_builder = load_prompt_builder()
109
+ if prompt_builder:
110
+ prompt_result = prompt_builder.build_prompt(question=question, rag_context=rag_context)
111
+ if prompt_result['success']:
112
+ prompt = prompt_result['prompt']
113
+ except:
114
+ pass
115
+ if not prompt:
116
+ prompt = f"{rag_context}\nQuestion: {question}\n\nSQL:"
117
+
118
+ result['steps']['prompt'] = {'prompt': prompt, 'length': len(prompt)}
119
+
120
+ # Step 3: Fine-tuned Model
121
+ finetuned_sql = None
122
+ try:
123
+ with st.spinner("🤖 Loading AI model..."):
124
+ model = load_model()
125
+ if model:
126
+ finetuned_sql = model.generate(question, rag_context)
127
+ except Exception as e:
128
+ st.warning(f"Model error: {e}")
129
+
130
+ result['steps']['finetuned'] = {'sql': finetuned_sql, 'error': None if finetuned_sql else 'Model not available'}
131
+
132
+ if not finetuned_sql:
133
+ return result
134
+
135
+ # Step 4: Gemini Enhancement
136
+ enhanced_sql = finetuned_sql
137
+ try:
138
+ gemini = load_gemini()
139
+ if gemini:
140
+ enhance_prompt = f"""You are an SQL expert. Review and enhance this SQL query.
141
+
142
+ Original Question: {question}
143
+
144
+ Generated SQL (by a smaller model):
145
+ {finetuned_sql}
146
+
147
+ Rules:
148
+ - If the SQL is correct, return it unchanged
149
+ - If it needs fixes, return the corrected version
150
+ - Return ONLY the SQL query, no explanations
151
+
152
+ Enhanced SQL:"""
153
+ response, error = gemini.generate(enhance_prompt)
154
+ if response and not error:
155
+ enhanced_sql = response.strip()
156
+ if enhanced_sql.startswith("```"):
157
+ lines = enhanced_sql.split("\n")
158
+ enhanced_sql = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
159
+ if enhanced_sql.lower().startswith("sql"):
160
+ enhanced_sql = enhanced_sql[3:].strip()
161
+ except Exception as e:
162
+ st.warning(f"Gemini enhance error: {e}")
163
+
164
+ result['steps']['gemini_enhance'] = {'sql': enhanced_sql, 'info': {'enhanced': enhanced_sql != finetuned_sql}}
165
+ result['final_sql'] = enhanced_sql
166
+
167
+ # Step 5: Explanation
168
+ explanation = ""
169
+ try:
170
+ gemini = load_gemini()
171
+ if gemini:
172
+ explain_prompt = f"Explain this SQL query in simple terms (2-3 sentences):\n\nSQL: {enhanced_sql}"
173
+ response, error = gemini.generate(explain_prompt)
174
+ if response and not error:
175
+ explanation = response.strip()
176
+ except:
177
+ pass
178
+
179
+ result['explanation'] = explanation
180
+ result['success'] = True
181
+
182
+ return result
183
+
184
+ # =============================================================================
185
+ # CUSTOM CSS
186
+ # =============================================================================
187
+
188
+ st.markdown("""
189
+ <style>
190
+ .stApp {
191
+ background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 50%, #16213e 100%);
192
+ }
193
+
194
+ .main-header {
195
+ font-size: 3rem;
196
+ font-weight: 800;
197
+ background: linear-gradient(120deg, #00d4ff, #7c3aed, #f472b6);
198
+ -webkit-background-clip: text;
199
+ -webkit-text-fill-color: transparent;
200
+ background-clip: text;
201
+ text-align: center;
202
+ margin-bottom: 0.5rem;
203
+ }
204
+
205
+ .sub-header {
206
+ font-size: 1.1rem;
207
+ color: #94a3b8;
208
+ text-align: center;
209
+ margin-bottom: 2rem;
210
+ }
211
+
212
+ .stButton > button {
213
+ background: linear-gradient(135deg, #1e293b 0%, #334155 100%);
214
+ color: #e2e8f0;
215
+ border: 1px solid #475569;
216
+ border-radius: 10px;
217
+ transition: all 0.3s ease;
218
+ }
219
+
220
+ .stButton > button:hover {
221
+ background: linear-gradient(135deg, #3b82f6 0%, #8b5cf6 100%);
222
+ border-color: #60a5fa;
223
+ transform: translateY(-2px);
224
+ }
225
+
226
+ .stTextInput > div > div > input {
227
+ background: rgba(30, 41, 59, 0.8);
228
+ border: 1px solid #475569;
229
+ border-radius: 12px;
230
+ color: #f1f5f9;
231
+ }
232
+
233
+ [data-testid="stSidebar"] {
234
+ background: linear-gradient(180deg, #0f172a 0%, #1e293b 100%);
235
+ }
236
+
237
+ .pipeline-box {
238
+ background: rgba(30, 41, 59, 0.6);
239
+ border: 1px solid #475569;
240
+ border-radius: 8px;
241
+ padding: 0.5rem 1rem;
242
+ margin: 0.25rem 0;
243
+ font-size: 0.85rem;
244
+ text-align: center;
245
+ }
246
+
247
+ .pipeline-arrow {
248
+ color: #3b82f6;
249
+ text-align: center;
250
+ font-size: 1.2rem;
251
+ }
252
+ </style>
253
+ """, unsafe_allow_html=True)
254
+
255
+ # =============================================================================
256
+ # HEADER
257
+ # =============================================================================
258
+
259
+ st.markdown('<p class="main-header">⚡ SQL Learning Assistant</p>', unsafe_allow_html=True)
260
+ st.markdown('<p class="sub-header">Transform Natural Language into SQL using AI-Powered Pipeline</p>', unsafe_allow_html=True)
261
+
262
+ # =============================================================================
263
+ # SIDEBAR
264
+ # =============================================================================
265
+
266
+ with st.sidebar:
267
+ st.markdown("## ⚙️ Configuration")
268
+ st.markdown("---")
269
+
270
+ st.markdown("### 🎯 RAG Settings")
271
+ num_examples = st.slider("Similar examples to retrieve", min_value=1, max_value=5, value=3)
272
+
273
+ st.markdown("---")
274
+
275
+ st.markdown("### 📊 System Status")
276
+ col1, col2 = st.columns(2)
277
+ with col1:
278
+ st.markdown("✅ **RAG**")
279
+ st.markdown("✅ **Model**")
280
+ with col2:
281
+ st.markdown("✅ **Prompts**")
282
+ if os.getenv("GEMINI_API_KEY"):
283
+ st.markdown("✅ **Gemini**")
284
+ else:
285
+ st.markdown("❌ **Gemini**")
286
+
287
+ st.markdown("---")
288
+
289
+ st.markdown("### 🔄 Pipeline Flow")
290
+ pipeline_steps = [
291
+ ("📦", "Synthetic Data"),
292
+ ("🎓", "Fine-tuned Model"),
293
+ ("❓", "User Question"),
294
+ ("🔍", "RAG Retrieval"),
295
+ ("📝", "Prompt Engineering"),
296
+ ("🤖", "Model Inference"),
297
+ ("✨", "Gemini Enhancement"),
298
+ ("✅", "Final Output"),
299
+ ]
300
+
301
+ for i, (icon, title) in enumerate(pipeline_steps):
302
+ st.markdown(f'<div class="pipeline-box">{icon} <strong>{title}</strong></div>', unsafe_allow_html=True)
303
+ if i < len(pipeline_steps) - 1:
304
+ st.markdown('<p class="pipeline-arrow">↓</p>', unsafe_allow_html=True)
305
+
306
+ st.markdown("---")
307
+ st.markdown("### 📚 About")
308
+ st.markdown("**Course:** INFO7375")
309
+
310
+ # =============================================================================
311
+ # MAIN CONTENT
312
+ # =============================================================================
313
+
314
+ if "messages" not in st.session_state:
315
+ st.session_state.messages = []
316
+
317
+ if "results_history" not in st.session_state:
318
+ st.session_state.results_history = []
319
+
320
+ if "input_text" not in st.session_state:
321
+ st.session_state.input_text = ""
322
+
323
+ # =============================================================================
324
+ # EXAMPLE QUESTIONS
325
+ # =============================================================================
326
+
327
+ st.markdown("### 💡 Try an Example")
328
+
329
+ example_questions = [
330
+ ("👥 Employees", "Find all employees with salary above 50000"),
331
+ ("📊 Orders", "Count total orders by customer"),
332
+ ("🏆 Top Products", "Show top 5 products by revenue"),
333
+ ("📅 Recent", "List customers who placed orders in 2024"),
334
+ ("💰 Salary", "Calculate average salary by department"),
335
+ ]
336
+
337
+ cols = st.columns(5)
338
+ for i, (label, ex_question) in enumerate(example_questions):
339
+ with cols[i]:
340
+ if st.button(label, key=f"ex_{i}", use_container_width=True, help=ex_question):
341
+ st.session_state.input_text = ex_question
342
+
343
+ # =============================================================================
344
+ # INPUT AREA
345
+ # =============================================================================
346
+
347
+ st.markdown("### 🎤 Ask Your Question")
348
+
349
+ col1, col2 = st.columns([6, 1])
350
+
351
+ with col1:
352
+ question = st.text_input(
353
+ "Question",
354
+ placeholder="e.g., Find all employees with salary greater than 50000...",
355
+ label_visibility="collapsed",
356
+ key="input_text"
357
+ )
358
+
359
+ with col2:
360
+ submit_btn = st.button("🚀 Run", type="primary", use_container_width=True)
361
+
362
+ st.markdown("---")
363
+
364
+ # =============================================================================
365
+ # CHAT HISTORY
366
+ # =============================================================================
367
+
368
+ for i, message in enumerate(st.session_state.messages):
369
+ with st.chat_message(message["role"], avatar="🧑‍💻" if message["role"] == "user" else "🤖"):
370
+ st.markdown(message["content"])
371
+
372
+ if message["role"] == "assistant":
373
+ result_idx = i // 2
374
+ if result_idx < len(st.session_state.results_history):
375
+ result = st.session_state.results_history[result_idx]
376
+ if result and result.get('success'):
377
+ with st.expander("🔍 View Pipeline Details", expanded=False):
378
+ tab1, tab2, tab3, tab4 = st.tabs(["🔍 RAG", "📝 Prompt", "🤖 Fine-tuned", "✨ Gemini"])
379
+
380
+ with tab1:
381
+ examples = result['steps']['rag'].get('examples', [])
382
+ st.markdown(f"**Retrieved {len(examples)} examples**")
383
+ for j, ex in enumerate(examples, 1):
384
+ st.markdown(f"**Example {j}** | Score: `{ex.get('score', 0):.3f}`")
385
+ st.markdown(f"Q: {ex.get('question', 'N/A')}")
386
+ st.code(ex.get('sql', 'N/A'), language="sql")
387
+
388
+ with tab2:
389
+ st.markdown("**Constructed Prompt:**")
390
+ st.code(result['steps']['prompt'].get('prompt', 'N/A'), language="text")
391
+
392
+ with tab3:
393
+ st.markdown("**Fine-tuned Model Output:**")
394
+ st.code(result['steps']['finetuned'].get('sql', 'N/A'), language="sql")
395
+
396
+ with tab4:
397
+ if 'gemini_enhance' in result['steps']:
398
+ st.markdown("**Enhanced SQL:**")
399
+ st.code(result['steps']['gemini_enhance'].get('sql', 'N/A'), language="sql")
400
+
401
+ # =============================================================================
402
+ # PROCESS QUERY
403
+ # =============================================================================
404
+
405
+ if submit_btn and question:
406
+ st.session_state.messages.append({"role": "user", "content": question})
407
+
408
+ with st.chat_message("user", avatar="🧑‍💻"):
409
+ st.markdown(question)
410
+
411
+ with st.chat_message("assistant", avatar="🤖"):
412
+ with st.status("🔄 Processing your query...", expanded=True) as status:
413
+ st.write("🔍 Retrieving similar examples...")
414
+ st.write("📝 Building prompt...")
415
+ st.write("🤖 Generating SQL...")
416
+ st.write("✨ Enhancing with Gemini...")
417
+
418
+ result = run_pipeline(question=question, num_examples=num_examples)
419
+
420
+ status.update(label="✅ Complete!", state="complete", expanded=False)
421
+
422
+ st.session_state.results_history.append(result)
423
+
424
+ if result['success']:
425
+ st.markdown("### ✅ Generated SQL")
426
+ st.code(result['final_sql'], language="sql")
427
+
428
+ if 'gemini_enhance' in result['steps']:
429
+ original = result['steps']['finetuned'].get('sql', '')
430
+ enhanced = result['steps']['gemini_enhance'].get('sql', '')
431
+ if original != enhanced:
432
+ st.success("✨ Query optimized by Gemini!")
433
+ else:
434
+ st.info("✓ Query was already optimal")
435
+
436
+ if 'explanation' in result and result['explanation']:
437
+ if not result['explanation'].startswith("Explanation error"):
438
+ st.markdown("### 📖 Explanation")
439
+ st.info(result['explanation'])
440
+
441
+ with st.expander("🔍 View Pipeline Details", expanded=False):
442
+ tab1, tab2, tab3, tab4 = st.tabs(["🔍 RAG", "📝 Prompt", "🤖 Fine-tuned", "✨ Gemini"])
443
+
444
+ with tab1:
445
+ examples = result['steps']['rag'].get('examples', [])
446
+ st.markdown(f"**Retrieved {len(examples)} examples**")
447
+ for j, ex in enumerate(examples, 1):
448
+ st.markdown(f"**Example {j}** | Score: `{ex.get('score', 0):.3f}`")
449
+ st.markdown(f"Q: {ex.get('question', 'N/A')}")
450
+ st.code(ex.get('sql', 'N/A'), language="sql")
451
+
452
+ with tab2:
453
+ st.markdown("**Constructed Prompt:**")
454
+ st.code(result['steps']['prompt'].get('prompt', 'N/A'), language="text")
455
+
456
+ with tab3:
457
+ st.markdown("**Fine-tuned Model Output:**")
458
+ st.code(result['steps']['finetuned'].get('sql', 'N/A'), language="sql")
459
+
460
+ with tab4:
461
+ if 'gemini_enhance' in result['steps']:
462
+ st.markdown("**Enhanced SQL:**")
463
+ st.code(result['steps']['gemini_enhance'].get('sql', 'N/A'), language="sql")
464
+
465
+ response_text = f"**Generated SQL:**\n```sql\n{result['final_sql']}\n```"
466
+ if 'explanation' in result and not result['explanation'].startswith("Explanation error"):
467
+ response_text += f"\n\n**Explanation:** {result['explanation']}"
468
+
469
+ st.session_state.messages.append({"role": "assistant", "content": response_text})
470
+
471
+ else:
472
+ st.error("❌ Failed to generate SQL. Please try again.")
473
+ st.session_state.messages.append({"role": "assistant", "content": "❌ Failed to generate SQL."})
474
+
475
+ elif submit_btn and not question:
476
+ st.warning("⚠️ Please enter a question first!")
477
+
478
+ # =============================================================================
479
+ # FOOTER
480
+ # =============================================================================
481
+
482
+ st.markdown("---")
483
+
484
+ col1, col2, col3 = st.columns([1, 2, 1])
485
+
486
+ with col1:
487
+ if st.button("🗑️ Clear Chat", use_container_width=True):
488
+ st.session_state.messages = []
489
+ st.session_state.results_history = []
490
+ st.session_state.input_text = ""
491
+ st.rerun()
492
+
493
+ with col2:
494
+ st.markdown('<p style="text-align: center; color: #64748b;">Built with ❤️ using Streamlit • LangChain • Gemini</p>', unsafe_allow_html=True)
495
+
496
+ with col3:
497
+ st.markdown('<p style="text-align: right; color: #64748b;"><strong>INFO7375</strong></p>', unsafe_allow_html=True)
src/config.py ADDED
@@ -0,0 +1,98 @@
1
+ """
2
+ Central Configuration for SQL Learning Assistant
3
+ Handles local vs HuggingFace paths automatically
4
+ """
5
+
6
+ import os
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+
11
+ # =============================================================================
12
+ # HUGGINGFACE CONFIGURATION (for cloud deployment)
13
+ # Set these in .env or Streamlit secrets
14
+ # =============================================================================
15
+
16
+ HF_MODEL_ID = os.getenv("HF_MODEL_ID", None) # e.g., "username/sql-tinyllama-lora"
17
+ HF_CHROMADB_ID = os.getenv("HF_CHROMADB_ID", None) # e.g., "username/sql-chromadb"
18
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
19
+
20
+ # =============================================================================
21
+ # LOCAL PATHS
22
+ # =============================================================================
23
+
24
+ LOCAL_MODEL_DIR = "outputs/finetuning/checkpoints/final"
25
+ LOCAL_CHROMADB_DIR = "chromadb_data"
26
+ LOCAL_DATA_DIR = "data"
27
+
28
+ # =============================================================================
29
+ # GEMINI CONFIGURATION
30
+ # =============================================================================
31
+
32
+ GEMINI_KEYS = [
33
+ os.getenv("GEMINI_API_KEY"),
34
+ os.getenv("GEMINI_API_KEY_FALLBACK_1"),
35
+ os.getenv("GEMINI_API_KEY_FALLBACK_2"),
36
+ ]
37
+ GEMINI_KEYS = [k for k in GEMINI_KEYS if k] # Remove None values
38
+
39
+ GEMINI_MODELS = [
40
+ os.getenv("GEMINI_MODEL", "gemini-2.5-flash"),
41
+ os.getenv("GEMINI_MODEL_FALLBACK_1"),
42
+ ]
43
+ GEMINI_MODELS = [m for m in GEMINI_MODELS if m] # Remove None values
44
+
45
+ # =============================================================================
46
+ # RAG CONFIGURATION
47
+ # =============================================================================
48
+
49
+ EMBEDDING_MODEL = "all-MiniLM-L6-v2"
50
+ COLLECTION_NAME = "sql_knowledge"
51
+
52
+ # =============================================================================
53
+ # HELPER FUNCTIONS
54
+ # =============================================================================
55
+
56
+ def is_local():
57
+ """Check if running locally (has local model/data)."""
58
+ return os.path.exists(LOCAL_MODEL_DIR) and os.path.exists(LOCAL_CHROMADB_DIR)
59
+
60
+ def is_cloud():
61
+ """Check if running in cloud (has HF config)."""
62
+ return HF_MODEL_ID is not None or HF_CHROMADB_ID is not None
63
+
64
+ def get_model_source():
65
+ """Get where model will be loaded from."""
66
+ if os.path.exists(LOCAL_MODEL_DIR) and os.listdir(LOCAL_MODEL_DIR):
67
+ return "local", LOCAL_MODEL_DIR
68
+ elif HF_MODEL_ID:
69
+ return "huggingface", HF_MODEL_ID
70
+ else:
71
+ return "base", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
72
+
73
+ def get_chromadb_source():
74
+ """Get where ChromaDB will be loaded from."""
75
+ if os.path.exists(LOCAL_CHROMADB_DIR) and os.listdir(LOCAL_CHROMADB_DIR):
76
+ return "local", LOCAL_CHROMADB_DIR
77
+ elif HF_CHROMADB_ID:
78
+ return "huggingface", HF_CHROMADB_ID
79
+ else:
80
+ return "build", LOCAL_DATA_DIR
81
+
82
+ def print_config():
83
+ """Print current configuration."""
84
+ print("=" * 50)
85
+ print("CONFIGURATION")
86
+ print("=" * 50)
87
+
88
+ model_src, model_path = get_model_source()
89
+ chromadb_src, chromadb_path = get_chromadb_source()
90
+
91
+ print(f"Model: {model_src} → {model_path}")
92
+ print(f"ChromaDB: {chromadb_src} → {chromadb_path}")
93
+ print(f"Gemini Keys: {len(GEMINI_KEYS)} available")
94
+ print(f"Gemini Models: {GEMINI_MODELS}")
95
+ print("=" * 50)
96
+
97
+ if __name__ == "__main__":
98
+ print_config()
src/finetuning/__init__.py ADDED
File without changes
src/finetuning/evaluate.py ADDED
@@ -0,0 +1,293 @@
1
+ """
2
+ Evaluation Module for Fine-Tuned SQL Model
3
+ """
4
+
5
+ import os
6
+ import json
7
+ import matplotlib.pyplot as plt
8
+ from datetime import datetime
9
+ from collections import Counter
10
+
11
+ # =============================================================================
12
+ # CONFIGURATION
13
+ # =============================================================================
14
+
15
+ OUTPUT_DIR = "outputs/finetuning"
16
+ RESULTS_DIR = f"{OUTPUT_DIR}/results"
17
+ VIZ_DIR = f"{OUTPUT_DIR}/visualizations"
18
+
19
+ # Number of samples to evaluate
20
+ NUM_EVAL_SAMPLES = 50 # Change for more/less evaluation
21
+
22
+ def setup_directories():
23
+ for d in [RESULTS_DIR, VIZ_DIR]:
24
+ os.makedirs(d, exist_ok=True)
25
+
26
+ # =============================================================================
27
+ # EVALUATION METRICS
28
+ # =============================================================================
29
+
30
+ def exact_match(pred, expected):
31
+ """Check exact match."""
32
+ return pred.lower().strip() == expected.lower().strip()
33
+
34
+ def token_accuracy(pred, expected):
35
+ """Token overlap accuracy."""
36
+ pred_tokens = set(pred.lower().split())
37
+ exp_tokens = set(expected.lower().split())
38
+ if not exp_tokens:
39
+ return 0.0
40
+ return len(pred_tokens & exp_tokens) / len(exp_tokens)
41
+
42
+ def keyword_accuracy(pred, expected):
43
+ """SQL keyword match accuracy."""
44
+ keywords = ['SELECT', 'FROM', 'WHERE', 'JOIN', 'GROUP BY',
45
+ 'ORDER BY', 'COUNT', 'SUM', 'AVG', 'MAX', 'MIN']
46
+
47
+ pred_kw = [k for k in keywords if k in pred.upper()]
48
+ exp_kw = [k for k in keywords if k in expected.upper()]
49
+
50
+ if not exp_kw:
51
+ return 1.0 if not pred_kw else 0.0
52
+
53
+ matches = sum(1 for k in exp_kw if k in pred_kw)
54
+ return matches / len(exp_kw)
55
+
56
+ def structure_similarity(pred, expected):
57
+ """SQL structure similarity."""
58
+ clauses = ['SELECT', 'FROM', 'WHERE', 'JOIN', 'GROUP BY', 'ORDER BY', 'LIMIT']
59
+
60
+ pred_struct = set(c for c in clauses if c in pred.upper())
61
+ exp_struct = set(c for c in clauses if c in expected.upper())
62
+
63
+ if not exp_struct and not pred_struct:
64
+ return 1.0
65
+ if not exp_struct or not pred_struct:
66
+ return 0.0
67
+
68
+ return len(pred_struct & exp_struct) / len(pred_struct | exp_struct)
69
+
70
+ # =============================================================================
71
+ # EVALUATION RUNNER
72
+ # =============================================================================
73
+
74
+ def evaluate_predictions(predictions, ground_truth):
75
+ """Calculate all metrics."""
76
+
77
+ results = {
78
+ 'exact_match': [],
79
+ 'token_accuracy': [],
80
+ 'keyword_accuracy': [],
81
+ 'structure_similarity': []
82
+ }
83
+
84
+ for pred, exp in zip(predictions, ground_truth):
85
+ results['exact_match'].append(1 if exact_match(pred, exp) else 0)
86
+ results['token_accuracy'].append(token_accuracy(pred, exp))
87
+ results['keyword_accuracy'].append(keyword_accuracy(pred, exp))
88
+ results['structure_similarity'].append(structure_similarity(pred, exp))
89
+
90
+ # Calculate averages
91
+ metrics = {
92
+ 'total_samples': len(predictions),
93
+ 'exact_match_rate': sum(results['exact_match']) / len(results['exact_match']),
94
+ 'avg_token_accuracy': sum(results['token_accuracy']) / len(results['token_accuracy']),
95
+ 'avg_keyword_accuracy': sum(results['keyword_accuracy']) / len(results['keyword_accuracy']),
96
+ 'avg_structure_similarity': sum(results['structure_similarity']) / len(results['structure_similarity']),
97
+ 'detailed': results
98
+ }
99
+
100
+ return metrics
101
+
102
+ # =============================================================================
103
+ # VISUALIZATIONS
104
+ # =============================================================================
105
+
106
+ def create_visualizations(metrics):
107
+ """Create evaluation charts."""
108
+
109
+ setup_directories()
110
+ plt.style.use('seaborn-v0_8-whitegrid')
111
+
112
+ # 1. Metrics Overview
113
+ fig, ax = plt.subplots(figsize=(10, 6))
114
+
115
+ names = ['Exact Match', 'Token Acc', 'Keyword Acc', 'Structure Sim']
116
+ values = [
117
+ metrics['exact_match_rate'] * 100,
118
+ metrics['avg_token_accuracy'] * 100,
119
+ metrics['avg_keyword_accuracy'] * 100,
120
+ metrics['avg_structure_similarity'] * 100
121
+ ]
122
+ colors = ['#3498db', '#2ecc71', '#9b59b6', '#e74c3c']
123
+
124
+ bars = ax.bar(names, values, color=colors, edgecolor='black')
125
+ for bar, val in zip(bars, values):
126
+ ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
127
+ f'{val:.1f}%', ha='center', fontweight='bold')
128
+
129
+ ax.set_ylabel('Score (%)')
130
+ ax.set_title('Model Evaluation Metrics', fontsize=14, fontweight='bold')
131
+ ax.set_ylim(0, 110)
132
+
133
+ plt.tight_layout()
134
+ plt.savefig(f'{VIZ_DIR}/01_metrics_overview.png', dpi=150)
135
+ plt.close()
136
+ print(f" Saved: {VIZ_DIR}/01_metrics_overview.png")
137
+
138
+ # 2. Token Accuracy Distribution
139
+ fig, ax = plt.subplots(figsize=(10, 6))
140
+
141
+ token_acc = metrics['detailed']['token_accuracy']
142
+ ax.hist(token_acc, bins=20, color='#2ecc71', edgecolor='black', alpha=0.7)
143
+ ax.axvline(sum(token_acc)/len(token_acc), color='red', linestyle='--',
144
+ label=f"Mean: {sum(token_acc)/len(token_acc):.2f}")
145
+ ax.set_xlabel('Token Accuracy')
146
+ ax.set_ylabel('Frequency')
147
+ ax.set_title('Token Accuracy Distribution', fontsize=14, fontweight='bold')
148
+ ax.legend()
149
+
150
+ plt.tight_layout()
151
+ plt.savefig(f'{VIZ_DIR}/02_token_accuracy_dist.png', dpi=150)
152
+ plt.close()
153
+ print(f" Saved: {VIZ_DIR}/02_token_accuracy_dist.png")
154
+
155
+ # 3. Keyword Accuracy Distribution
156
+ fig, ax = plt.subplots(figsize=(10, 6))
157
+
158
+ kw_acc = metrics['detailed']['keyword_accuracy']
159
+ ax.hist(kw_acc, bins=20, color='#9b59b6', edgecolor='black', alpha=0.7)
160
+ ax.axvline(sum(kw_acc)/len(kw_acc), color='red', linestyle='--',
161
+ label=f"Mean: {sum(kw_acc)/len(kw_acc):.2f}")
162
+ ax.set_xlabel('Keyword Accuracy')
163
+ ax.set_ylabel('Frequency')
164
+ ax.set_title('Keyword Accuracy Distribution', fontsize=14, fontweight='bold')
165
+ ax.legend()
166
+
167
+ plt.tight_layout()
168
+ plt.savefig(f'{VIZ_DIR}/03_keyword_accuracy_dist.png', dpi=150)
169
+ plt.close()
170
+ print(f" Saved: {VIZ_DIR}/03_keyword_accuracy_dist.png")
171
+
172
+ # =============================================================================
173
+ # REPORT GENERATION
174
+ # =============================================================================
175
+
176
+ def generate_report(metrics):
177
+ """Generate evaluation report."""
178
+
179
+ report = f"""# Fine-Tuning Evaluation Report
180
+
181
+ **Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
182
+
183
+ ## Metrics Summary
184
+
185
+ | Metric | Score |
186
+ |--------|-------|
187
+ | Samples Evaluated | {metrics['total_samples']} |
188
+ | Exact Match Rate | {metrics['exact_match_rate']*100:.2f}% |
189
+ | Token Accuracy | {metrics['avg_token_accuracy']*100:.2f}% |
190
+ | Keyword Accuracy | {metrics['avg_keyword_accuracy']*100:.2f}% |
191
+ | Structure Similarity | {metrics['avg_structure_similarity']*100:.2f}% |
192
+
193
+ ## Metrics Explanation
194
+
195
+ - **Exact Match**: Predictions identical to ground truth
196
+ - **Token Accuracy**: Word overlap between prediction and expected
197
+ - **Keyword Accuracy**: SQL keywords (SELECT, WHERE, etc.) match
198
+ - **Structure Similarity**: Query structure (clauses used) match
199
+
200
+ ## Visualizations
201
+
202
+ - `01_metrics_overview.png` - All metrics bar chart
203
+ - `02_token_accuracy_dist.png` - Token accuracy histogram
204
+ - `03_keyword_accuracy_dist.png` - Keyword accuracy histogram
205
+ """
206
+
207
+ with open(f'{RESULTS_DIR}/evaluation_report.md', 'w') as f:
208
+ f.write(report)
209
+ print(f" Saved: {RESULTS_DIR}/evaluation_report.md")
210
+
211
+ # Save JSON
212
+ json_metrics = {k: v for k, v in metrics.items() if k != 'detailed'}
213
+ with open(f'{RESULTS_DIR}/evaluation_results.json', 'w') as f:
214
+ json.dump(json_metrics, f, indent=2)
215
+ print(f" Saved: {RESULTS_DIR}/evaluation_results.json")
216
+
217
+ # =============================================================================
218
+ # MAIN EVALUATION
219
+ # =============================================================================
220
+
221
+ def run_evaluation():
222
+ """Run full evaluation."""
223
+
224
+ print("=" * 60)
225
+ print("EVALUATING FINE-TUNED MODEL")
226
+ print("=" * 60)
227
+
228
+ setup_directories()
229
+
230
+ # Load test data
231
+ print("\n[1/4] Loading test data...")
232
+ test_file = f"{OUTPUT_DIR}/test.jsonl"
233
+
234
+ if not os.path.exists(test_file):
235
+ print("ERROR: Run prepare_data.py first!")
236
+ return None
237
+
238
+ test_data = []
239
+ with open(test_file) as f:
240
+ for line in f:
241
+ test_data.append(json.loads(line))
242
+
243
+ test_data = test_data[:NUM_EVAL_SAMPLES]
244
+ print(f" Loaded {len(test_data)} samples")
245
+
246
+ # Generate predictions
247
+ print("\n[2/4] Generating predictions...")
248
+
249
+ try:
250
+ import sys
251
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
252
+ from finetuning.inference import SQLGenerator
253
+ generator = SQLGenerator()
254
+
255
+ predictions = []
256
+ ground_truth = []
257
+
258
+ for i, item in enumerate(test_data):
259
+ pred = generator.generate(item['question'])
260
+ predictions.append(pred)
261
+ ground_truth.append(item['sql'])
262
+
263
+ if (i + 1) % 10 == 0:
264
+ print(f" Progress: {i+1}/{len(test_data)}")
265
+
266
+ except Exception as e:
267
+ print(f" Error loading model: {e}")
268
+ print(" Using ground truth as predictions (for testing metrics)")
269
+ predictions = [item['sql'] for item in test_data]
270
+ ground_truth = [item['sql'] for item in test_data]
271
+
272
+ # Calculate metrics
273
+ print("\n[3/4] Calculating metrics...")
274
+ metrics = evaluate_predictions(predictions, ground_truth)
275
+
276
+ print(f" Exact Match: {metrics['exact_match_rate']*100:.2f}%")
277
+ print(f" Token Accuracy: {metrics['avg_token_accuracy']*100:.2f}%")
278
+ print(f" Keyword Accuracy: {metrics['avg_keyword_accuracy']*100:.2f}%")
279
+ print(f" Structure Sim: {metrics['avg_structure_similarity']*100:.2f}%")
280
+
281
+ # Generate outputs
282
+ print("\n[4/4] Generating outputs...")
283
+ create_visualizations(metrics)
284
+ generate_report(metrics)
285
+
286
+ print("\n" + "=" * 60)
287
+ print("EVALUATION COMPLETE")
288
+ print("=" * 60)
289
+
290
+ return metrics
291
+
292
+ if __name__ == "__main__":
293
+ run_evaluation()
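As a quick sanity check of the metric functions defined in `evaluate.py` above (run from the `src/` directory), a partially correct prediction scores like this:

```python
from finetuning.evaluate import (
    exact_match, token_accuracy, keyword_accuracy, structure_similarity
)

pred = "SELECT name FROM employees"
expected = "SELECT name FROM employees WHERE salary > 50000"

print(exact_match(pred, expected))           # False
print(token_accuracy(pred, expected))        # 0.5   -> 4 of 8 expected tokens overlap
print(keyword_accuracy(pred, expected))      # ~0.67 -> SELECT, FROM out of SELECT, FROM, WHERE
print(structure_similarity(pred, expected))  # ~0.67 -> Jaccard of {SELECT, FROM} vs {SELECT, FROM, WHERE}
```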
src/finetuning/inference.py ADDED
@@ -0,0 +1,168 @@
1
+ """
2
+ Inference Module for Fine-Tuned SQL Model
3
+ Loads from: Local checkpoint OR Hugging Face Hub
4
+ """
5
+
6
+ import os
7
+ import torch
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+ # =============================================================================
14
+ # CONFIGURATION
15
+ # =============================================================================
16
+
17
+ # Hugging Face Model ID (set in .env or Streamlit secrets)
18
+ HF_MODEL_ID = os.getenv("HF_MODEL_ID", None)
19
+
20
+ # Local paths
21
+ LOCAL_MODEL_DIR = "outputs/finetuning/checkpoints/final"
22
+ BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
23
+
24
+ # =============================================================================
25
+ # SQL GENERATOR CLASS
26
+ # =============================================================================
27
+
28
+ class SQLGenerator:
29
+ """SQL Generation using fine-tuned model."""
30
+
31
+ def __init__(self):
32
+ """Load the fine-tuned model from local or HuggingFace."""
33
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
34
+ print(f"Device: {self.device}")
35
+
36
+ load_path = self._get_model_path()
37
+
38
+ # Load tokenizer and model with memory optimization
39
+ print(f"Loading model from: {load_path}")
40
+ self.tokenizer = AutoTokenizer.from_pretrained(load_path)
41
+
42
+ # Memory-efficient loading for cloud deployment
43
+ self.model = AutoModelForCausalLM.from_pretrained(
44
+ load_path,
45
+ torch_dtype=torch.float32, # Use float32 for CPU
46
+ device_map=None, # Don't use device_map on CPU
47
+ low_cpu_mem_usage=True, # Reduce memory during loading
48
+ trust_remote_code=True
49
+ )
50
+
51
+ # Move to device after loading
52
+ self.model = self.model.to(self.device)
53
+
54
+ self.tokenizer.pad_token = self.tokenizer.eos_token
55
+ print("✓ Model loaded!")
56
+
57
+ def _get_model_path(self):
58
+ """Determine where to load model from."""
59
+
60
+ # Check for required model files (not just folder existence)
61
+ required_files = ['config.json', 'tokenizer.json', 'tokenizer_config.json']
62
+
63
+ # Priority 1: Local checkpoint with actual model files
64
+ if os.path.exists(LOCAL_MODEL_DIR):
65
+ local_files = os.listdir(LOCAL_MODEL_DIR) if os.path.isdir(LOCAL_MODEL_DIR) else []
66
+ has_model_files = any(f in local_files for f in required_files) or any(f.endswith('.safetensors') or f.endswith('.bin') for f in local_files)
67
+
68
+ if has_model_files:
69
+ print(f"📁 Found local model checkpoint: {LOCAL_MODEL_DIR}")
70
+ return LOCAL_MODEL_DIR
71
+ else:
72
+ print(f"⚠️ Local folder exists but no model files found")
73
+
74
+ # Priority 2: Download from HuggingFace Hub
75
+ if HF_MODEL_ID:
76
+ print(f"☁️ Downloading model from HuggingFace: {HF_MODEL_ID}")
77
+ return HF_MODEL_ID
78
+
79
+ # Priority 3: Base model fallback
80
+ print("⚠️ No fine-tuned model found, using base model")
81
+ return BASE_MODEL
82
+
83
+ def generate(self, question, context="", max_tokens=128):
84
+ """Generate SQL from question."""
85
+
86
+ # Build prompt
87
+ if context:
88
+ prompt = f"""{context}
89
+
90
+ ### Question:
91
+ {question}
92
+
93
+ ### SQL:"""
94
+ else:
95
+ prompt = f"""### Question:
96
+ {question}
97
+
98
+ ### SQL:"""
99
+
100
+ # Tokenize
101
+ inputs = self.tokenizer(
102
+ prompt,
103
+ return_tensors="pt",
104
+ truncation=True,
105
+ max_length=512
106
+ ).to(self.device)
107
+
108
+ # Generate
109
+ with torch.no_grad():
110
+ outputs = self.model.generate(
111
+ **inputs,
112
+ max_new_tokens=max_tokens,
113
+ temperature=0.1,
114
+ do_sample=True,
115
+ top_p=0.95,
116
+ pad_token_id=self.tokenizer.eos_token_id
117
+ )
118
+
119
+ # Decode
120
+ generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
121
+
122
+ # Extract SQL
123
+ sql = generated[len(prompt):].strip()
124
+ if "###" in sql:
125
+ sql = sql.split("###")[0].strip()
126
+
127
+ return sql
128
+
129
+ # =============================================================================
130
+ # STANDALONE FUNCTION
131
+ # =============================================================================
132
+
133
+ _generator = None
134
+
135
+ def generate_sql(question, context=""):
136
+ """Standalone SQL generation."""
137
+ global _generator
138
+ if _generator is None:
139
+ _generator = SQLGenerator()
140
+ return _generator.generate(question, context)
141
+
142
+ # =============================================================================
143
+ # TEST
144
+ # =============================================================================
145
+
146
+ def test_inference():
147
+ """Test the model."""
148
+ print("=" * 60)
149
+ print("TESTING SQL GENERATION")
150
+ print("=" * 60)
151
+
152
+ generator = SQLGenerator()
153
+
154
+ questions = [
155
+ "Find all employees with salary greater than 50000",
156
+ ]
157
+
158
+ print("\n" + "-" * 60)
159
+ for q in questions:
160
+ print(f"Q: {q}")
161
+ sql = generator.generate(q)
162
+ print(f"SQL: {sql}")
163
+ print("-" * 60)
164
+
165
+ print("\n✓ Test complete")
166
+
167
+ if __name__ == "__main__":
168
+ test_inference()
src/finetuning/prepare_data.py ADDED
@@ -0,0 +1,149 @@
1
+ """
2
+ Data Preparation for Fine-Tuning
3
+ Uses train.csv, validation.csv, test.csv correctly.
4
+ """
5
+
6
+ import os
7
+ import pandas as pd
8
+ import json
9
+ from datetime import datetime
10
+
11
+ # =============================================================================
12
+ # CONFIGURATION
13
+ # =============================================================================
14
+
15
+ OUTPUT_DIR = "outputs/finetuning"
16
+ DATA_DIR = "data"
17
+
18
+ # Change this for testing vs full training
19
+ MAX_SAMPLES = 100 # Set to None for full data
20
+
21
+ def setup_directories():
22
+ for d in [OUTPUT_DIR, f"{OUTPUT_DIR}/results", f"{OUTPUT_DIR}/logs"]:
23
+ os.makedirs(d, exist_ok=True)
24
+
25
+ # =============================================================================
26
+ # PROMPT TEMPLATE
27
+ # =============================================================================
28
+
29
+ def format_for_training(question, sql):
30
+ """Format single example for instruction fine-tuning."""
31
+ text = f"""### Question:
32
+ {question}
33
+
34
+ ### SQL:
35
+ {sql}"""
36
+ return text
37
+
38
+ # =============================================================================
39
+ # DATA LOADING
40
+ # =============================================================================
41
+
42
+ def load_csv_file(filepath, max_samples=None):
43
+ """Load a single CSV file."""
44
+ if not os.path.exists(filepath):
45
+ print(f" File not found: {filepath}")
46
+ return None
47
+
48
+ df = pd.read_csv(filepath)
49
+
50
+ if max_samples and len(df) > max_samples:
51
+ df = df.sample(n=max_samples, random_state=42)
52
+
53
+ return df
54
+
55
+ def format_dataframe(df, source_name):
56
+ """Convert dataframe to training format."""
57
+ formatted = []
58
+ for _, row in df.iterrows():
59
+ formatted.append({
60
+ 'text': format_for_training(row['question'], row['sql']),
61
+ 'question': str(row['question']),
62
+ 'sql': str(row['sql']),
63
+ 'source': source_name
64
+ })
65
+ return formatted
66
+
67
+ def save_jsonl(data, filepath):
68
+ """Save data as JSONL file."""
69
+ with open(filepath, 'w', encoding='utf-8') as f:
70
+ for item in data:
71
+ f.write(json.dumps(item) + '\n')
72
+ print(f" Saved: {filepath}")
73
+
74
+ # =============================================================================
75
+ # MAIN FUNCTION
76
+ # =============================================================================
77
+
78
+ def prepare_finetuning_data():
79
+ """Prepare data for fine-tuning."""
80
+
81
+ print("=" * 50)
82
+ print("PREPARING FINE-TUNING DATA")
83
+ print(f"Max samples per file: {MAX_SAMPLES if MAX_SAMPLES else 'ALL'}")
84
+ print("=" * 50)
85
+
86
+ setup_directories()
87
+
88
+ # Load train data
89
+ print("\n[1/5] Loading training data...")
90
+ train_df = load_csv_file(f"{DATA_DIR}/train.csv", MAX_SAMPLES)
91
+ print(f" train.csv: {len(train_df):,} rows")
92
+
93
+ # Load synthetic and combine with train
94
+ # synthetic_df = load_csv_file(f"{DATA_DIR}/synthetic.csv", MAX_SAMPLES)
95
+ # if synthetic_df is not None:
96
+ # print(f" synthetic.csv: {len(synthetic_df):,} rows")
97
+ # train_df = pd.concat([train_df, synthetic_df], ignore_index=True)
98
+ # print(f" Combined training: {len(train_df):,} rows")
99
+
100
+ # Load validation data
101
+ print("\n[2/5] Loading validation data...")
102
+ val_df = load_csv_file(f"{DATA_DIR}/validation.csv", MAX_SAMPLES)
103
+ print(f" validation.csv: {len(val_df):,} rows")
104
+
105
+ # Load test data
106
+ print("\n[3/5] Loading test data...")
107
+ test_df = load_csv_file(f"{DATA_DIR}/test.csv", MAX_SAMPLES)
108
+ print(f" test.csv: {len(test_df):,} rows")
109
+
110
+ # Format data
111
+ print("\n[4/5] Formatting data...")
112
+ train_data = format_dataframe(train_df, 'train')
113
+ val_data = format_dataframe(val_df, 'validation')
114
+ test_data = format_dataframe(test_df, 'test')
115
+
116
+ # Save files
117
+ print("\n[5/5] Saving files...")
118
+ save_jsonl(train_data, f"{OUTPUT_DIR}/train.jsonl")
119
+ save_jsonl(val_data, f"{OUTPUT_DIR}/val.jsonl")
120
+ save_jsonl(test_data, f"{OUTPUT_DIR}/test.jsonl")
121
+
122
+ # Save stats
123
+ stats = {
124
+ 'train_samples': len(train_data),
125
+ 'val_samples': len(val_data),
126
+ 'test_samples': len(test_data),
127
+ 'max_samples': MAX_SAMPLES,
128
+ 'created_at': datetime.now().isoformat()
129
+ }
130
+
131
+ with open(f"{OUTPUT_DIR}/data_stats.json", 'w') as f:
132
+ json.dump(stats, f, indent=2)
133
+
134
+ # Summary
135
+ print("\n" + "=" * 50)
136
+ print("COMPLETE")
137
+ print("=" * 50)
138
+ print(f" Train: {len(train_data):,}")
139
+ print(f" Val: {len(val_data):,}")
140
+ print(f" Test: {len(test_data):,}")
141
+
142
+ return stats
143
+
144
+ # =============================================================================
145
+ # ENTRY POINT
146
+ # =============================================================================
147
+
148
+ if __name__ == "__main__":
149
+ prepare_finetuning_data()
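
A quick way to sanity-check the JSONL that `prepare_data.py` writes is to read back one record and confirm it matches the `### Question: / ### SQL:` template consumed downstream. This is a minimal sketch, assuming `prepare_finetuning_data()` has already produced `outputs/finetuning/train.jsonl`; the field names follow `format_dataframe` above.

```python
# Minimal sketch: inspect one record produced by prepare_data.py.
# Assumes outputs/finetuning/train.jsonl already exists.
import json

with open("outputs/finetuning/train.jsonl", encoding="utf-8") as f:
    record = json.loads(f.readline())

print(record["question"])  # natural-language question
print(record["sql"])       # target SQL query
print(record["text"])      # combined "### Question: / ### SQL:" prompt used for training
```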
src/finetuning/train.py ADDED
@@ -0,0 +1,218 @@
1
+ """
2
+ Fine-Tuning Script for SQL Generation Model
3
+ Uses LoRA for efficient fine-tuning.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import torch
9
+ from datetime import datetime
10
+ from datasets import load_dataset
11
+ from transformers import (
12
+ AutoModelForCausalLM,
13
+ AutoTokenizer,
14
+ TrainingArguments,
15
+ Trainer,
16
+ DataCollatorForLanguageModeling
17
+ )
18
+ from peft import LoraConfig, get_peft_model, TaskType
19
+
20
+ # =============================================================================
21
+ # CONFIGURATION
22
+ # =============================================================================
23
+
24
+ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
25
+ OUTPUT_DIR = "outputs/finetuning"
26
+ CHECKPOINT_DIR = f"{OUTPUT_DIR}/checkpoints"
27
+ LOGS_DIR = f"{OUTPUT_DIR}/logs"
28
+
29
+ # Training config (optimized for RTX 4070)
30
+ TRAINING_CONFIG = {
31
+ 'num_epochs': 3,
32
+ 'batch_size': 8,
33
+ 'learning_rate': 2e-4,
34
+ 'max_length': 256,
35
+ 'warmup_steps': 100,
36
+ 'logging_steps': 50,
37
+ 'save_steps': 500,
38
+ 'gradient_accumulation_steps': 2,
39
+ }
40
+
41
+ # LoRA config
42
+ LORA_CONFIG = {
43
+ 'r': 16,
44
+ 'lora_alpha': 32,
45
+ 'lora_dropout': 0.1,
46
+ 'target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj']
47
+ }
48
+
49
+ def setup_directories():
50
+ for d in [OUTPUT_DIR, CHECKPOINT_DIR, LOGS_DIR]:
51
+ os.makedirs(d, exist_ok=True)
52
+
53
+ # =============================================================================
54
+ # TRAINING FUNCTIONS
55
+ # =============================================================================
56
+
57
+ def load_data():
58
+ """Load prepared training data."""
59
+ train_file = f"{OUTPUT_DIR}/train.jsonl"
60
+ val_file = f"{OUTPUT_DIR}/val.jsonl"
61
+
62
+ if not os.path.exists(train_file):
63
+ raise FileNotFoundError("Run prepare_data.py first!")
64
+
65
+ return load_dataset('json', data_files={
66
+ 'train': train_file,
67
+ 'validation': val_file
68
+ })
69
+
70
+ def setup_model():
71
+ """Load model and tokenizer with LoRA."""
72
+ print(f"Loading: {MODEL_NAME}")
73
+
74
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
75
+ tokenizer.pad_token = tokenizer.eos_token
76
+ tokenizer.padding_side = "right"
77
+
78
+ model = AutoModelForCausalLM.from_pretrained(
79
+ MODEL_NAME,
80
+ torch_dtype=torch.float16,
81
+ device_map="auto"
82
+ )
83
+
84
+ lora_config = LoraConfig(
85
+ task_type=TaskType.CAUSAL_LM,
86
+ r=LORA_CONFIG['r'],
87
+ lora_alpha=LORA_CONFIG['lora_alpha'],
88
+ lora_dropout=LORA_CONFIG['lora_dropout'],
89
+ target_modules=LORA_CONFIG['target_modules']
90
+ )
91
+
92
+ model = get_peft_model(model, lora_config)
93
+ model.print_trainable_parameters()
94
+
95
+ return model, tokenizer
96
+
97
+ def tokenize(examples, tokenizer):
98
+ """Tokenize examples."""
99
+ return tokenizer(
100
+ examples['text'],
101
+ truncation=True,
102
+ padding='max_length',
103
+ max_length=TRAINING_CONFIG['max_length']
104
+ )
105
+
106
+ def train(model, tokenizer, dataset):
107
+ """Train the model."""
108
+
109
+ # Tokenize
110
+ print("Tokenizing...")
111
+ tokenized_train = dataset['train'].map(
112
+ lambda x: tokenize(x, tokenizer),
113
+ batched=True,
114
+ remove_columns=dataset['train'].column_names
115
+ )
116
+ tokenized_val = dataset['validation'].map(
117
+ lambda x: tokenize(x, tokenizer),
118
+ batched=True,
119
+ remove_columns=dataset['validation'].column_names
120
+ )
121
+
122
+ # Training args
123
+ training_args = TrainingArguments(
124
+ output_dir=CHECKPOINT_DIR,
125
+ num_train_epochs=TRAINING_CONFIG['num_epochs'],
126
+ per_device_train_batch_size=TRAINING_CONFIG['batch_size'],
127
+ per_device_eval_batch_size=TRAINING_CONFIG['batch_size'],
128
+ learning_rate=TRAINING_CONFIG['learning_rate'],
129
+ warmup_steps=TRAINING_CONFIG['warmup_steps'],
130
+ logging_steps=TRAINING_CONFIG['logging_steps'],
131
+ save_steps=TRAINING_CONFIG['save_steps'],
132
+ gradient_accumulation_steps=TRAINING_CONFIG['gradient_accumulation_steps'],
133
+ eval_strategy="steps",
134
+ eval_steps=TRAINING_CONFIG['save_steps'],
135
+ save_total_limit=2,
136
+ fp16=True,
137
+ report_to="none",
138
+ logging_dir=LOGS_DIR,
139
+ dataloader_pin_memory=False,
140
+ )
141
+
142
+ # Trainer
143
+ trainer = Trainer(
144
+ model=model,
145
+ args=training_args,
146
+ train_dataset=tokenized_train,
147
+ eval_dataset=tokenized_val,
148
+ data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
149
+ )
150
+
151
+ # Train
152
+ print(f"\nTraining: {len(tokenized_train)} samples, {TRAINING_CONFIG['num_epochs']} epochs")
153
+ result = trainer.train()
154
+
155
+ # Save
156
+ print("\nSaving model...")
157
+ trainer.save_model(f"{CHECKPOINT_DIR}/final")
158
+ tokenizer.save_pretrained(f"{CHECKPOINT_DIR}/final")
159
+
160
+ # Stats
161
+ stats = {
162
+ 'train_loss': result.training_loss,
163
+ 'runtime_seconds': result.metrics['train_runtime'],
164
+ 'samples_per_second': result.metrics['train_samples_per_second'],
165
+ 'epochs': TRAINING_CONFIG['num_epochs'],
166
+ 'total_steps': result.global_step,
167
+ 'gpu': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU',
168
+ 'completed_at': datetime.now().isoformat()
169
+ }
170
+
171
+ with open(f"{CHECKPOINT_DIR}/training_stats.json", 'w') as f:
172
+ json.dump(stats, f, indent=2)
173
+
174
+ return stats
175
+
176
+ # =============================================================================
177
+ # MAIN
178
+ # =============================================================================
179
+
180
+ def run_finetuning():
181
+ """Main function."""
182
+
183
+ print("=" * 60)
184
+ print("FINE-TUNING SQL MODEL")
185
+ if torch.cuda.is_available():
186
+ print(f"GPU: {torch.cuda.get_device_name(0)}")
187
+ else:
188
+ print("GPU: Not available (using CPU)")
189
+ print("=" * 60)
190
+
191
+ setup_directories()
192
+
193
+ # Load data
194
+ print("\n[1/3] Loading data...")
195
+ dataset = load_data()
196
+ print(f" Train: {len(dataset['train']):,}")
197
+ print(f" Val: {len(dataset['validation']):,}")
198
+
199
+ # Setup model
200
+ print("\n[2/3] Setting up model...")
201
+ model, tokenizer = setup_model()
202
+
203
+ # Train
204
+ print("\n[3/3] Training...")
205
+ stats = train(model, tokenizer, dataset)
206
+
207
+ # Done
208
+ print("\n" + "=" * 60)
209
+ print("TRAINING COMPLETE")
210
+ print("=" * 60)
211
+ print(f" Loss: {stats['train_loss']:.4f}")
212
+ print(f" Time: {stats['runtime_seconds']/60:.1f} min")
213
+ print(f" Model: {CHECKPOINT_DIR}/final")
214
+
215
+ return stats
216
+
217
+ if __name__ == "__main__":
218
+ run_finetuning()
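
Once `run_finetuning()` completes, the LoRA adapter and tokenizer saved under `outputs/finetuning/checkpoints/final` can be reloaded for generation. The sketch below is illustrative only (it is not the repository's own inference code): it uses standard `transformers` + `peft` calls, the same base model, and the same prompt template as `prepare_data.py`; the sample question is taken from `test.jsonl`.

```python
# Minimal sketch: load the LoRA adapter saved by train.py and generate SQL.
# Assumes training finished and outputs/finetuning/checkpoints/final exists.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER = "outputs/finetuning/checkpoints/final"

tokenizer = AutoTokenizer.from_pretrained(ADAPTER)
base = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(base, ADAPTER)  # attach the fine-tuned LoRA weights
model.eval()

prompt = "### Question:\nHow many poles had 72 points?\n\n### SQL:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64, do_sample=False)

# Strip the prompt tokens and print only the generated SQL continuation.
print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```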
src/outputs/finetuning/data_stats.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "train_samples": 100,
3
+ "val_samples": 100,
4
+ "test_samples": 100,
5
+ "max_samples": 100,
6
+ "created_at": "2025-12-08T01:22:29.002186"
7
+ }
src/outputs/finetuning/results/evaluation_report.md ADDED
@@ -0,0 +1,26 @@
1
+ # Fine-Tuning Evaluation Report
2
+
3
+ **Generated:** 2025-12-08 01:32:37
4
+
5
+ ## Metrics Summary
6
+
7
+ | Metric | Score |
8
+ |--------|-------|
9
+ | Samples Evaluated | 50 |
10
+ | Exact Match Rate | 0.00% |
11
+ | Token Accuracy | 47.21% |
12
+ | Keyword Accuracy | 91.33% |
13
+ | Structure Similarity | 91.07% |
14
+
15
+ ## Metrics Explanation
16
+
17
+ - **Exact Match**: Predictions identical to ground truth
18
+ - **Token Accuracy**: Word overlap between prediction and expected
19
+ - **Keyword Accuracy**: SQL keywords (SELECT, WHERE, etc.) match
20
+ - **Structure Similarity**: Query structure (clauses used) match
21
+
22
+ ## Visualizations
23
+
24
+ - `01_metrics_overview.png` - All metrics bar chart
25
+ - `02_token_accuracy_dist.png` - Token accuracy histogram
26
+ - `03_keyword_accuracy_dist.png` - Keyword accuracy histogram
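
To make the report's metric definitions concrete, the sketch below shows one plausible reading of "Token Accuracy" (word overlap with the reference SQL) and "Keyword Accuracy" (overlap restricted to SQL keywords such as SELECT and WHERE). It is a rough illustration only; the project's actual evaluation code may compute these differently.

```python
# Illustrative only: rough versions of the report's token/keyword accuracy metrics.
SQL_KEYWORDS = {"SELECT", "FROM", "WHERE", "AND", "OR", "COUNT", "SUM", "AVG", "MIN", "MAX"}

def token_accuracy(pred: str, ref: str) -> float:
    """Fraction of distinct reference tokens that also appear in the prediction."""
    pred_tokens = set(pred.upper().split())
    ref_tokens = set(ref.upper().split())
    return len(pred_tokens & ref_tokens) / len(ref_tokens) if ref_tokens else 0.0

def keyword_accuracy(pred: str, ref: str) -> float:
    """Same overlap, but restricted to SQL keywords in the reference query."""
    pred_kw = set(pred.upper().split()) & SQL_KEYWORDS
    ref_kw = set(ref.upper().split()) & SQL_KEYWORDS
    return len(pred_kw & ref_kw) / len(ref_kw) if ref_kw else 1.0

# Example: prediction adds COUNT that the reference lacks, so token accuracy < 1.
print(token_accuracy("SELECT COUNT Poles FROM table WHERE Points = 72",
                     "SELECT Poles FROM table WHERE Points = 72"))
```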
src/outputs/finetuning/results/evaluation_results.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "total_samples": 50,
3
+ "exact_match_rate": 0.0,
4
+ "avg_token_accuracy": 0.472115604983252,
5
+ "avg_keyword_accuracy": 0.9133333333333334,
6
+ "avg_structure_similarity": 0.9106666666666667
7
+ }
src/outputs/finetuning/test.jsonl ADDED
@@ -0,0 +1,100 @@
1
+ {"text": "### Question:\nWhat is the whole of Drawn that has a Lost of 4?\n\n### SQL:\nSELECT SUM Drawn FROM table WHERE Lost = 4", "question": "What is the whole of Drawn that has a Lost of 4?", "sql": "SELECT SUM Drawn FROM table WHERE Lost = 4", "source": "test"}
2
+ {"text": "### Question:\nWhat is the rating of the episode with a rating/share of 0.9/4?\n\n### SQL:\nSELECT Rating FROM table WHERE Rating/Share (18\u201349) = 0.9/4", "question": "What is the rating of the episode with a rating/share of 0.9/4?", "sql": "SELECT Rating FROM table WHERE Rating/Share (18\u201349) = 0.9/4", "source": "test"}
3
+ {"text": "### Question:\nWhat is the last year that someone is first elected in this table?\n\n### SQL:\nSELECT MAX First elected FROM table", "question": "What is the last year that someone is first elected in this table?", "sql": "SELECT MAX First elected FROM table", "source": "test"}
4
+ {"text": "### Question:\nHow many poles had 72 points?\n\n### SQL:\nSELECT COUNT Poles FROM table WHERE Points = 72", "question": "How many poles had 72 points?", "sql": "SELECT COUNT Poles FROM table WHERE Points = 72", "source": "test"}
5
+ {"text": "### Question:\nWho are all the Moto2 winners when the grand prix was Shell Advance Malaysian Grand Prix?\n\n### SQL:\nSELECT Moto2 winner FROM table WHERE Grand Prix = Shell Advance Malaysian Grand Prix", "question": "Who are all the Moto2 winners when the grand prix was Shell Advance Malaysian Grand Prix?", "sql": "SELECT Moto2 winner FROM table WHERE Grand Prix = Shell Advance Malaysian Grand Prix", "source": "test"}
6
+ {"text": "### Question:\nHow many incumbents are there in the georgia 8 district when the party is democratic?\n\n### SQL:\nSELECT COUNT Incumbent FROM table WHERE Party = Democratic AND District = Georgia 8", "question": "How many incumbents are there in the georgia 8 district when the party is democratic?", "sql": "SELECT COUNT Incumbent FROM table WHERE Party = Democratic AND District = Georgia 8", "source": "test"}
7
+ {"text": "### Question:\nWhat is the largest Area (msr) that has an Area less than 291.045, is part of the Per family, and has a rank higher than 78?\n\n### SQL:\nSELECT MAX Area (msr) FROM table WHERE Area (sq.deg.) < 291.045 AND Family = per AND Rank > 78", "question": "What is the largest Area (msr) that has an Area less than 291.045, is part of the Per family, and has a rank higher than 78?", "sql": "SELECT MAX Area (msr) FROM table WHERE Area (sq.deg.) < 291.045 AND Family = per AND Rank > 78", "source": "test"}
8
+ {"text": "### Question:\nWho won women's double in 2002, the year that Kenneth Vella won men's singles?\n\n### SQL:\nSELECT Women's doubles FROM table WHERE Men's singles = kenneth vella AND Year = 2002", "question": "Who won women's double in 2002, the year that Kenneth Vella won men's singles?", "sql": "SELECT Women's doubles FROM table WHERE Men's singles = kenneth vella AND Year = 2002", "source": "test"}
9
+ {"text": "### Question:\nWhat's k. j. choi's to par?\n\n### SQL:\nSELECT To par FROM table WHERE Player = k. j. choi", "question": "What's k. j. choi's to par?", "sql": "SELECT To par FROM table WHERE Player = k. j. choi", "source": "test"}
10
+ {"text": "### Question:\nWho was the winner when the time was 1:24.00?\n\n### SQL:\nSELECT Winner/2nd FROM table WHERE Time = 1:24.00", "question": "Who was the winner when the time was 1:24.00?", "sql": "SELECT Winner/2nd FROM table WHERE Time = 1:24.00", "source": "test"}
11
+ {"text": "### Question:\nWhich couple had a week 2 score of exactly 23?\n\n### SQL:\nSELECT Couple FROM table WHERE Week 2 = 23", "question": "Which couple had a week 2 score of exactly 23?", "sql": "SELECT Couple FROM table WHERE Week 2 = 23", "source": "test"}
12
+ {"text": "### Question:\nWhat is the record when the result was w 52\u201343?\n\n### SQL:\nSELECT Record FROM table WHERE Result = w 52\u201343", "question": "What is the record when the result was w 52\u201343?", "sql": "SELECT Record FROM table WHERE Result = w 52\u201343", "source": "test"}
13
+ {"text": "### Question:\nWhat's the L3 cache that has a low power part number?\n\n### SQL:\nSELECT L3 cache FROM table WHERE Part number(s) = low power", "question": "What's the L3 cache that has a low power part number?", "sql": "SELECT L3 cache FROM table WHERE Part number(s) = low power", "source": "test"}
14
+ {"text": "### Question:\nWho won the race on 24 August?\n\n### SQL:\nSELECT Winning driver FROM table WHERE Date = 24 august", "question": "Who won the race on 24 August?", "sql": "SELECT Winning driver FROM table WHERE Date = 24 august", "source": "test"}
15
+ {"text": "### Question:\nWhat is the original artist when the vocal percussionist is Alexei Kalveks?\n\n### SQL:\nSELECT Original Artist FROM table WHERE Vocal Percussionist = Alexei Kalveks", "question": "What is the original artist when the vocal percussionist is Alexei Kalveks?", "sql": "SELECT Original Artist FROM table WHERE Vocal Percussionist = Alexei Kalveks", "source": "test"}
16
+ {"text": "### Question:\nwhat is the winning % for the years 2006-11?\n\n### SQL:\nSELECT Winning % FROM table WHERE Years = 2006-11", "question": "what is the winning % for the years 2006-11?", "sql": "SELECT Winning % FROM table WHERE Years = 2006-11", "source": "test"}
17
+ {"text": "### Question:\nWhat is Method, when Event is \"Reality Submission Fighting 2\"?\n\n### SQL:\nSELECT Method FROM table WHERE Event = reality submission fighting 2", "question": "What is Method, when Event is \"Reality Submission Fighting 2\"?", "sql": "SELECT Method FROM table WHERE Event = reality submission fighting 2", "source": "test"}
18
+ {"text": "### Question:\nWhat was the date of the game that had a loss of lidle (10-8)?\n\n### SQL:\nSELECT Date FROM table WHERE Loss = lidle (10-8)", "question": "What was the date of the game that had a loss of lidle (10-8)?", "sql": "SELECT Date FROM table WHERE Loss = lidle (10-8)", "source": "test"}
19
+ {"text": "### Question:\nWho played mixed doubles when Anna Keir played women's singles?\n\n### SQL:\nSELECT Mixed doubles FROM table WHERE Women's singles = anna keir", "question": "Who played mixed doubles when Anna Keir played women's singles?", "sql": "SELECT Mixed doubles FROM table WHERE Women's singles = anna keir", "source": "test"}
20
+ {"text": "### Question:\nWhich tone has a Standard Thai at \u0e1b\u0e25\u0e32?\n\n### SQL:\nSELECT Tone FROM table WHERE Standard Thai = \u0e1b\u0e25\u0e32", "question": "Which tone has a Standard Thai at \u0e1b\u0e25\u0e32?", "sql": "SELECT Tone FROM table WHERE Standard Thai = \u0e1b\u0e25\u0e32", "source": "test"}
21
+ {"text": "### Question:\nName the payload that has a weight of 12,000\n\n### SQL:\nSELECT Payload (kg) FROM table WHERE Weight (kg) = 12,000", "question": "Name the payload that has a weight of 12,000", "sql": "SELECT Payload (kg) FROM table WHERE Weight (kg) = 12,000", "source": "test"}
22
+ {"text": "### Question:\nWhich school's round was 24?\n\n### SQL:\nSELECT School FROM table WHERE Round = 24", "question": "Which school's round was 24?", "sql": "SELECT School FROM table WHERE Round = 24", "source": "test"}
23
+ {"text": "### Question:\nWhat is the least for Scottish Cup with a Challenge Cup greater than 0, Player Paul Keegan, and League Cup greater than 0?\n\n### SQL:\nSELECT MIN Scottish Cup FROM table WHERE Challenge Cup > 0 AND Player = paul keegan AND League Cup > 0", "question": "What is the least for Scottish Cup with a Challenge Cup greater than 0, Player Paul Keegan, and League Cup greater than 0?", "sql": "SELECT MIN Scottish Cup FROM table WHERE Challenge Cup > 0 AND Player = paul keegan AND League Cup > 0", "source": "test"}
24
+ {"text": "### Question:\nWhat is the only type of university that was founded in 1873?\n\n### SQL:\nSELECT Control FROM table WHERE Founded = 1873", "question": "What is the only type of university that was founded in 1873?", "sql": "SELECT Control FROM table WHERE Founded = 1873", "source": "test"}
25
+ {"text": "### Question:\nHow many games were there in the 1966 season?\n\n### SQL:\nSELECT MAX Game FROM table", "question": "How many games were there in the 1966 season?", "sql": "SELECT MAX Game FROM table", "source": "test"}
26
+ {"text": "### Question:\nWhen luz mcclinton is the name what is the season?\n\n### SQL:\nSELECT Season FROM table WHERE Name = Luz McClinton", "question": "When luz mcclinton is the name what is the season?", "sql": "SELECT Season FROM table WHERE Name = Luz McClinton", "source": "test"}
27
+ {"text": "### Question:\nWho was the original artist for First Solo?\n\n### SQL:\nSELECT Original artist FROM table WHERE Theme = First Solo", "question": "Who was the original artist for First Solo?", "sql": "SELECT Original artist FROM table WHERE Theme = First Solo", "source": "test"}
28
+ {"text": "### Question:\nWhat is the lowest grid for Roberto Rolfo with more than 26 laps?\n\n### SQL:\nSELECT MIN Grid FROM table WHERE Rider = roberto rolfo AND Laps > 26", "question": "What is the lowest grid for Roberto Rolfo with more than 26 laps?", "sql": "SELECT MIN Grid FROM table WHERE Rider = roberto rolfo AND Laps > 26", "source": "test"}
29
+ {"text": "### Question:\nWhat is the average lap for suzuki gsx-r1000 k7 and at grid 6?\n\n### SQL:\nSELECT AVG Laps FROM table WHERE Bike = suzuki gsx-r1000 k7 AND Grid = 6", "question": "What is the average lap for suzuki gsx-r1000 k7 and at grid 6?", "sql": "SELECT AVG Laps FROM table WHERE Bike = suzuki gsx-r1000 k7 AND Grid = 6", "source": "test"}
30
+ {"text": "### Question:\nWhat is the date when the Lakers were the home team?\n\n### SQL:\nSELECT Date FROM table WHERE Home = lakers", "question": "What is the date when the Lakers were the home team?", "sql": "SELECT Date FROM table WHERE Home = lakers", "source": "test"}
31
+ {"text": "### Question:\nWhen was there a score of 7-1?\n\n### SQL:\nSELECT Date FROM table WHERE Score = 7-1", "question": "When was there a score of 7-1?", "sql": "SELECT Date FROM table WHERE Score = 7-1", "source": "test"}
32
+ {"text": "### Question:\nWho received 6,131 televotes?\n\n### SQL:\nSELECT Televote Points FROM table WHERE Televotes = 6,131", "question": "Who received 6,131 televotes?", "sql": "SELECT Televote Points FROM table WHERE Televotes = 6,131", "source": "test"}
33
+ {"text": "### Question:\nHow many episodes aired Saturday, July 11, 2009\n\n### SQL:\nSELECT COUNT Episode # FROM table WHERE US air date = Saturday, July 11, 2009", "question": "How many episodes aired Saturday, July 11, 2009", "sql": "SELECT COUNT Episode # FROM table WHERE US air date = Saturday, July 11, 2009", "source": "test"}
34
+ {"text": "### Question:\nWhich episode aired in the USA on 20 May 2005?\n\n### SQL:\nSELECT Episode FROM table WHERE Airdate (USA) = 20 may 2005", "question": "Which episode aired in the USA on 20 May 2005?", "sql": "SELECT Episode FROM table WHERE Airdate (USA) = 20 may 2005", "source": "test"}
35
+ {"text": "### Question:\nWhat was the best finish for 206 on the money list?\n\n### SQL:\nSELECT Best finish FROM table WHERE Money list rank = 206", "question": "What was the best finish for 206 on the money list?", "sql": "SELECT Best finish FROM table WHERE Money list rank = 206", "source": "test"}
36
+ {"text": "### Question:\nName the sum of pick # for round less than 1\n\n### SQL:\nSELECT SUM Pick # FROM table WHERE Round < 1", "question": "Name the sum of pick # for round less than 1", "sql": "SELECT SUM Pick # FROM table WHERE Round < 1", "source": "test"}
37
+ {"text": "### Question:\nWhat engine did the Team Lotus have after 1965?\n\n### SQL:\nSELECT Engine FROM table WHERE Entrant = team lotus AND Year > 1965", "question": "What engine did the Team Lotus have after 1965?", "sql": "SELECT Engine FROM table WHERE Entrant = team lotus AND Year > 1965", "source": "test"}
38
+ {"text": "### Question:\nOpponent of chicago bulls had what location?\n\n### SQL:\nSELECT Location FROM table WHERE Opponent = chicago bulls", "question": "Opponent of chicago bulls had what location?", "sql": "SELECT Location FROM table WHERE Opponent = chicago bulls", "source": "test"}
39
+ {"text": "### Question:\nWhat religious groups made up 0.72% of the Indian population in 2001?\n\n### SQL:\nSELECT Religious group FROM table WHERE Population % 2001 = 0.72%", "question": "What religious groups made up 0.72% of the Indian population in 2001?", "sql": "SELECT Religious group FROM table WHERE Population % 2001 = 0.72%", "source": "test"}
40
+ {"text": "### Question:\nOn which date was the high assists Delonte West Earl Watson (6)?\n\n### SQL:\nSELECT Date FROM table WHERE High assists = delonte west earl watson (6)", "question": "On which date was the high assists Delonte West Earl Watson (6)?", "sql": "SELECT Date FROM table WHERE High assists = delonte west earl watson (6)", "source": "test"}
41
+ {"text": "### Question:\nWhat is the enrollment for the institution in Westfield, Massachusetts? \n\n### SQL:\nSELECT Enrollment FROM table WHERE Location = Westfield, Massachusetts", "question": "What is the enrollment for the institution in Westfield, Massachusetts? ", "sql": "SELECT Enrollment FROM table WHERE Location = Westfield, Massachusetts", "source": "test"}
42
+ {"text": "### Question:\nWhat is the Studio of the Film with a Gross rental of $7,500,000?\n\n### SQL:\nSELECT Studio FROM table WHERE Gross rental = $7,500,000", "question": "What is the Studio of the Film with a Gross rental of $7,500,000?", "sql": "SELECT Studio FROM table WHERE Gross rental = $7,500,000", "source": "test"}
43
+ {"text": "### Question:\nWhat was the Outcome of the match played on Hard (i) Surface?\n\n### SQL:\nSELECT Outcome FROM table WHERE Surface = hard (i)", "question": "What was the Outcome of the match played on Hard (i) Surface?", "sql": "SELECT Outcome FROM table WHERE Surface = hard (i)", "source": "test"}
44
+ {"text": "### Question:\nWhat is the highest rank of the player who played 30 events and made less than $2,708,005?\n\n### SQL:\nSELECT MAX Rank FROM table WHERE Earnings ( $ ) < 2,708,005 AND Events = 30", "question": "What is the highest rank of the player who played 30 events and made less than $2,708,005?", "sql": "SELECT MAX Rank FROM table WHERE Earnings ( $ ) < 2,708,005 AND Events = 30", "source": "test"}
45
+ {"text": "### Question:\nHow many games had Montreal Canadiens as an opponent?\n\n### SQL:\nSELECT SUM Game FROM table WHERE Opponent = montreal canadiens", "question": "How many games had Montreal Canadiens as an opponent?", "sql": "SELECT SUM Game FROM table WHERE Opponent = montreal canadiens", "source": "test"}
46
+ {"text": "### Question:\nWhat is the length of the highway with the route name sh 2?\n\n### SQL:\nSELECT Length FROM table WHERE Route Name = sh 2", "question": "What is the length of the highway with the route name sh 2?", "sql": "SELECT Length FROM table WHERE Route Name = sh 2", "source": "test"}
47
+ {"text": "### Question:\nCan you tell me total number of Silver that has the Republic of latvian ssr, and the Total larger than 6?\n\n### SQL:\nSELECT COUNT Silver FROM table WHERE Republic = latvian ssr AND Total > 6", "question": "Can you tell me total number of Silver that has the Republic of latvian ssr, and the Total larger than 6?", "sql": "SELECT COUNT Silver FROM table WHERE Republic = latvian ssr AND Total > 6", "source": "test"}
48
+ {"text": "### Question:\nWhat date did they play the Florida Panthers?\n\n### SQL:\nSELECT Date FROM table WHERE Opponent = Florida Panthers", "question": "What date did they play the Florida Panthers?", "sql": "SELECT Date FROM table WHERE Opponent = Florida Panthers", "source": "test"}
49
+ {"text": "### Question:\nwhich college has a player called Riley Clayton?\n\n### SQL:\nSELECT College FROM table WHERE Player = riley clayton", "question": "which college has a player called Riley Clayton?", "sql": "SELECT College FROM table WHERE Player = riley clayton", "source": "test"}
50
+ {"text": "### Question:\nWhat is the most minimal Final year that has a North or east end of covington?\n\n### SQL:\nSELECT MIN Final year FROM table WHERE North or east terminus = covington", "question": "What is the most minimal Final year that has a North or east end of covington?", "sql": "SELECT MIN Final year FROM table WHERE North or east terminus = covington", "source": "test"}
51
+ {"text": "### Question:\nWho directed the episode whose production code is pabf05?\n\n### SQL:\nSELECT Directed by FROM table WHERE Production code = PABF05", "question": "Who directed the episode whose production code is pabf05?", "sql": "SELECT Directed by FROM table WHERE Production code = PABF05", "source": "test"}
52
+ {"text": "### Question:\nWhat is the best fit (all data) when the best fit (WMAP, extra parameter) shows \u2014?\n\n### SQL:\nSELECT Best fit (all data) FROM table WHERE Best fit (WMAP, extra parameter) = \u2014", "question": "What is the best fit (all data) when the best fit (WMAP, extra parameter) shows \u2014?", "sql": "SELECT Best fit (all data) FROM table WHERE Best fit (WMAP, extra parameter) = \u2014", "source": "test"}
53
+ {"text": "### Question:\nCan you tell me the lowest Week that has the Attendance smaller than 34,336?\n\n### SQL:\nSELECT MIN Week FROM table WHERE Attendance < 34,336", "question": "Can you tell me the lowest Week that has the Attendance smaller than 34,336?", "sql": "SELECT MIN Week FROM table WHERE Attendance < 34,336", "source": "test"}
54
+ {"text": "### Question:\nWhat is the Lead in the 2004-05 Season?\n\n### SQL:\nSELECT Lead FROM table WHERE Season = 2004-05", "question": "What is the Lead in the 2004-05 Season?", "sql": "SELECT Lead FROM table WHERE Season = 2004-05", "source": "test"}
55
+ {"text": "### Question:\nWhat is the result for week 12 against the Green Bay Packers?\n\n### SQL:\nSELECT Result FROM table WHERE Week > 12 AND Opponent = green bay packers", "question": "What is the result for week 12 against the Green Bay Packers?", "sql": "SELECT Result FROM table WHERE Week > 12 AND Opponent = green bay packers", "source": "test"}
56
+ {"text": "### Question:\nWhat was the first season for the club that in 2012 was 2nd in Superettan?\n\n### SQL:\nSELECT First season FROM table WHERE Position in 2012 = 2nd in Superettan", "question": "What was the first season for the club that in 2012 was 2nd in Superettan?", "sql": "SELECT First season FROM table WHERE Position in 2012 = 2nd in Superettan", "source": "test"}
57
+ {"text": "### Question:\nWhat was the extra info for the Commonwealth Games?\n\n### SQL:\nSELECT Extra FROM table WHERE Tournament = commonwealth games", "question": "What was the extra info for the Commonwealth Games?", "sql": "SELECT Extra FROM table WHERE Tournament = commonwealth games", "source": "test"}
58
+ {"text": "### Question:\nWhich player played for the Grizzlies from 1997-1998?\n\n### SQL:\nSELECT Player FROM table WHERE Years for Grizzlies = 1997-1998", "question": "Which player played for the Grizzlies from 1997-1998?", "sql": "SELECT Player FROM table WHERE Years for Grizzlies = 1997-1998", "source": "test"}
59
+ {"text": "### Question:\nNone of the communities listed has a percentage smaller than 8.6 in 2006.\n\n### SQL:\nSELECT COUNT Seats 2001 FROM table WHERE % 2006 < 8.6", "question": "None of the communities listed has a percentage smaller than 8.6 in 2006.", "sql": "SELECT COUNT Seats 2001 FROM table WHERE % 2006 < 8.6", "source": "test"}
60
+ {"text": "### Question:\nWhat is the basketball status for Valparaiso who has an indoor track status of yes?\n\n### SQL:\nSELECT Bask FROM table WHERE Indoor track = yes AND School = valparaiso", "question": "What is the basketball status for Valparaiso who has an indoor track status of yes?", "sql": "SELECT Bask FROM table WHERE Indoor track = yes AND School = valparaiso", "source": "test"}
61
+ {"text": "### Question:\nWhat 1979 Hindi film had Ravindra Jain directing music?\n\n### SQL:\nSELECT Film name FROM table WHERE Language = hindi AND Lyricist = ravindra jain AND Music director = ravindra jain AND Year = 1979", "question": "What 1979 Hindi film had Ravindra Jain directing music?", "sql": "SELECT Film name FROM table WHERE Language = hindi AND Lyricist = ravindra jain AND Music director = ravindra jain AND Year = 1979", "source": "test"}
62
+ {"text": "### Question:\nWhat was the winning score in the Alfred Dunhill links championship?\n\n### SQL:\nSELECT Winning score FROM table WHERE Tournament = alfred dunhill links championship", "question": "What was the winning score in the Alfred Dunhill links championship?", "sql": "SELECT Winning score FROM table WHERE Tournament = alfred dunhill links championship", "source": "test"}
63
+ {"text": "### Question:\nTell me the highest Grid for Maurice Trintignant and laps less than 87\n\n### SQL:\nSELECT MAX Grid FROM table WHERE Driver = maurice trintignant AND Laps < 87", "question": "Tell me the highest Grid for Maurice Trintignant and laps less than 87", "sql": "SELECT MAX Grid FROM table WHERE Driver = maurice trintignant AND Laps < 87", "source": "test"}
64
+ {"text": "### Question:\nWhich Play-Off has Events of \u2013, and a Season of a-6?\n\n### SQL:\nSELECT Play-Off FROM table WHERE Events = \u2013 AND Season = a-6", "question": "Which Play-Off has Events of \u2013, and a Season of a-6?", "sql": "SELECT Play-Off FROM table WHERE Events = \u2013 AND Season = a-6", "source": "test"}
65
+ {"text": "### Question:\nWho did team phoenix visit in their home?\n\n### SQL:\nSELECT Home FROM table WHERE Visitor = phoenix", "question": "Who did team phoenix visit in their home?", "sql": "SELECT Home FROM table WHERE Visitor = phoenix", "source": "test"}
66
+ {"text": "### Question:\nName the record with home of bucks on 24 november 2007\n\n### SQL:\nSELECT Record FROM table WHERE Home = bucks AND Date = 24 november 2007", "question": "Name the record with home of bucks on 24 november 2007", "sql": "SELECT Record FROM table WHERE Home = bucks AND Date = 24 november 2007", "source": "test"}
67
+ {"text": "### Question:\nWhat is Myron Walwyn with a Territorial at-large Constiuency's First Elected Date\n\n### SQL:\nSELECT First elected FROM table WHERE Constiuency = territorial at-large AND Name = myron walwyn", "question": "What is Myron Walwyn with a Territorial at-large Constiuency's First Elected Date", "sql": "SELECT First elected FROM table WHERE Constiuency = territorial at-large AND Name = myron walwyn", "source": "test"}
68
+ {"text": "### Question:\nWhat competition had a Rank-Qualifying of 1st and a ball apparatus?\n\n### SQL:\nSELECT Competition Description FROM table WHERE Rank-Qualifying = 1st AND Apparatus = ball", "question": "What competition had a Rank-Qualifying of 1st and a ball apparatus?", "sql": "SELECT Competition Description FROM table WHERE Rank-Qualifying = 1st AND Apparatus = ball", "source": "test"}
69
+ {"text": "### Question:\nWhat is the average area larger than Code 19025 but a smaller region than 12?\n\n### SQL:\nSELECT AVG Area (km 2 ) FROM table WHERE Code > 19025 AND Region < 12", "question": "What is the average area larger than Code 19025 but a smaller region than 12?", "sql": "SELECT AVG Area (km 2 ) FROM table WHERE Code > 19025 AND Region < 12", "source": "test"}
70
+ {"text": "### Question:\nWhat is the lowest number of laps for Marco Simoncelli on a grid higher than 11?\n\n### SQL:\nSELECT MIN Laps FROM table WHERE Rider = marco simoncelli AND Grid > 11", "question": "What is the lowest number of laps for Marco Simoncelli on a grid higher than 11?", "sql": "SELECT MIN Laps FROM table WHERE Rider = marco simoncelli AND Grid > 11", "source": "test"}
71
+ {"text": "### Question:\nWhat is the highest Year, when Apparatus is \"Vault\", and when Rank-Final is less than 9?\n\n### SQL:\nSELECT MAX Year FROM table WHERE Apparatus = vault AND Rank-Final < 9", "question": "What is the highest Year, when Apparatus is \"Vault\", and when Rank-Final is less than 9?", "sql": "SELECT MAX Year FROM table WHERE Apparatus = vault AND Rank-Final < 9", "source": "test"}
72
+ {"text": "### Question:\nHow many poles has a percentage of 22.08%?\n\n### SQL:\nSELECT SUM Poles FROM table WHERE Percentage = 22.08%", "question": "How many poles has a percentage of 22.08%?", "sql": "SELECT SUM Poles FROM table WHERE Percentage = 22.08%", "source": "test"}
73
+ {"text": "### Question:\nName the number of score for sacramento\n\n### SQL:\nSELECT COUNT Score FROM table WHERE Team = Sacramento", "question": "Name the number of score for sacramento", "sql": "SELECT COUNT Score FROM table WHERE Team = Sacramento", "source": "test"}
74
+ {"text": "### Question:\nWhat was the nationality of the player with a score of 72-72-67=211?\n\n### SQL:\nSELECT Country FROM table WHERE Score = 72-72-67=211", "question": "What was the nationality of the player with a score of 72-72-67=211?", "sql": "SELECT Country FROM table WHERE Score = 72-72-67=211", "source": "test"}
75
+ {"text": "### Question:\nWhat was the venue that had 5000 m after 2009?\n\n### SQL:\nSELECT Venue FROM table WHERE Year > 2009 AND Notes = 5000 m", "question": "What was the venue that had 5000 m after 2009?", "sql": "SELECT Venue FROM table WHERE Year > 2009 AND Notes = 5000 m", "source": "test"}
76
+ {"text": "### Question:\nwhat is the sspec number when the part number is cw8064701470802?\n\n### SQL:\nSELECT sSpec number FROM table WHERE Part number(s) = cw8064701470802", "question": "what is the sspec number when the part number is cw8064701470802?", "sql": "SELECT sSpec number FROM table WHERE Part number(s) = cw8064701470802", "source": "test"}
77
+ {"text": "### Question:\nWhich Base has a Name of .44 wcf?\n\n### SQL:\nSELECT Base FROM table WHERE Name = .44 wcf", "question": "Which Base has a Name of .44 wcf?", "sql": "SELECT Base FROM table WHERE Name = .44 wcf", "source": "test"}
78
+ {"text": "### Question:\nWhat is the value of the runner up column for the Alberta province?\n\n### SQL:\nSELECT MAX Runner Up FROM table WHERE Province = Alberta", "question": "What is the value of the runner up column for the Alberta province?", "sql": "SELECT MAX Runner Up FROM table WHERE Province = Alberta", "source": "test"}
79
+ {"text": "### Question:\nWhat is the Athlete of the race with a Time of 9.78?\n\n### SQL:\nSELECT Athlete FROM table WHERE Time = 9.78", "question": "What is the Athlete of the race with a Time of 9.78?", "sql": "SELECT Athlete FROM table WHERE Time = 9.78", "source": "test"}
80
+ {"text": "### Question:\nWhat was the rating of the episode \"After Hours\"?\n\n### SQL:\nSELECT Rating (Millions) FROM table WHERE Title = \"after hours\"", "question": "What was the rating of the episode \"After Hours\"?", "sql": "SELECT Rating (Millions) FROM table WHERE Title = \"after hours\"", "source": "test"}
81
+ {"text": "### Question:\nWhat is the number for the forward position from the school/club team La Salle?\n\n### SQL:\nSELECT Number FROM table WHERE Position = forward AND School/Club Team = la salle", "question": "What is the number for the forward position from the school/club team La Salle?", "sql": "SELECT Number FROM table WHERE Position = forward AND School/Club Team = la salle", "source": "test"}
82
+ {"text": "### Question:\nwhat is the highest wickets when the best bowling is 2/32 and matches is less than 5?\n\n### SQL:\nSELECT MAX Wickets FROM table WHERE Best Bowling = 2/32 AND Matches < 5", "question": "what is the highest wickets when the best bowling is 2/32 and matches is less than 5?", "sql": "SELECT MAX Wickets FROM table WHERE Best Bowling = 2/32 AND Matches < 5", "source": "test"}
83
+ {"text": "### Question:\nWhich show runs on Friday at 05:00 AM?\n\n### SQL:\nSELECT 05:00 AM FROM table WHERE Time = friday", "question": "Which show runs on Friday at 05:00 AM?", "sql": "SELECT 05:00 AM FROM table WHERE Time = friday", "source": "test"}
84
+ {"text": "### Question:\nHow many players have the hometown Pennsauken, NJ?\n\n### SQL:\nSELECT COUNT Player FROM table WHERE Hometown = Pennsauken, NJ", "question": "How many players have the hometown Pennsauken, NJ?", "sql": "SELECT COUNT Player FROM table WHERE Hometown = Pennsauken, NJ", "source": "test"}
85
+ {"text": "### Question:\nHow many losses does Cross Keys RFC have?\n\n### SQL:\nSELECT COUNT Lost FROM table WHERE Club = Cross Keys RFC", "question": "How many losses does Cross Keys RFC have?", "sql": "SELECT COUNT Lost FROM table WHERE Club = Cross Keys RFC", "source": "test"}
86
+ {"text": "### Question:\nWhat was the time when the method was TKO?\n\n### SQL:\nSELECT Time FROM table WHERE Method = tko", "question": "What was the time when the method was TKO?", "sql": "SELECT Time FROM table WHERE Method = tko", "source": "test"}
87
+ {"text": "### Question:\nWhat was the type of sussex?\n\n### SQL:\nSELECT Type FROM table WHERE Name = Sussex", "question": "What was the type of sussex?", "sql": "SELECT Type FROM table WHERE Name = Sussex", "source": "test"}
88
+ {"text": "### Question:\nWhat Title has a Role of Mylene?\n\n### SQL:\nSELECT Title FROM table WHERE Role = mylene", "question": "What Title has a Role of Mylene?", "sql": "SELECT Title FROM table WHERE Role = mylene", "source": "test"}
89
+ {"text": "### Question:\nWho is the captain of Neil Warnock's team?\n\n### SQL:\nSELECT Team captain FROM table WHERE Manager = Neil Warnock", "question": "Who is the captain of Neil Warnock's team?", "sql": "SELECT Team captain FROM table WHERE Manager = Neil Warnock", "source": "test"}
90
+ {"text": "### Question:\nWhat is the winning % for the 2010 QF?\n\n### SQL:\nSELECT Win % FROM table WHERE 2010 = qf", "question": "What is the winning % for the 2010 QF?", "sql": "SELECT Win % FROM table WHERE 2010 = qf", "source": "test"}
91
+ {"text": "### Question:\nWhat's the court ranking of 5th son of tadayori and has revenues of 10,000 koku?\n\n### SQL:\nSELECT Court Rank FROM table WHERE Revenues = 10,000 koku AND Lineage = 5th son of tadayori", "question": "What's the court ranking of 5th son of tadayori and has revenues of 10,000 koku?", "sql": "SELECT Court Rank FROM table WHERE Revenues = 10,000 koku AND Lineage = 5th son of tadayori", "source": "test"}
92
+ {"text": "### Question:\nWhich race was on the Las Vegas Motor Speedway for 2 hours?\n\n### SQL:\nSELECT Race FROM table WHERE Circuit = las vegas motor speedway AND Length = 2 hours", "question": "Which race was on the Las Vegas Motor Speedway for 2 hours?", "sql": "SELECT Race FROM table WHERE Circuit = las vegas motor speedway AND Length = 2 hours", "source": "test"}
93
+ {"text": "### Question:\nWhat venue hsoted the european cross country championships with a notes of junior men individual 6.595km?\n\n### SQL:\nSELECT Venue FROM table WHERE Competition = european cross country championships AND Notes = junior men individual 6.595km", "question": "What venue hsoted the european cross country championships with a notes of junior men individual 6.595km?", "sql": "SELECT Venue FROM table WHERE Competition = european cross country championships AND Notes = junior men individual 6.595km", "source": "test"}
94
+ {"text": "### Question:\nWhat was the Opponent in Week 9?\n\n### SQL:\nSELECT Opponent FROM table WHERE Week = 9", "question": "What was the Opponent in Week 9?", "sql": "SELECT Opponent FROM table WHERE Week = 9", "source": "test"}
95
+ {"text": "### Question:\nBefore round 7, what is the greatest Pick # for a player that plays defensive tackle?\n\n### SQL:\nSELECT MAX Pick # FROM table WHERE Position = defensive tackle AND Round < 7", "question": "Before round 7, what is the greatest Pick # for a player that plays defensive tackle?", "sql": "SELECT MAX Pick # FROM table WHERE Position = defensive tackle AND Round < 7", "source": "test"}
96
+ {"text": "### Question:\nWhat was the highest Pick for Lonnie Brockman before round 9?\n\n### SQL:\nSELECT MAX Pick FROM table WHERE Player = lonnie brockman AND Round < 9", "question": "What was the highest Pick for Lonnie Brockman before round 9?", "sql": "SELECT MAX Pick FROM table WHERE Player = lonnie brockman AND Round < 9", "source": "test"}
97
+ {"text": "### Question:\nWhen was brian lemay born?\n\n### SQL:\nSELECT Date of Birth (Age) FROM table WHERE Player = brian lemay", "question": "When was brian lemay born?", "sql": "SELECT Date of Birth (Age) FROM table WHERE Player = brian lemay", "source": "test"}
98
+ {"text": "### Question:\nWhat was the time of the NJKF Titans Neo X event?\n\n### SQL:\nSELECT Time FROM table WHERE Event = njkf titans neo x", "question": "What was the time of the NJKF Titans Neo X event?", "sql": "SELECT Time FROM table WHERE Event = njkf titans neo x", "source": "test"}
99
+ {"text": "### Question:\nWhich kit maker have Trond Sollied as a manager?\n\n### SQL:\nSELECT Kit maker FROM table WHERE Manager = trond sollied", "question": "Which kit maker have Trond Sollied as a manager?", "sql": "SELECT Kit maker FROM table WHERE Manager = trond sollied", "source": "test"}
100
+ {"text": "### Question:\nWhich Locomotive Entered Service in November 1984 and has an Operator of Southern Shorthaul Railroad?\n\n### SQL:\nSELECT Locomotive FROM table WHERE Operator = southern shorthaul railroad AND Entered service = november 1984", "question": "Which Locomotive Entered Service in November 1984 and has an Operator of Southern Shorthaul Railroad?", "sql": "SELECT Locomotive FROM table WHERE Operator = southern shorthaul railroad AND Entered service = november 1984", "source": "test"}
src/outputs/finetuning/train.jsonl ADDED
@@ -0,0 +1,100 @@
1
+ {"text": "### Question:\nWhat was the year when Ch\u016bnqi\u016b Ch\u00e1sh\u00ec (\u6625\u79cb\u8336\u5ba4) was submitted?\n\n### SQL:\nSELECT Year (Ceremony) FROM table WHERE Original title = Ch\u016bnqi\u016b ch\u00e1sh\u00ec (\u6625\u79cb\u8336\u5ba4)", "question": "What was the year when Ch\u016bnqi\u016b Ch\u00e1sh\u00ec (\u6625\u79cb\u8336\u5ba4) was submitted?", "sql": "SELECT Year (Ceremony) FROM table WHERE Original title = Ch\u016bnqi\u016b ch\u00e1sh\u00ec (\u6625\u79cb\u8336\u5ba4)", "source": "train"}
2
+ {"text": "### Question:\nWhat is Name, when Builder is \"Kerr Stuart\"?\n\n### SQL:\nSELECT Name FROM table WHERE Builder = kerr stuart", "question": "What is Name, when Builder is \"Kerr Stuart\"?", "sql": "SELECT Name FROM table WHERE Builder = kerr stuart", "source": "train"}
3
+ {"text": "### Question:\nWhat series had more than 10 Podiums?\n\n### SQL:\nSELECT Series FROM table WHERE Podiums > 10", "question": "What series had more than 10 Podiums?", "sql": "SELECT Series FROM table WHERE Podiums > 10", "source": "train"}
4
+ {"text": "### Question:\nFor what ceremony was \"Fire Dancer\" not nominated? \n\n### SQL:\nSELECT Year (Ceremony) FROM table WHERE Original title = Fire Dancer", "question": "For what ceremony was \"Fire Dancer\" not nominated? ", "sql": "SELECT Year (Ceremony) FROM table WHERE Original title = Fire Dancer", "source": "train"}
5
+ {"text": "### Question:\nHow many people went to the game with Indiana visiting?\n\n### SQL:\nSELECT Attendance FROM table WHERE Visitor = indiana", "question": "How many people went to the game with Indiana visiting?", "sql": "SELECT Attendance FROM table WHERE Visitor = indiana", "source": "train"}
6
+ {"text": "### Question:\nWith a swim (1.5km) of 18:55 and a run (10km) of 32:37, what is the trans 2?\n\n### SQL:\nSELECT Trans 2 FROM table WHERE Swim (1.5km) = 18:55 AND Run (10km) = 32:37", "question": "With a swim (1.5km) of 18:55 and a run (10km) of 32:37, what is the trans 2?", "sql": "SELECT Trans 2 FROM table WHERE Swim (1.5km) = 18:55 AND Run (10km) = 32:37", "source": "train"}
7
+ {"text": "### Question:\nWhat is the region for Chep\u00e9n with 3 districts?\n\n### SQL:\nSELECT Region FROM table WHERE Districts = 3 AND Capital = chep\u00e9n", "question": "What is the region for Chep\u00e9n with 3 districts?", "sql": "SELECT Region FROM table WHERE Districts = 3 AND Capital = chep\u00e9n", "source": "train"}
8
+ {"text": "### Question:\nWhat was the third place of the performance in 2006 with the host Japan?\n\n### SQL:\nSELECT Third place FROM table WHERE Host = japan AND Season < 2006", "question": "What was the third place of the performance in 2006 with the host Japan?", "sql": "SELECT Third place FROM table WHERE Host = japan AND Season < 2006", "source": "train"}
9
+ {"text": "### Question:\nWhat is the 1999-2000 team, when the Height (cm) is less than 187, and when the Birthplace is Cloquet, Minnesota?\n\n### SQL:\nSELECT 1999-2000 team FROM table WHERE Height (cm) < 187 AND Birthplace = cloquet, minnesota", "question": "What is the 1999-2000 team, when the Height (cm) is less than 187, and when the Birthplace is Cloquet, Minnesota?", "sql": "SELECT 1999-2000 team FROM table WHERE Height (cm) < 187 AND Birthplace = cloquet, minnesota", "source": "train"}
10
+ {"text": "### Question:\nWhat is the highest Tournaments, when Pro Debut is \"July 2002\"?\n\n### SQL:\nSELECT MAX Tournaments FROM table WHERE Pro Debut = july 2002", "question": "What is the highest Tournaments, when Pro Debut is \"July 2002\"?", "sql": "SELECT MAX Tournaments FROM table WHERE Pro Debut = july 2002", "source": "train"}
11
+ {"text": "### Question:\nwhat is the total time in office when the assumed office is 1 november 1856?\n\n### SQL:\nSELECT TOTAL Time in Office: FROM table WHERE Assumed Office: = 1 november 1856", "question": "what is the total time in office when the assumed office is 1 november 1856?", "sql": "SELECT TOTAL Time in Office: FROM table WHERE Assumed Office: = 1 november 1856", "source": "train"}
12
+ {"text": "### Question:\nWhen did the term end for the term that had government 27 and Minister Tzachi Hanegbi?\n\n### SQL:\nSELECT Term end FROM table WHERE Governments = 27 AND Minister = tzachi hanegbi", "question": "When did the term end for the term that had government 27 and Minister Tzachi Hanegbi?", "sql": "SELECT Term end FROM table WHERE Governments = 27 AND Minister = tzachi hanegbi", "source": "train"}
13
+ {"text": "### Question:\nWhat is the Outcome of the Doubles played on Carpet?\n\n### SQL:\nSELECT Outcome FROM table WHERE Surface = carpet", "question": "What is the Outcome of the Doubles played on Carpet?", "sql": "SELECT Outcome FROM table WHERE Surface = carpet", "source": "train"}
14
+ {"text": "### Question:\nName the venue for geelong away team\n\n### SQL:\nSELECT Venue FROM table WHERE Away team = geelong", "question": "Name the venue for geelong away team", "sql": "SELECT Venue FROM table WHERE Away team = geelong", "source": "train"}
15
+ {"text": "### Question:\nWhat was the score for the final played on 2 July 2012?\n\n### SQL:\nSELECT Score FROM table WHERE Date = 2 july 2012", "question": "What was the score for the final played on 2 July 2012?", "sql": "SELECT Score FROM table WHERE Date = 2 july 2012", "source": "train"}
16
+ {"text": "### Question:\nBetween November 25\u201330, 2008 the sellout rate was at 75%, indicating that the ration between shows to sellout was what?\n\n### SQL:\nSELECT Shows / Sellout FROM table WHERE Sellout (%) = 75%", "question": "Between November 25\u201330, 2008 the sellout rate was at 75%, indicating that the ration between shows to sellout was what?", "sql": "SELECT Shows / Sellout FROM table WHERE Sellout (%) = 75%", "source": "train"}
17
+ {"text": "### Question:\nWhat is the total ranking when there are less than 16 draws, less than 1 point, and the English translation is in love with you?\n\n### SQL:\nSELECT SUM Place FROM table WHERE Draw < 16 AND English translation = in love with you AND Points < 1", "question": "What is the total ranking when there are less than 16 draws, less than 1 point, and the English translation is in love with you?", "sql": "SELECT SUM Place FROM table WHERE Draw < 16 AND English translation = in love with you AND Points < 1", "source": "train"}
18
+ {"text": "### Question:\nWhich Ratio as % has a Ratio of 8/9?\n\n### SQL:\nSELECT Ratio as % FROM table WHERE Ratio = 8/9", "question": "Which Ratio as % has a Ratio of 8/9?", "sql": "SELECT Ratio as % FROM table WHERE Ratio = 8/9", "source": "train"}
19
+ {"text": "### Question:\nWhat is the Early Modern English phonology used in the example b\u014dg > \"bough\"; pl\u014dg > pl\u014dh > \"plough\"?\n\n### SQL:\nSELECT Early Modern English FROM table WHERE Example = b\u014dg > \"bough\"; pl\u014dg > pl\u014dh > \"plough\"", "question": "What is the Early Modern English phonology used in the example b\u014dg > \"bough\"; pl\u014dg > pl\u014dh > \"plough\"?", "sql": "SELECT Early Modern English FROM table WHERE Example = b\u014dg > \"bough\"; pl\u014dg > pl\u014dh > \"plough\"", "source": "train"}
20
+ {"text": "### Question:\nWhat is the greatest Wins with Matches smaller than 5, and a Year of 1994?\n\n### SQL:\nSELECT MAX Wins FROM table WHERE Matches < 5 AND Year = 1994", "question": "What is the greatest Wins with Matches smaller than 5, and a Year of 1994?", "sql": "SELECT MAX Wins FROM table WHERE Matches < 5 AND Year = 1994", "source": "train"}
21
+ {"text": "### Question:\nWhat ranking is the Battersea Power Station?\n\n### SQL:\nSELECT Rank FROM table WHERE Name = battersea power station", "question": "What ranking is the Battersea Power Station?", "sql": "SELECT Rank FROM table WHERE Name = battersea power station", "source": "train"}
22
+ {"text": "### Question:\nWhat pick number was the player that was picked by Edmonton?\n\n### SQL:\nSELECT Pick # FROM table WHERE CFL Team = Edmonton", "question": "What pick number was the player that was picked by Edmonton?", "sql": "SELECT Pick # FROM table WHERE CFL Team = Edmonton", "source": "train"}
23
+ {"text": "### Question:\nWhat was the location of the fight when Gassaway fought kevin knabjian?\n\n### SQL:\nSELECT Location FROM table WHERE Opponent = kevin knabjian", "question": "What was the location of the fight when Gassaway fought kevin knabjian?", "sql": "SELECT Location FROM table WHERE Opponent = kevin knabjian", "source": "train"}
24
+ {"text": "### Question:\nHow many points did the song \"Stille f\u00f8r stormen\" get?\n\n### SQL:\nSELECT MIN Total Points FROM table WHERE Song = \"Stille f\u00f8r stormen\"", "question": "How many points did the song \"Stille f\u00f8r stormen\" get?", "sql": "SELECT MIN Total Points FROM table WHERE Song = \"Stille f\u00f8r stormen\"", "source": "train"}
25
+ {"text": "### Question:\nWht years did truck robinson play?\n\n### SQL:\nSELECT Years for Jazz FROM table WHERE Player = Truck Robinson", "question": "Wht years did truck robinson play?", "sql": "SELECT Years for Jazz FROM table WHERE Player = Truck Robinson", "source": "train"}
26
+ {"text": "### Question:\nWhich Score has a Couple of cristi\u00e1n & cheryl, and a Style of cha-cha-cha?\n\n### SQL:\nSELECT Score FROM table WHERE Couple = cristi\u00e1n & cheryl AND Style = cha-cha-cha", "question": "Which Score has a Couple of cristi\u00e1n & cheryl, and a Style of cha-cha-cha?", "sql": "SELECT Score FROM table WHERE Couple = cristi\u00e1n & cheryl AND Style = cha-cha-cha", "source": "train"}
27
+ {"text": "### Question:\nWhat is the IUPAC name for chloroform?\n\n### SQL:\nSELECT IUPAC name FROM table WHERE Common name = chloroform", "question": "What is the IUPAC name for chloroform?", "sql": "SELECT IUPAC name FROM table WHERE Common name = chloroform", "source": "train"}
28
+ {"text": "### Question:\nWhat is the Constituency Number when the Number of Electorates (2003) is more than 156,910, and Reserved for sc?\n\n### SQL:\nSELECT Constituency number FROM table WHERE Number of electorates (2003) > 156,910 AND Reserved for ( SC / ST /None) = sc", "question": "What is the Constituency Number when the Number of Electorates (2003) is more than 156,910, and Reserved for sc?", "sql": "SELECT Constituency number FROM table WHERE Number of electorates (2003) > 156,910 AND Reserved for ( SC / ST /None) = sc", "source": "train"}
29
+ {"text": "### Question:\nWhat was John Jones's pick#?\n\n### SQL:\nSELECT SUM Pick FROM table WHERE Player = john jones", "question": "What was John Jones's pick#?", "sql": "SELECT SUM Pick FROM table WHERE Player = john jones", "source": "train"}
30
+ {"text": "### Question:\nWhich runner(s)-up had a Winning score of \u201313 (68-70-66-71=275) and a Margin of victory of 3 strokes?\n\n### SQL:\nSELECT Runner(s)-up FROM table WHERE Margin of victory = 3 strokes AND Winning score = \u201313 (68-70-66-71=275)", "question": "Which runner(s)-up had a Winning score of \u201313 (68-70-66-71=275) and a Margin of victory of 3 strokes?", "sql": "SELECT Runner(s)-up FROM table WHERE Margin of victory = 3 strokes AND Winning score = \u201313 (68-70-66-71=275)", "source": "train"}
31
+ {"text": "### Question:\nWhat is the number of people in attendance when Tonbridge Angels is the opponent?\n\n### SQL:\nSELECT Attendance FROM table WHERE Opponent = tonbridge angels", "question": "What is the number of people in attendance when Tonbridge Angels is the opponent?", "sql": "SELECT Attendance FROM table WHERE Opponent = tonbridge angels", "source": "train"}
32
+ {"text": "### Question:\nWhich letter has the British a\u026a?\n\n### SQL:\nSELECT Letter FROM table WHERE British = a\u026a", "question": "Which letter has the British a\u026a?", "sql": "SELECT Letter FROM table WHERE British = a\u026a", "source": "train"}
33
+ {"text": "### Question:\nIn which city was the berlin marathon?\n\n### SQL:\nSELECT Location FROM table WHERE Road race = Berlin Marathon", "question": "In which city was the berlin marathon?", "sql": "SELECT Location FROM table WHERE Road race = Berlin Marathon", "source": "train"}
34
+ {"text": "### Question:\nName the year 2007 for 668 2008-q1\n\n### SQL:\nSELECT year 2007 FROM table WHERE 2008 - Q1 = 668", "question": "Name the year 2007 for 668 2008-q1", "sql": "SELECT year 2007 FROM table WHERE 2008 - Q1 = 668", "source": "train"}
35
+ {"text": "### Question:\nWhat was Collingwood's score when they played against North Melbourne at home?\n\n### SQL:\nSELECT Home team score FROM table WHERE Away team = north melbourne", "question": "What was Collingwood's score when they played against North Melbourne at home?", "sql": "SELECT Home team score FROM table WHERE Away team = north melbourne", "source": "train"}
36
+ {"text": "### Question:\nWhich division were the Brewers a part of in the 1987 season?\n\n### SQL:\nSELECT Division FROM table WHERE Team season = 1987", "question": "Which division were the Brewers a part of in the 1987 season?", "sql": "SELECT Division FROM table WHERE Team season = 1987", "source": "train"}
37
+ {"text": "### Question:\nWhich TV Station has a Romaji Title of kegareta shita?\n\n### SQL:\nSELECT TV Station FROM table WHERE Romaji Title = kegareta shita", "question": "Which TV Station has a Romaji Title of kegareta shita?", "sql": "SELECT TV Station FROM table WHERE Romaji Title = kegareta shita", "source": "train"}
38
+ {"text": "### Question:\nTell me the notes with method of points and event of adcc 2001 absolute with result of loss\n\n### SQL:\nSELECT Notes FROM table WHERE Method = points AND Event = adcc 2001 absolute AND Result = loss", "question": "Tell me the notes with method of points and event of adcc 2001 absolute with result of loss", "sql": "SELECT Notes FROM table WHERE Method = points AND Event = adcc 2001 absolute AND Result = loss", "source": "train"}
39
+ {"text": "### Question:\nWhat is the highest value for SF round for the country of England?\n\n### SQL:\nSELECT MAX SF Round FROM table WHERE Country = England", "question": "What is the highest value for SF round for the country of England?", "sql": "SELECT MAX SF Round FROM table WHERE Country = England", "source": "train"}
40
+ {"text": "### Question:\nWhat country id Bob Rosburg from?\n\n### SQL:\nSELECT Country FROM table WHERE Player = bob rosburg", "question": "What country id Bob Rosburg from?", "sql": "SELECT Country FROM table WHERE Player = bob rosburg", "source": "train"}
41
+ {"text": "### Question:\nWho is the Alternate for Sweden?\n\n### SQL:\nSELECT Alternate FROM table WHERE Nation = sweden", "question": "Who is the Alternate for Sweden?", "sql": "SELECT Alternate FROM table WHERE Nation = sweden", "source": "train"}
42
+ {"text": "### Question:\nWhich Format has a Frequency of 100.5 fm?\n\n### SQL:\nSELECT Format FROM table WHERE Frequency = 100.5 fm", "question": "Which Format has a Frequency of 100.5 fm?", "sql": "SELECT Format FROM table WHERE Frequency = 100.5 fm", "source": "train"}
43
+ {"text": "### Question:\nWhat country is Lee Janzen from?\n\n### SQL:\nSELECT Country FROM table WHERE Player = lee janzen", "question": "What country is Lee Janzen from?", "sql": "SELECT Country FROM table WHERE Player = lee janzen", "source": "train"}
44
+ {"text": "### Question:\nHead Coach casemiro mior is at which Club?\n\n### SQL:\nSELECT Club FROM table WHERE Head Coach = casemiro mior", "question": "Head Coach casemiro mior is at which Club?", "sql": "SELECT Club FROM table WHERE Head Coach = casemiro mior", "source": "train"}
45
+ {"text": "### Question:\nOn which date was the opponent the Chicago Bears?\n\n### SQL:\nSELECT Date FROM table WHERE Opponent = chicago bears", "question": "On which date was the opponent the Chicago Bears?", "sql": "SELECT Date FROM table WHERE Opponent = chicago bears", "source": "train"}
46
+ {"text": "### Question:\nWhich languages are offered in the coverage area of klang petaling jaya shah alam?\n\n### SQL:\nSELECT Language FROM table WHERE Coverage Area = Klang Petaling Jaya Shah Alam", "question": "Which languages are offered in the coverage area of klang petaling jaya shah alam?", "sql": "SELECT Language FROM table WHERE Coverage Area = Klang Petaling Jaya Shah Alam", "source": "train"}
47
+ {"text": "### Question:\nWhat is the to par for Jiyai Shin when the place is t1?\n\n### SQL:\nSELECT To par FROM table WHERE Place = t1 AND Player = jiyai shin", "question": "What is the to par for Jiyai Shin when the place is t1?", "sql": "SELECT To par FROM table WHERE Place = t1 AND Player = jiyai shin", "source": "train"}
48
+ {"text": "### Question:\nWhat is the total number of Division(s), when Team is Chongqing Lifan, and when Apps is greater than 9?\n\n### SQL:\nSELECT COUNT Division FROM table WHERE Team = chongqing lifan AND Apps > 9", "question": "What is the total number of Division(s), when Team is Chongqing Lifan, and when Apps is greater than 9?", "sql": "SELECT COUNT Division FROM table WHERE Team = chongqing lifan AND Apps > 9", "source": "train"}
49
+ {"text": "### Question:\nWhat is Mike Weir's To par?\n\n### SQL:\nSELECT To par FROM table WHERE Player = mike weir", "question": "What is Mike Weir's To par?", "sql": "SELECT To par FROM table WHERE Player = mike weir", "source": "train"}
50
+ {"text": "### Question:\nWhat was the away team when the home was st kilda?\n\n### SQL:\nSELECT Away team FROM table WHERE Home team = st kilda", "question": "What was the away team when the home was st kilda?", "sql": "SELECT Away team FROM table WHERE Home team = st kilda", "source": "train"}
51
+ {"text": "### Question:\nWhat is the % of same-sex marriages for the year of 2011?\n\n### SQL:\nSELECT % same-sex marriages FROM table WHERE Year = 2011", "question": "What is the % of same-sex marriages for the year of 2011?", "sql": "SELECT % same-sex marriages FROM table WHERE Year = 2011", "source": "train"}
52
+ {"text": "### Question:\nWhat Place has a To par of \u20134?\n\n### SQL:\nSELECT Place FROM table WHERE To par = \u20134", "question": "What Place has a To par of \u20134?", "sql": "SELECT Place FROM table WHERE To par = \u20134", "source": "train"}
53
+ {"text": "### Question:\nName the district for 1994\n\n### SQL:\nSELECT District FROM table WHERE First elected = 1994", "question": "Name the district for 1994", "sql": "SELECT District FROM table WHERE First elected = 1994", "source": "train"}
54
+ {"text": "### Question:\nWhich Body Width/mm has a Lead Pitch/mm smaller than 0.55, and a Part Number of tsop48?\n\n### SQL:\nSELECT MIN Body Width/mm FROM table WHERE Lead Pitch/mm < 0.55 AND Part Number = tsop48", "question": "Which Body Width/mm has a Lead Pitch/mm smaller than 0.55, and a Part Number of tsop48?", "sql": "SELECT MIN Body Width/mm FROM table WHERE Lead Pitch/mm < 0.55 AND Part Number = tsop48", "source": "train"}
55
+ {"text": "### Question:\nWhat is the lowest number of episodes for anabel barnston?\n\n### SQL:\nSELECT MIN Episodes FROM table WHERE Actor = anabel barnston", "question": "What is the lowest number of episodes for anabel barnston?", "sql": "SELECT MIN Episodes FROM table WHERE Actor = anabel barnston", "source": "train"}
56
+ {"text": "### Question:\nFor what league was the player in G position drafted?\n\n### SQL:\nSELECT League from FROM table WHERE Position = g", "question": "For what league was the player in G position drafted?", "sql": "SELECT League from FROM table WHERE Position = g", "source": "train"}
57
+ {"text": "### Question:\nWhat is the last episode which has segment d as blown glass?\n\n### SQL:\nSELECT MAX Episode FROM table WHERE Segment D = Blown Glass", "question": "What is the last episode which has segment d as blown glass?", "sql": "SELECT MAX Episode FROM table WHERE Segment D = Blown Glass", "source": "train"}
58
+ {"text": "### Question:\nWhat is the date for the 10b serial?\n\n### SQL:\nSELECT Date FROM table WHERE Serial = 10b", "question": "What is the date for the 10b serial?", "sql": "SELECT Date FROM table WHERE Serial = 10b", "source": "train"}
59
+ {"text": "### Question:\nWhen \"we're going to disney world (part 1)\" is the title what is the air date?\n\n### SQL:\nSELECT Original air date FROM table WHERE Title = \"We're Going to Disney World (Part 1)\"", "question": "When \"we're going to disney world (part 1)\" is the title what is the air date?", "sql": "SELECT Original air date FROM table WHERE Title = \"We're Going to Disney World (Part 1)\"", "source": "train"}
60
+ {"text": "### Question:\nHow did the School/Club Team of Manuel Luis Quezon acquire their Forward?\n\n### SQL:\nSELECT Acquisition via FROM table WHERE Position = forward AND School/Club Team = manuel luis quezon", "question": "How did the School/Club Team of Manuel Luis Quezon acquire their Forward?", "sql": "SELECT Acquisition via FROM table WHERE Position = forward AND School/Club Team = manuel luis quezon", "source": "train"}
61
+ {"text": "### Question:\nWhat is the Loss has an Attendance more than 43,095 and a Record of 31\u201329?\n\n### SQL:\nSELECT Loss FROM table WHERE Attendance > 43,095 AND Record = 31\u201329", "question": "What is the Loss has an Attendance more than 43,095 and a Record of 31\u201329?", "sql": "SELECT Loss FROM table WHERE Attendance > 43,095 AND Record = 31\u201329", "source": "train"}
62
+ {"text": "### Question:\nWhat is the ethernet ports of the u10 appliance?\n\n### SQL:\nSELECT Ethernet Ports FROM table WHERE Name = u10", "question": "What is the ethernet ports of the u10 appliance?", "sql": "SELECT Ethernet Ports FROM table WHERE Name = u10", "source": "train"}
63
+ {"text": "### Question:\nName the 2007 for 2005 of a and 003 of a with 2009 of sf\n\n### SQL:\nSELECT 2007 FROM table WHERE 2005 = a AND 2003 = a AND 2009 = sf", "question": "Name the 2007 for 2005 of a and 003 of a with 2009 of sf", "sql": "SELECT 2007 FROM table WHERE 2005 = a AND 2003 = a AND 2009 = sf", "source": "train"}
64
+ {"text": "### Question:\nWhat is the constructor for the race with Nigel Mansell as the fastest lap?\n\n### SQL:\nSELECT Constructor FROM table WHERE Fastest Lap = nigel mansell", "question": "What is the constructor for the race with Nigel Mansell as the fastest lap?", "sql": "SELECT Constructor FROM table WHERE Fastest Lap = nigel mansell", "source": "train"}
65
+ {"text": "### Question:\nWhich institution's nickname is the Polar Bears?\n\n### SQL:\nSELECT Institution FROM table WHERE Nickname = Polar Bears", "question": "Which institution's nickname is the Polar Bears?", "sql": "SELECT Institution FROM table WHERE Nickname = Polar Bears", "source": "train"}
66
+ {"text": "### Question:\nWhich Away team has an Away team score of 11.18 (84)?\n\n### SQL:\nSELECT Away team FROM table WHERE Away team score = 11.18 (84)", "question": "Which Away team has an Away team score of 11.18 (84)?", "sql": "SELECT Away team FROM table WHERE Away team score = 11.18 (84)", "source": "train"}
67
+ {"text": "### Question:\nwhat is the current version with license gpl v3?\n\n### SQL:\nSELECT Current version FROM table WHERE License = gpl v3", "question": "what is the current version with license gpl v3?", "sql": "SELECT Current version FROM table WHERE License = gpl v3", "source": "train"}
68
+ {"text": "### Question:\nWhat was the record on April 1?\n\n### SQL:\nSELECT Record FROM table WHERE Date = april 1", "question": "What was the record on April 1?", "sql": "SELECT Record FROM table WHERE Date = april 1", "source": "train"}
69
+ {"text": "### Question:\nWhich Winning score has a Margin of victory of 1 stroke, and a Date of 21 jun 1981?\n\n### SQL:\nSELECT Winning score FROM table WHERE Margin of victory = 1 stroke AND Date = 21 jun 1981", "question": "Which Winning score has a Margin of victory of 1 stroke, and a Date of 21 jun 1981?", "sql": "SELECT Winning score FROM table WHERE Margin of victory = 1 stroke AND Date = 21 jun 1981", "source": "train"}
70
+ {"text": "### Question:\nWhich Avoirdupois value is translated to grain?\n\n### SQL:\nSELECT Avoirdupois value FROM table WHERE Translation = grain", "question": "Which Avoirdupois value is translated to grain?", "sql": "SELECT Avoirdupois value FROM table WHERE Translation = grain", "source": "train"}
71
+ {"text": "### Question:\nName the most margin for nco party and p. ramachandran won\n\n### SQL:\nSELECT MAX Margin FROM table WHERE Party = NCO AND Winner = P. Ramachandran", "question": "Name the most margin for nco party and p. ramachandran won", "sql": "SELECT MAX Margin FROM table WHERE Party = NCO AND Winner = P. Ramachandran", "source": "train"}
72
+ {"text": "### Question:\nWhat is the Catalog with a Date that is february 20, 2002?\n\n### SQL:\nSELECT Catalog FROM table WHERE Date = february 20, 2002", "question": "What is the Catalog with a Date that is february 20, 2002?", "sql": "SELECT Catalog FROM table WHERE Date = february 20, 2002", "source": "train"}
73
+ {"text": "### Question:\nWho is the driver of the chassis-engine porsche 956 gti?\n\n### SQL:\nSELECT Driver FROM table WHERE Chassis \u2013 Engine = porsche 956 gti", "question": "Who is the driver of the chassis-engine porsche 956 gti?", "sql": "SELECT Driver FROM table WHERE Chassis \u2013 Engine = porsche 956 gti", "source": "train"}
74
+ {"text": "### Question:\nWhat is the 1st party with Charles Isaac Elton as the 2nd member?\n\n### SQL:\nSELECT 1st Party FROM table WHERE 2nd Member = charles isaac elton", "question": "What is the 1st party with Charles Isaac Elton as the 2nd member?", "sql": "SELECT 1st Party FROM table WHERE 2nd Member = charles isaac elton", "source": "train"}
75
+ {"text": "### Question:\nWhat is the earliest date of the game with a score of 2-2?\n\n### SQL:\nSELECT MIN Date FROM table WHERE Score = 2-2", "question": "What is the earliest date of the game with a score of 2-2?", "sql": "SELECT MIN Date FROM table WHERE Score = 2-2", "source": "train"}
76
+ {"text": "### Question:\nWhat was the original nfl team that the player was in from the midwestern conference?\n\n### SQL:\nSELECT Original NFL team FROM table WHERE Conf. = midwestern", "question": "What was the original nfl team that the player was in from the midwestern conference?", "sql": "SELECT Original NFL team FROM table WHERE Conf. = midwestern", "source": "train"}
77
+ {"text": "### Question:\nName the total number of domestic mail for 7853 for total frieght and mail\n\n### SQL:\nSELECT COUNT Domestic mail FROM table WHERE Total freight and mail = 7853", "question": "Name the total number of domestic mail for 7853 for total frieght and mail", "sql": "SELECT COUNT Domestic mail FROM table WHERE Total freight and mail = 7853", "source": "train"}
78
+ {"text": "### Question:\nWhat time is listed against the Wrestler Jimmy Rave?\n\n### SQL:\nSELECT Time FROM table WHERE Wrestler = jimmy rave", "question": "What time is listed against the Wrestler Jimmy Rave?", "sql": "SELECT Time FROM table WHERE Wrestler = jimmy rave", "source": "train"}
79
+ {"text": "### Question:\nWhat's the listed average of Cuts made that has a Top-5 of 3, and a Top-10 that's smaller than 5?\n\n### SQL:\nSELECT AVG Cuts made FROM table WHERE Top-5 = 3 AND Top-10 < 5", "question": "What's the listed average of Cuts made that has a Top-5 of 3, and a Top-10 that's smaller than 5?", "sql": "SELECT AVG Cuts made FROM table WHERE Top-5 = 3 AND Top-10 < 5", "source": "train"}
80
+ {"text": "### Question:\nHow large was the crowd at Carlton's home game?\n\n### SQL:\nSELECT COUNT Crowd FROM table WHERE Home team = carlton", "question": "How large was the crowd at Carlton's home game?", "sql": "SELECT COUNT Crowd FROM table WHERE Home team = carlton", "source": "train"}
81
+ {"text": "### Question:\nWhat date did \"The runner\" originally air on?\n\n### SQL:\nSELECT Original air date FROM table WHERE Title = \"The Runner\"", "question": "What date did \"The runner\" originally air on?", "sql": "SELECT Original air date FROM table WHERE Title = \"The Runner\"", "source": "train"}
82
+ {"text": "### Question:\nWhen 10th, south west district 1 is the mens 2nd xi what is the ladies 1st xi?\n\n### SQL:\nSELECT Ladies 1st XI FROM table WHERE Mens 2nd XI = 10th, South West District 1", "question": "When 10th, south west district 1 is the mens 2nd xi what is the ladies 1st xi?", "sql": "SELECT Ladies 1st XI FROM table WHERE Mens 2nd XI = 10th, South West District 1", "source": "train"}
83
+ {"text": "### Question:\nWhat is Show, when Episode Number is 1, when Year is less than 2010, and when Original Airdate is January 20, 2008?\n\n### SQL:\nSELECT Show FROM table WHERE Episode number = 1 AND Year < 2010 AND Original airdate = january 20, 2008", "question": "What is Show, when Episode Number is 1, when Year is less than 2010, and when Original Airdate is January 20, 2008?", "sql": "SELECT Show FROM table WHERE Episode number = 1 AND Year < 2010 AND Original airdate = january 20, 2008", "source": "train"}
84
+ {"text": "### Question:\nWhat is the verb for the Proto-Austronesian word *diri?\n\n### SQL:\nSELECT Verb FROM table WHERE Proto-Austronesian = *diri", "question": "What is the verb for the Proto-Austronesian word *diri?", "sql": "SELECT Verb FROM table WHERE Proto-Austronesian = *diri", "source": "train"}
85
+ {"text": "### Question:\nWhat did winner Gary Player par?\n\n### SQL:\nSELECT To par FROM table WHERE Winner = gary player", "question": "What did winner Gary Player par?", "sql": "SELECT To par FROM table WHERE Winner = gary player", "source": "train"}
86
+ {"text": "### Question:\nWhat notes have 2 as the rank?\n\n### SQL:\nSELECT Notes FROM table WHERE Rank = 2", "question": "What notes have 2 as the rank?", "sql": "SELECT Notes FROM table WHERE Rank = 2", "source": "train"}
87
+ {"text": "### Question:\nWho was the Away Captain when the Home Captain was Joe Darling at Melbourne Cricket Ground?\n\n### SQL:\nSELECT Away captain FROM table WHERE Venue = melbourne cricket ground AND Home captain = joe darling", "question": "Who was the Away Captain when the Home Captain was Joe Darling at Melbourne Cricket Ground?", "sql": "SELECT Away captain FROM table WHERE Venue = melbourne cricket ground AND Home captain = joe darling", "source": "train"}
88
+ {"text": "### Question:\nWhich spacecraft were launched by the Titan II?\n\n### SQL:\nSELECT Spacecraft FROM table WHERE Launcher = titan ii", "question": "Which spacecraft were launched by the Titan II?", "sql": "SELECT Spacecraft FROM table WHERE Launcher = titan ii", "source": "train"}
89
+ {"text": "### Question:\nWhat shows for 3:30 pm when 12:30 pm is the young and the restless?\n\n### SQL:\nSELECT noon FROM table WHERE 12:30 pm = the young and the restless", "question": "What shows for 3:30 pm when 12:30 pm is the young and the restless?", "sql": "SELECT noon FROM table WHERE 12:30 pm = the young and the restless", "source": "train"}
90
+ {"text": "### Question:\nWhat's the fed tax that has a total tax greater than 33.2, a minimum sales tax less than 41.01 and in Vancouver, BC?\n\n### SQL:\nSELECT AVG Federal excise tax ( CAD\u00a2 / L ) FROM table WHERE Total excise tax (CAD\u00a2/L) > 33.2 AND Government = vancouver, bc AND Minimum tax incl. sales taxes (CAD\u00a2/L) < 41.01", "question": "What's the fed tax that has a total tax greater than 33.2, a minimum sales tax less than 41.01 and in Vancouver, BC?", "sql": "SELECT AVG Federal excise tax ( CAD\u00a2 / L ) FROM table WHERE Total excise tax (CAD\u00a2/L) > 33.2 AND Government = vancouver, bc AND Minimum tax incl. sales taxes (CAD\u00a2/L) < 41.01", "source": "train"}
91
+ {"text": "### Question:\nWhat is the highest total number?\n\n### SQL:\nSELECT MAX Total# FROM table", "question": "What is the highest total number?", "sql": "SELECT MAX Total# FROM table", "source": "train"}
92
+ {"text": "### Question:\nWhat is the score of the match that was against alberto berasategui?\n\n### SQL:\nSELECT Score in the final FROM table WHERE Opponent in the final = alberto berasategui", "question": "What is the score of the match that was against alberto berasategui?", "sql": "SELECT Score in the final FROM table WHERE Opponent in the final = alberto berasategui", "source": "train"}
93
+ {"text": "### Question:\nWhat is the newest Cap with a Goals stat larger than 17 and which was done by Brian Turner?\n\n### SQL:\nSELECT Most Recent Cap FROM table WHERE Goals > 17 AND Name = brian turner", "question": "What is the newest Cap with a Goals stat larger than 17 and which was done by Brian Turner?", "sql": "SELECT Most Recent Cap FROM table WHERE Goals > 17 AND Name = brian turner", "source": "train"}
94
+ {"text": "### Question:\nWhat was the losing score on September 1?\n\n### SQL:\nSELECT Loss FROM table WHERE Date = september 1", "question": "What was the losing score on September 1?", "sql": "SELECT Loss FROM table WHERE Date = september 1", "source": "train"}
95
+ {"text": "### Question:\nWhat is the score of the away team whose opponent scored 14.8 (92)?\n\n### SQL:\nSELECT Away team score FROM table WHERE Home team score = 14.8 (92)", "question": "What is the score of the away team whose opponent scored 14.8 (92)?", "sql": "SELECT Away team score FROM table WHERE Home team score = 14.8 (92)", "source": "train"}
96
+ {"text": "### Question:\nWhat are donor payments in the country where there are 12 children to 6 families (2 per family)?\n\n### SQL:\nSELECT Donor payment FROM table WHERE Children per donor = 12 children to 6 families (2 per family)", "question": "What are donor payments in the country where there are 12 children to 6 families (2 per family)?", "sql": "SELECT Donor payment FROM table WHERE Children per donor = 12 children to 6 families (2 per family)", "source": "train"}
97
+ {"text": "### Question:\nWhat was the original air date for the episode with production code 1wab06?\n\n### SQL:\nSELECT Originalairdate FROM table WHERE Production code = 1WAB06", "question": "What was the original air date for the episode with production code 1wab06?", "sql": "SELECT Originalairdate FROM table WHERE Production code = 1WAB06", "source": "train"}
98
+ {"text": "### Question:\nWhat was the total for David O'Callaghan, and a Tally of 1-9?\n\n### SQL:\nSELECT SUM Total FROM table WHERE Player = david o'callaghan AND Tally = 1-9", "question": "What was the total for David O'Callaghan, and a Tally of 1-9?", "sql": "SELECT SUM Total FROM table WHERE Player = david o'callaghan AND Tally = 1-9", "source": "train"}
99
+ {"text": "### Question:\nAttendance larger than 17,001, and a Date of june 15 had what decision?\n\n### SQL:\nSELECT Decision FROM table WHERE Attendance > 17,001 AND Date = june 15", "question": "Attendance larger than 17,001, and a Date of june 15 had what decision?", "sql": "SELECT Decision FROM table WHERE Attendance > 17,001 AND Date = june 15", "source": "train"}
100
+ {"text": "### Question:\nWhat is Album Artist, when Song is \"\"Something New\" (with Mint Royale and Class A)\"?\n\n### SQL:\nSELECT Album artist FROM table WHERE Song = \"something new\" (with mint royale and class a)", "question": "What is Album Artist, when Song is \"\"Something New\" (with Mint Royale and Class A)\"?", "sql": "SELECT Album artist FROM table WHERE Song = \"something new\" (with mint royale and class a)", "source": "train"}
src/outputs/finetuning/val.jsonl ADDED
@@ -0,0 +1,100 @@
1
+ {"text": "### Question:\nName the play for 1976\n\n### SQL:\nSELECT Play FROM table WHERE Year = 1976", "question": "Name the play for 1976", "sql": "SELECT Play FROM table WHERE Year = 1976", "source": "validation"}
2
+ {"text": "### Question:\nwhat are all the playoffs for u.s. open cup in 1st round\n\n### SQL:\nSELECT Playoffs FROM table WHERE U.S. Open Cup = 1st Round", "question": "what are all the playoffs for u.s. open cup in 1st round", "sql": "SELECT Playoffs FROM table WHERE U.S. Open Cup = 1st Round", "source": "validation"}
3
+ {"text": "### Question:\nWhat is the location of the game that has a number smaller than 2?\n\n### SQL:\nSELECT Location FROM table WHERE Game < 2", "question": "What is the location of the game that has a number smaller than 2?", "sql": "SELECT Location FROM table WHERE Game < 2", "source": "validation"}
4
+ {"text": "### Question:\nWhat is 2004, when 2005 is \"Not Tier I\"?\n\n### SQL:\nSELECT 2004 FROM table WHERE 2005 = not tier i", "question": "What is 2004, when 2005 is \"Not Tier I\"?", "sql": "SELECT 2004 FROM table WHERE 2005 = not tier i", "source": "validation"}
5
+ {"text": "### Question:\nWhich venue led to a result of 13th and had an extra of Long Race?\n\n### SQL:\nSELECT Venue FROM table WHERE Extra = long race AND Result = 13th", "question": "Which venue led to a result of 13th and had an extra of Long Race?", "sql": "SELECT Venue FROM table WHERE Extra = long race AND Result = 13th", "source": "validation"}
6
+ {"text": "### Question:\nWhat was Anders Forsbrand's score when the TO par is +4?\n\n### SQL:\nSELECT Score FROM table WHERE To par = +4 AND Player = anders forsbrand", "question": "What was Anders Forsbrand's score when the TO par is +4?", "sql": "SELECT Score FROM table WHERE To par = +4 AND Player = anders forsbrand", "source": "validation"}
7
+ {"text": "### Question:\nWhat was the attendance of the game that had an away team of FK Mogren?\n\n### SQL:\nSELECT Attendance FROM table WHERE Guest = fk mogren", "question": "What was the attendance of the game that had an away team of FK Mogren?", "sql": "SELECT Attendance FROM table WHERE Guest = fk mogren", "source": "validation"}
8
+ {"text": "### Question:\nWhat is the Air Date that has a 18\u201349 larger than 1.9, less than 7.54 viewers and a rating less than 4.9?\n\n### SQL:\nSELECT Air Date FROM table WHERE 18\u201349 > 1.9 AND Viewers < 7.54 AND Rating < 4.9", "question": "What is the Air Date that has a 18\u201349 larger than 1.9, less than 7.54 viewers and a rating less than 4.9?", "sql": "SELECT Air Date FROM table WHERE 18\u201349 > 1.9 AND Viewers < 7.54 AND Rating < 4.9", "source": "validation"}
9
+ {"text": "### Question:\nWhich Avg/G is the lowest one that has a Long smaller than 47, and a Name of frank murphy, and a Gain smaller than 569?\n\n### SQL:\nSELECT MIN Avg/G FROM table WHERE Long < 47 AND Name = frank murphy AND Gain < 569", "question": "Which Avg/G is the lowest one that has a Long smaller than 47, and a Name of frank murphy, and a Gain smaller than 569?", "sql": "SELECT MIN Avg/G FROM table WHERE Long < 47 AND Name = frank murphy AND Gain < 569", "source": "validation"}
10
+ {"text": "### Question:\nWhich rank has 1 silver medal and more than 1 gold medal?\n\n### SQL:\nSELECT Rank FROM table WHERE Silver = 1 AND Gold > 1", "question": "Which rank has 1 silver medal and more than 1 gold medal?", "sql": "SELECT Rank FROM table WHERE Silver = 1 AND Gold > 1", "source": "validation"}
11
+ {"text": "### Question:\nName the number of candidates for # of seats won being 43\n\n### SQL:\nSELECT # of candidates FROM table WHERE # of seats won = 43", "question": "Name the number of candidates for # of seats won being 43", "sql": "SELECT # of candidates FROM table WHERE # of seats won = 43", "source": "validation"}
12
+ {"text": "### Question:\nWhat is the home team score at lake oval?\n\n### SQL:\nSELECT Home team score FROM table WHERE Venue = lake oval", "question": "What is the home team score at lake oval?", "sql": "SELECT Home team score FROM table WHERE Venue = lake oval", "source": "validation"}
13
+ {"text": "### Question:\nWhich loss has an attendance greater than 49,688 and 11-8 as the record?\n\n### SQL:\nSELECT Loss FROM table WHERE Attendance > 49,688 AND Record = 11-8", "question": "Which loss has an attendance greater than 49,688 and 11-8 as the record?", "sql": "SELECT Loss FROM table WHERE Attendance > 49,688 AND Record = 11-8", "source": "validation"}
14
+ {"text": "### Question:\nWhat is the sum of pick# for Don Majkowski?3\n\n### SQL:\nSELECT SUM Pick # FROM table WHERE Player = don majkowski", "question": "What is the sum of pick# for Don Majkowski?3", "sql": "SELECT SUM Pick # FROM table WHERE Player = don majkowski", "source": "validation"}
15
+ {"text": "### Question:\nWhat is the total number of wins for riders with fewer than 56 races and more than 0 titles?\n\n### SQL:\nSELECT COUNT Wins FROM table WHERE Races < 56 AND Titles > 0", "question": "What is the total number of wins for riders with fewer than 56 races and more than 0 titles?", "sql": "SELECT COUNT Wins FROM table WHERE Races < 56 AND Titles > 0", "source": "validation"}
16
+ {"text": "### Question:\nHow much did the girl, nicknamed Chidi, weigh at birth?\n\n### SQL:\nSELECT Weight at birth FROM table WHERE Gender = girl AND Nickname = chidi", "question": "How much did the girl, nicknamed Chidi, weigh at birth?", "sql": "SELECT Weight at birth FROM table WHERE Gender = girl AND Nickname = chidi", "source": "validation"}
17
+ {"text": "### Question:\nOn which apparatus did Kanayeva have a final score smaller than 75.5 and a qualifying score smaller than 18.7?\n\n### SQL:\nSELECT Apparatus FROM table WHERE Score-Final < 75.5 AND Score-Qualifying < 18.7", "question": "On which apparatus did Kanayeva have a final score smaller than 75.5 and a qualifying score smaller than 18.7?", "sql": "SELECT Apparatus FROM table WHERE Score-Final < 75.5 AND Score-Qualifying < 18.7", "source": "validation"}
18
+ {"text": "### Question:\nWhat are the rounds for the B tyres and Ferrari 053 engine +?\n\n### SQL:\nSELECT Rounds FROM table WHERE Tyre = b AND Engine \u2020 = ferrari 053", "question": "What are the rounds for the B tyres and Ferrari 053 engine +?", "sql": "SELECT Rounds FROM table WHERE Tyre = b AND Engine \u2020 = ferrari 053", "source": "validation"}
19
+ {"text": "### Question:\nWhat is the sexual abuse rate where the conflict is the Burundi Civil War?\n\n### SQL:\nSELECT MAX Sexual abuse 1 FROM table WHERE Conflict = Burundi Civil War", "question": "What is the sexual abuse rate where the conflict is the Burundi Civil War?", "sql": "SELECT MAX Sexual abuse 1 FROM table WHERE Conflict = Burundi Civil War", "source": "validation"}
20
+ {"text": "### Question:\nWhen 0-1 is the series who has the highest amount of assists?\n\n### SQL:\nSELECT High assists FROM table WHERE Series = 0-1", "question": "When 0-1 is the series who has the highest amount of assists?", "sql": "SELECT High assists FROM table WHERE Series = 0-1", "source": "validation"}
21
+ {"text": "### Question:\nWhich MLS team has the #41 pick?\n\n### SQL:\nSELECT MLS team FROM table WHERE Pick # = 41", "question": "Which MLS team has the #41 pick?", "sql": "SELECT MLS team FROM table WHERE Pick # = 41", "source": "validation"}
22
+ {"text": "### Question:\nWhat is the most bronze can be when silver is larger than 2, and the nation is germany, and gold is more than 8?\n\n### SQL:\nSELECT MAX Bronze FROM table WHERE Silver > 2 AND Nation = germany AND Gold > 8", "question": "What is the most bronze can be when silver is larger than 2, and the nation is germany, and gold is more than 8?", "sql": "SELECT MAX Bronze FROM table WHERE Silver > 2 AND Nation = germany AND Gold > 8", "source": "validation"}
23
+ {"text": "### Question:\nWhat dates contained matches at the venue Bourda?\n\n### SQL:\nSELECT Date FROM table WHERE Venue = bourda", "question": "What dates contained matches at the venue Bourda?", "sql": "SELECT Date FROM table WHERE Venue = bourda", "source": "validation"}
24
+ {"text": "### Question:\nWhat is the minimum amount of poles?\n\n### SQL:\nSELECT MIN Poles FROM table", "question": "What is the minimum amount of poles?", "sql": "SELECT MIN Poles FROM table", "source": "validation"}
25
+ {"text": "### Question:\nWhat is the average Episode # with a 7 share and 18\u201349 is less than 2 and the Air Date of may 21, 2009?\n\n### SQL:\nSELECT AVG Episode # FROM table WHERE Share = 7 AND 18\u201349 < 2 AND Air Date = may 21, 2009", "question": "What is the average Episode # with a 7 share and 18\u201349 is less than 2 and the Air Date of may 21, 2009?", "sql": "SELECT AVG Episode # FROM table WHERE Share = 7 AND 18\u201349 < 2 AND Air Date = may 21, 2009", "source": "validation"}
26
+ {"text": "### Question:\nWhat is the most lost games for the team with a difference smaller than 86 and points of 32?\n\n### SQL:\nSELECT MAX Lost FROM table WHERE Points = 32 AND Difference < 86", "question": "What is the most lost games for the team with a difference smaller than 86 and points of 32?", "sql": "SELECT MAX Lost FROM table WHERE Points = 32 AND Difference < 86", "source": "validation"}
27
+ {"text": "### Question:\nwhat's the\u00a0first elected\u00a0with\u00a0district\u00a0being florida 7\n\n### SQL:\nSELECT First elected FROM table WHERE District = Florida 7", "question": "what's the\u00a0first elected\u00a0with\u00a0district\u00a0being florida 7", "sql": "SELECT First elected FROM table WHERE District = Florida 7", "source": "validation"}
28
+ {"text": "### Question:\nWhat is the average number of points for a song ranked 2nd with a draw greater than 3?\n\n### SQL:\nSELECT AVG Points FROM table WHERE Rank = 2nd AND Draw > 3", "question": "What is the average number of points for a song ranked 2nd with a draw greater than 3?", "sql": "SELECT AVG Points FROM table WHERE Rank = 2nd AND Draw > 3", "source": "validation"}
29
+ {"text": "### Question:\nWhat is the destination when the train number is 16526?\n\n### SQL:\nSELECT Destination FROM table WHERE Train number = 16526", "question": "What is the destination when the train number is 16526?", "sql": "SELECT Destination FROM table WHERE Train number = 16526", "source": "validation"}
30
+ {"text": "### Question:\nWhat is the highest game that has 32 points and a team rank larger than 4 named montepaschi siena\n\n### SQL:\nSELECT MAX Games FROM table WHERE Points = 32 AND Team = montepaschi siena AND Rank > 4", "question": "What is the highest game that has 32 points and a team rank larger than 4 named montepaschi siena", "sql": "SELECT MAX Games FROM table WHERE Points = 32 AND Team = montepaschi siena AND Rank > 4", "source": "validation"}
31
+ {"text": "### Question:\nWhat is Episode, when Jeremy's Guest is \"Pauline McLynn\"?\n\n### SQL:\nSELECT Episode FROM table WHERE Jeremy's guest = pauline mclynn", "question": "What is Episode, when Jeremy's Guest is \"Pauline McLynn\"?", "sql": "SELECT Episode FROM table WHERE Jeremy's guest = pauline mclynn", "source": "validation"}
32
+ {"text": "### Question:\nWhat is the poor law union of the Kilmaloda townland?\n\n### SQL:\nSELECT Poor law union FROM table WHERE Townland = Kilmaloda", "question": "What is the poor law union of the Kilmaloda townland?", "sql": "SELECT Poor law union FROM table WHERE Townland = Kilmaloda", "source": "validation"}
33
+ {"text": "### Question:\nWhat is the largest pick in round 8?\n\n### SQL:\nSELECT MAX Pick FROM table WHERE Round = 8", "question": "What is the largest pick in round 8?", "sql": "SELECT MAX Pick FROM table WHERE Round = 8", "source": "validation"}
34
+ {"text": "### Question:\nOn what date was the attendance at TD Garden 18,624?\n\n### SQL:\nSELECT Date FROM table WHERE Location Attendance = TD Garden 18,624", "question": "On what date was the attendance at TD Garden 18,624?", "sql": "SELECT Date FROM table WHERE Location Attendance = TD Garden 18,624", "source": "validation"}
35
+ {"text": "### Question:\nWhat is canada's margin?\n\n### SQL:\nSELECT SUM Margin FROM table WHERE Country = canada", "question": "What is canada's margin?", "sql": "SELECT SUM Margin FROM table WHERE Country = canada", "source": "validation"}
36
+ {"text": "### Question:\nWhat Sweet Sixteen team is in the Colonial conference?\n\n### SQL:\nSELECT Sweet Sixteen FROM table WHERE Conference = colonial", "question": "What Sweet Sixteen team is in the Colonial conference?", "sql": "SELECT Sweet Sixteen FROM table WHERE Conference = colonial", "source": "validation"}
37
+ {"text": "### Question:\nHow many resorts have 118 runs?\n\n### SQL:\nSELECT COUNT Name FROM table WHERE Runs = 118", "question": "How many resorts have 118 runs?", "sql": "SELECT COUNT Name FROM table WHERE Runs = 118", "source": "validation"}
38
+ {"text": "### Question:\nWho was the winner against Lindsay Davenport?\n\n### SQL:\nSELECT Winner FROM table WHERE Finalist = lindsay davenport", "question": "Who was the winner against Lindsay Davenport?", "sql": "SELECT Winner FROM table WHERE Finalist = lindsay davenport", "source": "validation"}
39
+ {"text": "### Question:\nHow many laps for a grid larger than 1 with a Time/Retired of halfshaft?\n\n### SQL:\nSELECT Laps FROM table WHERE Grid > 1 AND Time/Retired = halfshaft", "question": "How many laps for a grid larger than 1 with a Time/Retired of halfshaft?", "sql": "SELECT Laps FROM table WHERE Grid > 1 AND Time/Retired = halfshaft", "source": "validation"}
40
+ {"text": "### Question:\nIn what year was the feature at a 33.3S latitude named? \n\n### SQL:\nSELECT MAX Year named FROM table WHERE Latitude = 33.3S", "question": "In what year was the feature at a 33.3S latitude named? ", "sql": "SELECT MAX Year named FROM table WHERE Latitude = 33.3S", "source": "validation"}
41
+ {"text": "### Question:\nWhich Thirds (Under 17's) have a Reserve of barnawartha?\n\n### SQL:\nSELECT Thirds (Under 17's) FROM table WHERE Reserves = barnawartha", "question": "Which Thirds (Under 17's) have a Reserve of barnawartha?", "sql": "SELECT Thirds (Under 17's) FROM table WHERE Reserves = barnawartha", "source": "validation"}
42
+ {"text": "### Question:\nWhat was the outcome of the match against Stacy Margolin?\n\n### SQL:\nSELECT Outcome FROM table WHERE Opponent = stacy margolin", "question": "What was the outcome of the match against Stacy Margolin?", "sql": "SELECT Outcome FROM table WHERE Opponent = stacy margolin", "source": "validation"}
43
+ {"text": "### Question:\nIf the working force of HK is 10.4%, what is the salary range?\n\n### SQL:\nSELECT Salary range FROM table WHERE Working force of HK = 10.4%", "question": "If the working force of HK is 10.4%, what is the salary range?", "sql": "SELECT Salary range FROM table WHERE Working force of HK = 10.4%", "source": "validation"}
44
+ {"text": "### Question:\nWhat is the sum of the pick from texas a&i college with a round greater than 1?\n\n### SQL:\nSELECT SUM Pick FROM table WHERE College = texas a&i AND Round > 1", "question": "What is the sum of the pick from texas a&i college with a round greater than 1?", "sql": "SELECT SUM Pick FROM table WHERE College = texas a&i AND Round > 1", "source": "validation"}
45
+ {"text": "### Question:\nWhich Second has a Lead of ben hebert?\n\n### SQL:\nSELECT Second FROM table WHERE Lead = ben hebert", "question": "Which Second has a Lead of ben hebert?", "sql": "SELECT Second FROM table WHERE Lead = ben hebert", "source": "validation"}
46
+ {"text": "### Question:\nWhich Genre has a Game of donkey kong country?\n\n### SQL:\nSELECT Genre FROM table WHERE Game = donkey kong country", "question": "Which Genre has a Game of donkey kong country?", "sql": "SELECT Genre FROM table WHERE Game = donkey kong country", "source": "validation"}
47
+ {"text": "### Question:\nWhat is the location of the Carousel toll plaza?\n\n### SQL:\nSELECT Location FROM table WHERE Name = Carousel Toll Plaza", "question": "What is the location of the Carousel toll plaza?", "sql": "SELECT Location FROM table WHERE Name = Carousel Toll Plaza", "source": "validation"}
48
+ {"text": "### Question:\nWhat is Turkey's average Gold entry that also has a Bronze entry that is smaller than 2 and the Total is greater than 1?\n\n### SQL:\nSELECT AVG Gold FROM table WHERE Bronze < 2 AND Nation = turkey AND Total > 1", "question": "What is Turkey's average Gold entry that also has a Bronze entry that is smaller than 2 and the Total is greater than 1?", "sql": "SELECT AVG Gold FROM table WHERE Bronze < 2 AND Nation = turkey AND Total > 1", "source": "validation"}
49
+ {"text": "### Question:\nWhich Class has a Quantity made of 29?\n\n### SQL:\nSELECT Class FROM table WHERE Quantity made = 29", "question": "Which Class has a Quantity made of 29?", "sql": "SELECT Class FROM table WHERE Quantity made = 29", "source": "validation"}
50
+ {"text": "### Question:\nWhich Oberliga Bayern has a Season of 1981-82?\n\n### SQL:\nSELECT Oberliga Bayern FROM table WHERE Season = 1981-82", "question": "Which Oberliga Bayern has a Season of 1981-82?", "sql": "SELECT Oberliga Bayern FROM table WHERE Season = 1981-82", "source": "validation"}
51
+ {"text": "### Question:\nWhat is the number of podiums with 0 wins, 0 F.L. and 35 points?\n\n### SQL:\nSELECT Podiums FROM table WHERE Wins = 0 AND F.L. = 0 AND Points = 35", "question": "What is the number of podiums with 0 wins, 0 F.L. and 35 points?", "sql": "SELECT Podiums FROM table WHERE Wins = 0 AND F.L. = 0 AND Points = 35", "source": "validation"}
52
+ {"text": "### Question:\nWho was the publisher of Martial Law: Dead Ringers?\n\n### SQL:\nSELECT Publisher FROM table WHERE Release title = martial law: dead ringers", "question": "Who was the publisher of Martial Law: Dead Ringers?", "sql": "SELECT Publisher FROM table WHERE Release title = martial law: dead ringers", "source": "validation"}
53
+ {"text": "### Question:\nWhat is the Almali village with the S\u00fcsk\u0259n village z\u0259rn\u0259?\n\n### SQL:\nSELECT Almal\u0131 (Qax) FROM table WHERE S\u00fcsk\u0259n = z\u0259rn\u0259", "question": "What is the Almali village with the S\u00fcsk\u0259n village z\u0259rn\u0259?", "sql": "SELECT Almal\u0131 (Qax) FROM table WHERE S\u00fcsk\u0259n = z\u0259rn\u0259", "source": "validation"}
54
+ {"text": "### Question:\nName the typed for formed from 6-pul trailer third in res unit\n\n### SQL:\nSELECT Type FROM table WHERE Formed from = 6-pul trailer third in res unit", "question": "Name the typed for formed from 6-pul trailer third in res unit", "sql": "SELECT Type FROM table WHERE Formed from = 6-pul trailer third in res unit", "source": "validation"}
55
+ {"text": "### Question:\nName the 2009/10 with 2011/12 of lq and 2008/09 of not held\n\n### SQL:\nSELECT 2009/ 10 FROM table WHERE 2011/ 12 = lq AND 2008/ 09 = not held", "question": "Name the 2009/10 with 2011/12 of lq and 2008/09 of not held", "sql": "SELECT 2009/ 10 FROM table WHERE 2011/ 12 = lq AND 2008/ 09 = not held", "source": "validation"}
56
+ {"text": "### Question:\nWhat is the tyres for the JBW type 2 chassis?\n\n### SQL:\nSELECT Tyres FROM table WHERE Chassis = jbw type 2", "question": "What is the tyres for the JBW type 2 chassis?", "sql": "SELECT Tyres FROM table WHERE Chassis = jbw type 2", "source": "validation"}
57
+ {"text": "### Question:\nHow many total appearances (league only) have a name of gavin dykes?\n\n### SQL:\nSELECT Total Appearances(league only) FROM table WHERE Name = gavin dykes", "question": "How many total appearances (league only) have a name of gavin dykes?", "sql": "SELECT Total Appearances(league only) FROM table WHERE Name = gavin dykes", "source": "validation"}
58
+ {"text": "### Question:\nWhat is the sum of laps that has a car number of larger than 1, is a ford, and has 155 points?\n\n### SQL:\nSELECT SUM Laps FROM table WHERE Car # > 1 AND Make = ford AND Points = 155", "question": "What is the sum of laps that has a car number of larger than 1, is a ford, and has 155 points?", "sql": "SELECT SUM Laps FROM table WHERE Car # > 1 AND Make = ford AND Points = 155", "source": "validation"}
59
+ {"text": "### Question:\nWhat was the average crowd size of games held at Glenferrie Oval?\n\n### SQL:\nSELECT AVG Crowd FROM table WHERE Venue = glenferrie oval", "question": "What was the average crowd size of games held at Glenferrie Oval?", "sql": "SELECT AVG Crowd FROM table WHERE Venue = glenferrie oval", "source": "validation"}
60
+ {"text": "### Question:\nWhat version of iWork was released on October 22, 2013 with a pages version greater than 2?\n\n### SQL:\nSELECT iWork version FROM table WHERE Release date = october 22, 2013 AND Pages version > 2", "question": "What version of iWork was released on October 22, 2013 with a pages version greater than 2?", "sql": "SELECT iWork version FROM table WHERE Release date = october 22, 2013 AND Pages version > 2", "source": "validation"}
61
+ {"text": "### Question:\nName the player for chicago black hawks\n\n### SQL:\nSELECT Player FROM table WHERE NHL team = Chicago Black Hawks", "question": "Name the player for chicago black hawks", "sql": "SELECT Player FROM table WHERE NHL team = Chicago Black Hawks", "source": "validation"}
62
+ {"text": "### Question:\nWhat is the streak for game 2?\n\n### SQL:\nSELECT Streak FROM table WHERE Game = 2", "question": "What is the streak for game 2?", "sql": "SELECT Streak FROM table WHERE Game = 2", "source": "validation"}
63
+ {"text": "### Question:\nI want the D 45 and D 42 of r 22\n\n### SQL:\nSELECT D 45 FROM table WHERE D 42 = r 22", "question": "I want the D 45 and D 42 of r 22", "sql": "SELECT D 45 FROM table WHERE D 42 = r 22", "source": "validation"}
64
+ {"text": "### Question:\nWho was the away team when Queensland Roar was the home team in the round less than 3?\n\n### SQL:\nSELECT Away Team FROM table WHERE Round < 3 AND Home Team = queensland roar", "question": "Who was the away team when Queensland Roar was the home team in the round less than 3?", "sql": "SELECT Away Team FROM table WHERE Round < 3 AND Home Team = queensland roar", "source": "validation"}
65
+ {"text": "### Question:\nHow many artists were there for the show thoroughly modern millie?\n\n### SQL:\nSELECT COUNT Artist FROM table WHERE Show = Thoroughly Modern Millie", "question": "How many artists were there for the show thoroughly modern millie?", "sql": "SELECT COUNT Artist FROM table WHERE Show = Thoroughly Modern Millie", "source": "validation"}
66
+ {"text": "### Question:\nWhich wrestling event was at the 2008 Beijing games?\n\n### SQL:\nSELECT Event FROM table WHERE Sport = wrestling AND Games = 2008 beijing", "question": "Which wrestling event was at the 2008 Beijing games?", "sql": "SELECT Event FROM table WHERE Sport = wrestling AND Games = 2008 beijing", "source": "validation"}
67
+ {"text": "### Question:\nWho was the opponent in London, England in a round less than 2?\n\n### SQL:\nSELECT Opponent FROM table WHERE Location = london, england AND Round < 2", "question": "Who was the opponent in London, England in a round less than 2?", "sql": "SELECT Opponent FROM table WHERE Location = london, england AND Round < 2", "source": "validation"}
68
+ {"text": "### Question:\nWhat is the ceremony year when Ganito Kami Noon, Paano Kayo Ngayon was the original title?\n\n### SQL:\nSELECT Year (Ceremony) FROM table WHERE Original title = ganito kami noon, paano kayo ngayon", "question": "What is the ceremony year when Ganito Kami Noon, Paano Kayo Ngayon was the original title?", "sql": "SELECT Year (Ceremony) FROM table WHERE Original title = ganito kami noon, paano kayo ngayon", "source": "validation"}
69
+ {"text": "### Question:\nWhen the total score is 740, what is tromso?\n\n### SQL:\nSELECT MIN Troms\u00f8 FROM table WHERE Total = 740", "question": "When the total score is 740, what is tromso?", "sql": "SELECT MIN Troms\u00f8 FROM table WHERE Total = 740", "source": "validation"}
70
+ {"text": "### Question:\nWhat is the result for director Said Elmarouk before 2008?\n\n### SQL:\nSELECT Result FROM table WHERE Director = said elmarouk AND Year < 2008", "question": "What is the result for director Said Elmarouk before 2008?", "sql": "SELECT Result FROM table WHERE Director = said elmarouk AND Year < 2008", "source": "validation"}
71
+ {"text": "### Question:\nWhen was the score 56-26?\n\n### SQL:\nSELECT Date FROM table WHERE Record = 56-26", "question": "When was the score 56-26?", "sql": "SELECT Date FROM table WHERE Record = 56-26", "source": "validation"}
72
+ {"text": "### Question:\nName the D 44 when it has a D 46 of d 31\n\n### SQL:\nSELECT D 44 FROM table WHERE D 46 = d 31", "question": "Name the D 44 when it has a D 46 of d 31", "sql": "SELECT D 44 FROM table WHERE D 46 = d 31", "source": "validation"}
73
+ {"text": "### Question:\nWhat was the date of the race that lasted 6 hours?\n\n### SQL:\nSELECT Date FROM table WHERE Length/Duration = 6 hours", "question": "What was the date of the race that lasted 6 hours?", "sql": "SELECT Date FROM table WHERE Length/Duration = 6 hours", "source": "validation"}
74
+ {"text": "### Question:\nWhich event is in the 1952 summer olympics?\n\n### SQL:\nSELECT Event FROM table WHERE Olympics = 1952 summer olympics", "question": "Which event is in the 1952 summer olympics?", "sql": "SELECT Event FROM table WHERE Olympics = 1952 summer olympics", "source": "validation"}
75
+ {"text": "### Question:\n the 2010 clausura tournament?\n\n### SQL:\nSELECT Coefficient FROM table WHERE Tournament = 2010 Clausura", "question": " the 2010 clausura tournament?", "sql": "SELECT Coefficient FROM table WHERE Tournament = 2010 Clausura", "source": "validation"}
76
+ {"text": "### Question:\nWhat was the score of the BCS National Championship game?\n\n### SQL:\nSELECT Score FROM table WHERE Bowl Game = bcs national championship", "question": "What was the score of the BCS National Championship game?", "sql": "SELECT Score FROM table WHERE Bowl Game = bcs national championship", "source": "validation"}
77
+ {"text": "### Question:\nWhat was the attendance when their record stood at 0-2-2?\n\n### SQL:\nSELECT SUM Attendance FROM table WHERE Record = 0-2-2", "question": "What was the attendance when their record stood at 0-2-2?", "sql": "SELECT SUM Attendance FROM table WHERE Record = 0-2-2", "source": "validation"}
78
+ {"text": "### Question:\nWhat were the results before the year 2000?\n\n### SQL:\nSELECT Result FROM table WHERE Year < 2000", "question": "What were the results before the year 2000?", "sql": "SELECT Result FROM table WHERE Year < 2000", "source": "validation"}
79
+ {"text": "### Question:\nHow much time is required for less than 35 laps and less than 10 grids?\n\n### SQL:\nSELECT Time/Retired FROM table WHERE Laps < 35 AND Grid < 10", "question": "How much time is required for less than 35 laps and less than 10 grids?", "sql": "SELECT Time/Retired FROM table WHERE Laps < 35 AND Grid < 10", "source": "validation"}
80
+ {"text": "### Question:\nWhen oslo is 48, what is stavanger?\n\n### SQL:\nSELECT MIN Stavanger FROM table WHERE Oslo = 48", "question": "When oslo is 48, what is stavanger?", "sql": "SELECT MIN Stavanger FROM table WHERE Oslo = 48", "source": "validation"}
81
+ {"text": "### Question:\nWhen was the year that had an average attendance of 5,445?\n\n### SQL:\nSELECT Year FROM table WHERE Avg. attendance = 5,445", "question": "When was the year that had an average attendance of 5,445?", "sql": "SELECT Year FROM table WHERE Avg. attendance = 5,445", "source": "validation"}
82
+ {"text": "### Question:\nFor the game with 528 attendance, what was the result?\n\n### SQL:\nSELECT Result FROM table WHERE Attendance = 528", "question": "For the game with 528 attendance, what was the result?", "sql": "SELECT Result FROM table WHERE Attendance = 528", "source": "validation"}
83
+ {"text": "### Question:\nWhat dated was the game played at the location delta center 19,911?\n\n### SQL:\nSELECT Date FROM table WHERE Location Attendance = Delta Center 19,911", "question": "What dated was the game played at the location delta center 19,911?", "sql": "SELECT Date FROM table WHERE Location Attendance = Delta Center 19,911", "source": "validation"}
84
+ {"text": "### Question:\nWhat is the ISBN of \"Dead as a Doornail?\n\n### SQL:\nSELECT Paperback FROM table WHERE Title = Dead as a Doornail", "question": "What is the ISBN of \"Dead as a Doornail?", "sql": "SELECT Paperback FROM table WHERE Title = Dead as a Doornail", "source": "validation"}
85
+ {"text": "### Question:\nWhat scored is recorded on April 24?\n\n### SQL:\nSELECT Score FROM table WHERE Date = april 24", "question": "What scored is recorded on April 24?", "sql": "SELECT Score FROM table WHERE Date = april 24", "source": "validation"}
86
+ {"text": "### Question:\nWho acquired tom norton?\n\n### SQL:\nSELECT Acquired FROM table WHERE Player = tom norton", "question": "Who acquired tom norton?", "sql": "SELECT Acquired FROM table WHERE Player = tom norton", "source": "validation"}
87
+ {"text": "### Question:\nWHAT IS THE HIGHEST VIEWERS WITH AN EPISODE LESS THAN 15 AND SHARE LAGER THAN 7?\n\n### SQL:\nSELECT MAX Viewers (millions) FROM table WHERE Episode number < 15 AND Share > 7", "question": "WHAT IS THE HIGHEST VIEWERS WITH AN EPISODE LESS THAN 15 AND SHARE LAGER THAN 7?", "sql": "SELECT MAX Viewers (millions) FROM table WHERE Episode number < 15 AND Share > 7", "source": "validation"}
88
+ {"text": "### Question:\nWhat is the English name given to the city of St. John's?\n\n### SQL:\nSELECT Capital ( exonym ) FROM table WHERE Capital ( endonym ) = St. John's", "question": "What is the English name given to the city of St. John's?", "sql": "SELECT Capital ( exonym ) FROM table WHERE Capital ( endonym ) = St. John's", "source": "validation"}
89
+ {"text": "### Question:\nWhat was the result of the game played on November 23, 2003?\n\n### SQL:\nSELECT Result FROM table WHERE Date = november 23, 2003", "question": "What was the result of the game played on November 23, 2003?", "sql": "SELECT Result FROM table WHERE Date = november 23, 2003", "source": "validation"}
90
+ {"text": "### Question:\nWho directed An Egg Scramble?\n\n### SQL:\nSELECT Director FROM table WHERE Title = an egg scramble", "question": "Who directed An Egg Scramble?", "sql": "SELECT Director FROM table WHERE Title = an egg scramble", "source": "validation"}
91
+ {"text": "### Question:\nWhat is the Bulgarian Commander of the Battle of Rusion?\n\n### SQL:\nSELECT Bulgarian Commander FROM table WHERE Battle = battle of rusion", "question": "What is the Bulgarian Commander of the Battle of Rusion?", "sql": "SELECT Bulgarian Commander FROM table WHERE Battle = battle of rusion", "source": "validation"}
92
+ {"text": "### Question:\nWhat was the location of the game when the record was 12-4?\n\n### SQL:\nSELECT Location FROM table WHERE Record = 12-4", "question": "What was the location of the game when the record was 12-4?", "sql": "SELECT Location FROM table WHERE Record = 12-4", "source": "validation"}
93
+ {"text": "### Question:\nWhat Service Name has UTV as the owner?\n\n### SQL:\nSELECT Service name FROM table WHERE Owner = utv", "question": "What Service Name has UTV as the owner?", "sql": "SELECT Service name FROM table WHERE Owner = utv", "source": "validation"}
94
+ {"text": "### Question:\nWhich Rebuilt has a Name as rebuilt of binevanagh?\n\n### SQL:\nSELECT Rebuilt FROM table WHERE Name as rebuilt = binevanagh", "question": "Which Rebuilt has a Name as rebuilt of binevanagh?", "sql": "SELECT Rebuilt FROM table WHERE Name as rebuilt = binevanagh", "source": "validation"}
95
+ {"text": "### Question:\nwhat is the margin of victory when the runner-up is amy alcott and the winning score is \u20139 (72-68-67=207)?\n\n### SQL:\nSELECT Margin of victory FROM table WHERE Runner(s)-up = amy alcott AND Winning score = \u20139 (72-68-67=207)", "question": "what is the margin of victory when the runner-up is amy alcott and the winning score is \u20139 (72-68-67=207)?", "sql": "SELECT Margin of victory FROM table WHERE Runner(s)-up = amy alcott AND Winning score = \u20139 (72-68-67=207)", "source": "validation"}
96
+ {"text": "### Question:\nWhat is the height of the building with 40 floors?\n\n### SQL:\nSELECT Height ft / m FROM table WHERE Floors = 40", "question": "What is the height of the building with 40 floors?", "sql": "SELECT Height ft / m FROM table WHERE Floors = 40", "source": "validation"}
97
+ {"text": "### Question:\nWhat is the total number drawn with goals against less than 55, and a total of 14 losses?\n\n### SQL:\nSELECT COUNT Drawn FROM table WHERE Goals Against < 55 AND Lost = 14", "question": "What is the total number drawn with goals against less than 55, and a total of 14 losses?", "sql": "SELECT COUNT Drawn FROM table WHERE Goals Against < 55 AND Lost = 14", "source": "validation"}
98
+ {"text": "### Question:\nWhich engine from 1973 has a Brabham bt37 chassis?\n\n### SQL:\nSELECT Engine FROM table WHERE Year = 1973 AND Chassis = brabham bt37", "question": "Which engine from 1973 has a Brabham bt37 chassis?", "sql": "SELECT Engine FROM table WHERE Year = 1973 AND Chassis = brabham bt37", "source": "validation"}
99
+ {"text": "### Question:\nTell me the final score for january 9 for cincinnati bengals\n\n### SQL:\nSELECT Final Score FROM table WHERE Date = january 9 AND Host Team = cincinnati bengals", "question": "Tell me the final score for january 9 for cincinnati bengals", "sql": "SELECT Final Score FROM table WHERE Date = january 9 AND Host Team = cincinnati bengals", "source": "validation"}
100
+ {"text": "### Question:\nWhat player was place of t1 in To Par and had a score of 70-73-69=212?\n\n### SQL:\nSELECT To par FROM table WHERE Place = t1 AND Score = 70-73-69=212", "question": "What player was place of t1 in To Par and had a score of 70-73-69=212?", "sql": "SELECT To par FROM table WHERE Place = t1 AND Score = 70-73-69=212", "source": "validation"}
src/outputs/finetuning/visualizations/01_metrics_overview.png ADDED
src/outputs/finetuning/visualizations/02_token_accuracy_dist.png ADDED
src/outputs/finetuning/visualizations/03_keyword_accuracy_dist.png ADDED
src/outputs/finetuning/visualizations/04_training_loss.png ADDED
src/outputs/rag/reports/knowledge_base_report.md ADDED
@@ -0,0 +1,46 @@
1
+ # RAG Knowledge Base Report
2
+
3
+ **Generated:** 2025-12-07 23:58:56
4
+
5
+ ## Overview
6
+
7
+ | Metric | Value |
8
+ |--------|-------|
9
+ | Total Documents | 80,654 |
10
+ | Collection Name | sql_knowledge |
11
+ | Embedding Model | all-MiniLM-L6-v2 |
12
+
13
+ ## Data Sources
14
+
15
+ | Source | Documents |
16
+ |--------|-----------|
17
+ | train | 56,355 |
18
+ | validation | 8,421 |
19
+ | test | 15,878 |
20
+
21
+ ## Chunking Strategies
22
+
23
+ 1. **SQL Clause Extraction**: Identifies SELECT, FROM, WHERE, GROUP BY, etc.
24
+ 2. **Complexity Classification**: Categorizes as simple/intermediate/complex
25
+ 3. **Keyword Extraction**: Extracts SQL operations (JOIN, COUNT, etc.)
26
+ 4. **Size Categorization**: Classifies question/SQL length
27
+
28
+ ## Complexity Distribution
29
+
30
+ | Level | Count |
31
+ |-------|-------|
32
+ | Simple | 80,396 |
33
+ | Intermediate | 258 |
34
+ | Complex | 0 |
35
+
36
+ ## Document Metadata Structure
37
+
38
+ Each document contains:
39
+ - `sql`: The SQL query
40
+ - `source`: Origin dataset
41
+ - `question`: Original question
42
+ - `complexity`: simple/intermediate/complex
43
+ - `sql_clauses`: Comma-separated clauses
44
+ - `keywords`: SQL keywords found
45
+ - `question_size`: short/medium/long
46
+ - `sql_size`: short/medium/long
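
> Editor's note: the metadata fields listed above can double as retrieval filters. A minimal usage sketch (not part of this commit), assuming the collection name, storage path, and embedding model defined in `src/rag/knowledge_base.py` and `src/rag/embeddings.py`:

```python
# Sketch: query the knowledge base with a metadata filter on complexity.
import chromadb
from sentence_transformers import SentenceTransformer

client = chromadb.PersistentClient(path="chromadb_data")   # same dir as knowledge_base.py
collection = client.get_collection("sql_knowledge")        # same collection name
model = SentenceTransformer("all-MiniLM-L6-v2")             # same embedding model

query = "count employees per department"
embedding = model.encode(query).tolist()

# Restrict retrieval to intermediate-complexity examples via the chunking metadata.
results = collection.query(
    query_embeddings=[embedding],
    n_results=3,
    where={"complexity": "intermediate"},
)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(doc, "->", meta["sql"])
```
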
src/outputs/rag/stats/knowledge_base_stats.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "total_documents": 80654,
3
+ "sources": {
4
+ "train": 56355,
5
+ "validation": 8421,
6
+ "test": 15878
7
+ },
8
+ "collection_name": "sql_knowledge",
9
+ "embedding_model": "all-MiniLM-L6-v2",
10
+ "chunking_strategies": [
11
+ "sql_clause_extraction",
12
+ "complexity_classification",
13
+ "keyword_extraction",
14
+ "size_categorization"
15
+ ],
16
+ "complexity_distribution": {
17
+ "simple": 80396,
18
+ "intermediate": 258,
19
+ "complex": 0
20
+ },
21
+ "created_at": "2025-12-07T23:58:56.309706"
22
+ }
src/outputs/synthetic/reports/synthetic_report.md ADDED
@@ -0,0 +1,47 @@
1
+ # Synthetic Data Generation Report
2
+
3
+ **Generated:** 2025-12-07 23:24:17
4
+
5
+ ## Dataset Statistics
6
+
7
+ | Metric | Original | Synthetic |
8
+ |--------|----------|-----------|
9
+ | Samples | 52,527 | 142,639 |
10
+ | Avg Length | 11.64 | 14.75 |
11
+ | Min Length | 3 | 3 |
12
+ | Max Length | 44 | 49 |
13
+ | Unique Words | 50,846 | 60,734 |
14
+
15
+ ## Augmentation Results
16
+
17
+ - **Augmentation Factor:** 2.72x
18
+ - **Avg Diversity Score:** 0.2832
19
+ - **Min Diversity Score:** 0.103
20
+ - **Max Diversity Score:** 0.8
21
+
22
+ ## Techniques Used
23
+
24
+ 1. Synonym Replacement (40% probability)
25
+ 2. Random Insertion (15% probability)
26
+ 3. Random Swap (10% probability)
27
+ 4. Structure Variation (prefix/suffix)
28
+ 5. Case Variation
29
+
30
+ ## Quality Controls
31
+
32
+ - Minimum question length: 10 characters
33
+ - Maximum question length: 500 characters
34
+ - Minimum diversity score: 0.1
35
+ - Duplicate removal via MD5 hashing
36
+
37
+ ## Privacy Measures
38
+
39
+ - Email anonymization
40
+ - Phone number anonymization
41
+ - SSN anonymization
42
+
43
+ ## Visualizations
44
+
45
+ - `01_size_comparison.png` - Dataset size comparison
46
+ - `02_length_distribution.png` - Question length distribution
47
+ - `03_diversity_distribution.png` - Diversity score distribution
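
> Editor's note: the augmentation itself is implemented in `src/synthetic/generate_data.py` (not reproduced here). A toy sketch of the synonym-replacement and MD5 de-duplication steps described above; the synonym map and function names below are hypothetical, not the project's own:

```python
# Illustrative sketch only -- probabilities mirror the report (40% synonym replacement).
import hashlib
import random

SYNONYMS = {"show": ["display", "list"], "find": ["get", "retrieve"]}  # toy synonym map

def augment_question(question: str, p_synonym: float = 0.4) -> str:
    """Replace known words with a synonym with probability p_synonym."""
    out = []
    for word in question.split():
        key = word.lower()
        if key in SYNONYMS and random.random() < p_synonym:
            out.append(random.choice(SYNONYMS[key]))
        else:
            out.append(word)
    return " ".join(out)

def dedupe(questions):
    """Drop duplicates via MD5 hashing, mirroring the quality control above."""
    seen, unique = set(), []
    for q in questions:
        h = hashlib.md5(q.lower().encode()).hexdigest()
        if h not in seen:
            seen.add(h)
            unique.append(q)
    return unique
```
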
src/outputs/synthetic/stats/statistics.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "original": {
3
+ "name": "Original",
4
+ "samples": 52527,
5
+ "avg_length": 11.64,
6
+ "min_length": 3,
7
+ "max_length": 44,
8
+ "unique_words": 50846
9
+ },
10
+ "synthetic": {
11
+ "name": "Synthetic",
12
+ "samples": 142639,
13
+ "avg_length": 14.75,
14
+ "min_length": 3,
15
+ "max_length": 49,
16
+ "unique_words": 60734
17
+ },
18
+ "diversity": {
19
+ "avg": 0.2832,
20
+ "min": 0.103,
21
+ "max": 0.8
22
+ },
23
+ "augmentation_factor": 2.72
24
+ }
src/outputs/synthetic/visualizations/01_size_comparison.png ADDED
src/outputs/synthetic/visualizations/02_length_distribution.png ADDED
src/outputs/synthetic/visualizations/03_diversity_distribution.png ADDED
src/pipeline/integrated.py ADDED
@@ -0,0 +1,584 @@
1
+ """
2
+ Integrated Pipeline: RAG + Fine-tuned Model + Gemini Enhancement
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ import json
8
+ from datetime import datetime
9
+ from dotenv import load_dotenv
10
+
11
+ # Load environment variables from .env file
12
+ load_dotenv()
13
+
14
+ # Add parent directory
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ # =============================================================================
18
+ # CONFIGURATION
19
+ # =============================================================================
20
+
21
+ OUTPUT_DIR = "outputs/pipeline"
22
+ LOGS_DIR = f"{OUTPUT_DIR}/logs"
23
+
24
+ # Gemini config - loaded from .env with fallbacks
25
+ GEMINI_KEYS = [
26
+ os.getenv("GEMINI_API_KEY"),
27
+ os.getenv("GEMINI_API_KEY_FALLBACK_1"),
28
+ os.getenv("GEMINI_API_KEY_FALLBACK_2"),
29
+ ]
30
+ # Remove None values
31
+ GEMINI_KEYS = [k for k in GEMINI_KEYS if k]
32
+
33
+ GEMINI_MODELS = [
34
+ os.getenv("GEMINI_MODEL", "gemini-2.5-flash"),
35
+ os.getenv("GEMINI_MODEL_FALLBACK_1"),
36
+ ]
37
+ # Remove None values
38
+ GEMINI_MODELS = [m for m in GEMINI_MODELS if m]
39
+
40
+ if not GEMINI_KEYS:
41
+ print("⚠️ Warning: No GEMINI_API_KEY found in .env file")
42
+ else:
43
+ print(f"✓ Found {len(GEMINI_KEYS)} Gemini API key(s)")
44
+ print(f"✓ Found {len(GEMINI_MODELS)} Gemini model(s)")
45
+
46
+ def setup_directories():
47
+ for d in [OUTPUT_DIR, LOGS_DIR]:
48
+ os.makedirs(d, exist_ok=True)
49
+
50
+ # =============================================================================
51
+ # GEMINI CLIENT WITH FALLBACK
52
+ # =============================================================================
53
+
54
+ class GeminiClient:
55
+ """Gemini client with automatic fallback for rate limits."""
56
+
57
+ def __init__(self):
58
+ self.genai = None
59
+ self.current_key_idx = 0
60
+ self.current_model_idx = 0
61
+ self.model = None
62
+ self.initialized = False
63
+
64
+ try:
65
+ import google.generativeai as genai
66
+ self.genai = genai
67
+ self._init_model()
68
+ except ImportError:
69
+ print("✗ google-generativeai not installed")
70
+
71
+ def _init_model(self):
72
+ """Initialize model with current key and model."""
73
+ if not GEMINI_KEYS:
74
+ return False
75
+
76
+ key = GEMINI_KEYS[self.current_key_idx]
77
+ model_name = GEMINI_MODELS[self.current_model_idx]
78
+
79
+ try:
80
+ self.genai.configure(api_key=key)
81
+ self.model = self.genai.GenerativeModel(model_name)
82
+ self.initialized = True
83
+ print(f" Using API key #{self.current_key_idx + 1}, model: {model_name}")
84
+ return True
85
+ except Exception as e:
86
+ print(f" Failed to init Gemini: {e}")
87
+ return False
88
+
89
+ def _switch_to_next(self):
90
+ """Switch to next model or key combination."""
91
+ # Try next model with same key
92
+ if self.current_model_idx < len(GEMINI_MODELS) - 1:
93
+ self.current_model_idx += 1
94
+ print(f" ⟳ Switching to fallback model: {GEMINI_MODELS[self.current_model_idx]}")
95
+ return self._init_model()
96
+
97
+ # Try next key with first model
98
+ if self.current_key_idx < len(GEMINI_KEYS) - 1:
99
+ self.current_key_idx += 1
100
+ self.current_model_idx = 0
101
+ print(f" ⟳ Switching to fallback API key #{self.current_key_idx + 1}")
102
+ return self._init_model()
103
+
104
+ # No more fallbacks
105
+ print(" ✗ All Gemini keys/models exhausted")
106
+ return False
107
+
108
+ def generate(self, prompt, max_retries=None):
109
+ """Generate content with automatic fallback."""
110
+ if not self.initialized or not self.model:
111
+ return None, "Gemini not initialized"
112
+
113
+ # Calculate max retries based on available combinations
114
+ if max_retries is None:
115
+ max_retries = len(GEMINI_KEYS) * len(GEMINI_MODELS)
116
+
117
+ attempts = 0
118
+ while attempts < max_retries:
119
+ try:
120
+ response = self.model.generate_content(prompt)
121
+ return response.text.strip(), None
122
+ except Exception as e:
123
+ error_str = str(e)
124
+
125
+ # Check if rate limit error
126
+ if "429" in error_str or "quota" in error_str.lower() or "rate" in error_str.lower():
127
+ print(f" ⚠️ Rate limit hit")
128
+ if not self._switch_to_next():
129
+ return None, "All API keys exhausted"
130
+ attempts += 1
131
+ else:
132
+ # Other error, don't retry
133
+ return None, error_str
134
+
135
+ return None, "Max retries exceeded"
136
+
137
+ def is_available(self):
138
+ """Check if Gemini is available."""
139
+ return self.initialized and self.model is not None
140
+
141
+
142
+ # =============================================================================
143
+ # COMPONENT IMPORTS
144
+ # =============================================================================
145
+
146
+ def load_components():
147
+ """Load all pipeline components."""
148
+ components = {}
149
+
150
+ # 1. RAG Retriever (using SQLRetriever class)
151
+ try:
152
+ from rag.retriever import SQLRetriever
153
+ components['rag'] = SQLRetriever()
154
+ print("✓ RAG Retriever loaded")
155
+ except Exception as e:
156
+ components['rag'] = None
157
+ print(f"✗ RAG not available: {e}")
158
+
159
+ # 2. Prompt Builder
160
+ try:
161
+ from prompts.prompt_builder import PromptBuilder
162
+ components['prompt_builder'] = PromptBuilder()
163
+ print("✓ Prompt Builder loaded")
164
+ except Exception as e:
165
+ components['prompt_builder'] = None
166
+ print(f"✗ Prompt Builder not available: {e}")
167
+
168
+ # 3. Fine-tuned Model
169
+ try:
170
+ from finetuning.inference import SQLGenerator
171
+ components['finetuned_model'] = SQLGenerator()
172
+ print("✓ Fine-tuned model loaded")
173
+ except Exception as e:
174
+ components['finetuned_model'] = None
175
+ print(f"✗ Fine-tuned model not available: {e}")
176
+
177
+ # 4. Gemini with fallback support
178
+ try:
179
+ if GEMINI_KEYS:
180
+ components['gemini'] = GeminiClient()
181
+ if components['gemini'].is_available():
182
+ print("✓ Gemini loaded")
183
+ else:
184
+ components['gemini'] = None
185
+ print("✗ Gemini failed to initialize")
186
+ else:
187
+ components['gemini'] = None
188
+ print("✗ Gemini not available (no API keys)")
189
+ except Exception as e:
190
+ components['gemini'] = None
191
+ print(f"✗ Gemini not available: {e}")
192
+
193
+ return components
194
+
195
+ # =============================================================================
196
+ # GEMINI ENHANCEMENT PROMPTS
197
+ # =============================================================================
198
+
199
+ GEMINI_REFINE_PROMPT = """You are an SQL expert. Review and enhance this SQL query.
200
+
201
+ Original Question: {question}
202
+
203
+ Generated SQL (by a smaller model):
204
+ {sql}
205
+
206
+ Your tasks:
207
+ 1. Check for syntax errors
208
+ 2. Check for logical errors
209
+ 3. Optimize if possible
210
+ 4. Fix any issues
211
+
212
+ Rules:
213
+ - If the SQL is correct, return it unchanged
214
+ - If it needs fixes, return the corrected version
215
+ - Return ONLY the SQL query, no explanations
216
+
217
+ Enhanced SQL:"""
218
+
219
+ GEMINI_VALIDATE_PROMPT = """You are an SQL validator. Check this SQL query.
220
+
221
+ Question: {question}
222
+ SQL: {sql}
223
+
224
+ Respond in JSON format:
225
+ {{
226
+ "is_valid": true/false,
227
+ "errors": ["list of errors if any"],
228
+ "suggestions": ["list of suggestions if any"],
229
+ "confidence": 0.0-1.0
230
+ }}
231
+
232
+ JSON Response:"""
233
+
234
+ GEMINI_EXPLAIN_PROMPT = """Explain this SQL query in simple terms.
235
+
236
+ SQL: {sql}
237
+
238
+ Provide a brief, beginner-friendly explanation (2-3 sentences):"""
239
+
240
+ # =============================================================================
241
+ # PIPELINE CLASS
242
+ # =============================================================================
243
+
244
+ class IntegratedPipeline:
245
+ """
246
+ Complete pipeline: RAG → Prompt → Fine-tuned Model → Gemini Enhancement
247
+ """
248
+
249
+ def __init__(self):
250
+ setup_directories()
251
+ print("\n" + "=" * 50)
252
+ print("LOADING PIPELINE COMPONENTS")
253
+ print("=" * 50)
254
+ self.components = load_components()
255
+ print("=" * 50 + "\n")
256
+
257
+ # -------------------------------------------------------------------------
258
+ # STEP 1: RAG Retrieval
259
+ # -------------------------------------------------------------------------
260
+ def retrieve_context(self, question, top_k=3):
261
+ """Retrieve similar examples using RAG."""
262
+ if not self.components['rag']:
263
+ return "", []
264
+
265
+ try:
266
+ # Use SQLRetriever's retrieve method
267
+ results = self.components['rag'].retrieve(question, top_k=top_k)
268
+
269
+ # Format as context string
270
+ context = "Similar SQL examples:\n\n"
271
+ examples = []
272
+ for i, r in enumerate(results, 1):
273
+ context += f"Example {i}:\n"
274
+ context += f"Question: {r['question']}\n"
275
+ context += f"SQL: {r['sql']}\n\n"
276
+ examples.append(r)
277
+
278
+ return context, examples
279
+ except Exception as e:
280
+ print(f"RAG error: {e}")
281
+ return "", []
282
+
283
+ def retrieve_context_formatted(self, question, top_k=3):
284
+ """Use SQLRetriever's built-in context formatting."""
285
+ if not self.components['rag']:
286
+ return ""
287
+
288
+ try:
289
+ return self.components['rag'].retrieve_as_context(question, top_k=top_k)
290
+ except Exception as e:
291
+ print(f"RAG error: {e}")
292
+ return ""
293
+
294
+ # -------------------------------------------------------------------------
295
+ # STEP 2: Build Prompt
296
+ # -------------------------------------------------------------------------
297
+ def build_prompt(self, question, rag_context):
298
+ """Build prompt with context."""
299
+ if self.components['prompt_builder']:
300
+ result = self.components['prompt_builder'].build_prompt(
301
+ question=question,
302
+ rag_context=rag_context
303
+ )
304
+ if result['success']:
305
+ return result['prompt']
306
+
307
+ # Fallback: simple prompt
308
+ if rag_context:
309
+ return f"{rag_context}\nQuestion: {question}\n\nSQL:"
310
+ return f"Generate SQL for: {question}\n\nSQL:"
311
+
312
+ # -------------------------------------------------------------------------
313
+ # STEP 3: Fine-tuned Model Generation
314
+ # -------------------------------------------------------------------------
315
+ def generate_with_finetuned(self, question, context=""):
316
+ """Generate SQL using fine-tuned model."""
317
+ if not self.components['finetuned_model']:
318
+ return None, "Fine-tuned model not available"
319
+
320
+ try:
321
+ sql = self.components['finetuned_model'].generate(question, context)
322
+ return sql, None
323
+ except Exception as e:
324
+ return None, str(e)
325
+
326
+ # -------------------------------------------------------------------------
327
+ # STEP 4: Gemini Enhancement
328
+ # -------------------------------------------------------------------------
329
+ def enhance_with_gemini(self, question, sql):
330
+ """Use Gemini to refine/validate the SQL."""
331
+ if not self.components['gemini']:
332
+ return sql, {"enhanced": False, "reason": "Gemini not available"}
333
+
334
+ try:
335
+ prompt = GEMINI_REFINE_PROMPT.format(question=question, sql=sql)
336
+ enhanced_sql, error = self.components['gemini'].generate(prompt)
337
+
338
+ if error:
339
+ return sql, {"enhanced": False, "reason": error}
340
+
341
+ # Clean up response
342
+ enhanced_sql = self._clean_sql(enhanced_sql)
343
+
344
+ return enhanced_sql, {"enhanced": True, "original": sql}
345
+ except Exception as e:
346
+ return sql, {"enhanced": False, "reason": str(e)}
347
+
348
+ def validate_with_gemini(self, question, sql):
349
+ """Use Gemini to validate SQL."""
350
+ if not self.components['gemini']:
351
+ return {"is_valid": True, "confidence": 0.5}
352
+
353
+ try:
354
+ prompt = GEMINI_VALIDATE_PROMPT.format(question=question, sql=sql)
355
+ text, error = self.components['gemini'].generate(prompt)
356
+
357
+ if error:
358
+ return {"is_valid": True, "confidence": 0.5, "error": error}
359
+
360
+ # Remove markdown code blocks if present
361
+ if text.startswith("```"):
362
+ text = text.split("```")[1]
363
+ if text.startswith("json"):
364
+ text = text[4:]
365
+
366
+ return json.loads(text)
367
+ except Exception:
368
+ return {"is_valid": True, "confidence": 0.5}
369
+
370
+ def explain_with_gemini(self, sql):
371
+ """Use Gemini to explain the SQL."""
372
+ if not self.components['gemini']:
373
+ return "Explanation not available"
374
+
375
+ try:
376
+ prompt = GEMINI_EXPLAIN_PROMPT.format(sql=sql)
377
+ explanation, error = self.components['gemini'].generate(prompt)
378
+
379
+ if error:
380
+ return f"Explanation error: {error}"
381
+
382
+ return explanation
383
+ except Exception as e:
384
+ return f"Explanation error: {e}"
385
+
386
+ # -------------------------------------------------------------------------
387
+ # MAIN PIPELINE
388
+ # -------------------------------------------------------------------------
389
+ def run(self, question, enhance=True, validate=False, explain=False, top_k=3):
390
+ """
391
+ Run the complete pipeline.
392
+
393
+ Args:
394
+ question: Natural language question
395
+ enhance: Use Gemini to enhance SQL
396
+ validate: Use Gemini to validate SQL
397
+ explain: Use Gemini to explain SQL
398
+ top_k: Number of RAG examples to retrieve
399
+
400
+ Returns:
401
+ dict with all results
402
+ """
403
+ result = {
404
+ 'question': question,
405
+ 'timestamp': datetime.now().isoformat(),
406
+ 'steps': {}
407
+ }
408
+
409
+ # Step 1: RAG Retrieval
410
+ rag_context, examples = self.retrieve_context(question, top_k=top_k)
411
+ result['steps']['rag'] = {
412
+ 'context': rag_context,
413
+ 'examples': examples,
414
+ 'num_examples': len(examples)
415
+ }
416
+
417
+ # Step 2: Build Prompt
418
+ prompt = self.build_prompt(question, rag_context)
419
+ result['steps']['prompt'] = {
420
+ 'prompt': prompt,
421
+ 'length': len(prompt)
422
+ }
423
+
424
+ # Step 3: Fine-tuned Model
425
+ finetuned_sql, error = self.generate_with_finetuned(question, rag_context)
426
+ result['steps']['finetuned'] = {
427
+ 'sql': finetuned_sql,
428
+ 'error': error
429
+ }
430
+
431
+ if not finetuned_sql:
432
+ result['success'] = False
433
+ result['final_sql'] = None
434
+ return result
435
+
436
+ # Step 4: Gemini Enhancement
437
+ if enhance:
438
+ enhanced_sql, enhance_info = self.enhance_with_gemini(question, finetuned_sql)
439
+ result['steps']['gemini_enhance'] = {
440
+ 'sql': enhanced_sql,
441
+ 'info': enhance_info
442
+ }
443
+ result['final_sql'] = enhanced_sql
444
+ else:
445
+ result['final_sql'] = finetuned_sql
446
+
447
+ # Optional: Validation
448
+ if validate:
449
+ validation = self.validate_with_gemini(question, result['final_sql'])
450
+ result['steps']['validation'] = validation
451
+
452
+ # Optional: Explanation
453
+ if explain:
454
+ explanation = self.explain_with_gemini(result['final_sql'])
455
+ result['explanation'] = explanation
456
+
457
+ result['success'] = True
458
+
459
+ # Log result
460
+ self._log_result(result)
461
+
462
+ return result
463
+
464
+ # -------------------------------------------------------------------------
465
+ # UTILITIES
466
+ # -------------------------------------------------------------------------
467
+ def _clean_sql(self, sql):
468
+ """Clean SQL output."""
469
+ sql = sql.strip()
470
+ # Remove markdown code blocks
471
+ if sql.startswith("```"):
472
+ lines = sql.split("\n")
473
+ sql = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
474
+ # Remove leading 'sql' keyword
475
+ if sql.lower().startswith("sql"):
476
+ sql = sql[3:].strip()
477
+ return sql
478
+
479
+ def _log_result(self, result):
480
+ """Log pipeline result."""
481
+ log_file = f"{LOGS_DIR}/pipeline_log.jsonl"
482
+ # Remove examples from log to save space
483
+ log_result = {k: (dict(v) if isinstance(v, dict) else v) for k, v in result.items()}  # copy nested dicts so the caller's result is not mutated below
484
+ if 'steps' in log_result and 'rag' in log_result['steps']:
485
+ log_result['steps']['rag'] = {
486
+ 'num_examples': log_result['steps']['rag'].get('num_examples', 0)
487
+ }
488
+ with open(log_file, 'a') as f:
489
+ f.write(json.dumps(log_result, default=str) + '\n')
490
+
491
+ def get_component_status(self):
492
+ """Get status of all components."""
493
+ return {
494
+ 'rag': self.components['rag'] is not None,
495
+ 'prompt_builder': self.components['prompt_builder'] is not None,
496
+ 'finetuned_model': self.components['finetuned_model'] is not None,
497
+ 'gemini': self.components['gemini'] is not None
498
+ }
499
+
500
+ # =============================================================================
501
+ # SIMPLE INTERFACE
502
+ # =============================================================================
503
+
504
+ _pipeline = None
505
+
506
+ def get_pipeline():
507
+ """Get or create pipeline instance."""
508
+ global _pipeline
509
+ if _pipeline is None:
510
+ _pipeline = IntegratedPipeline()
511
+ return _pipeline
512
+
513
+ def generate_sql(question, enhance=True, explain=False):
514
+ """Simple function to generate SQL."""
515
+ pipeline = get_pipeline()
516
+ result = pipeline.run(question, enhance=enhance, explain=explain)
517
+
518
+ if result['success']:
519
+ return result['final_sql']
520
+ return None
521
+
522
+ # =============================================================================
523
+ # TEST
524
+ # =============================================================================
525
+
526
+ def test_pipeline():
527
+ """Test the integrated pipeline."""
528
+
529
+ print("=" * 60)
530
+ print("TESTING INTEGRATED PIPELINE")
531
+ print("=" * 60)
532
+
533
+ pipeline = IntegratedPipeline()
534
+
535
+ # Show component status
536
+ print("\nComponent Status:")
537
+ status = pipeline.get_component_status()
538
+ for comp, loaded in status.items():
539
+ icon = "✓" if loaded else "✗"
540
+ print(f" {icon} {comp}")
541
+
542
+ questions = [
543
+ "Find all employees with salary above 50000",
544
+ ]
545
+
546
+ for q in questions:
547
+ print(f"\n{'='*60}")
548
+ print(f"Question: {q}")
549
+ print("-" * 60)
550
+
551
+ result = pipeline.run(q, enhance=True, explain=True, top_k=3)
552
+
553
+ # Show RAG results
554
+ print(f"\n[RAG] Retrieved {result['steps']['rag']['num_examples']} examples")
555
+
556
+ # Show fine-tuned output
557
+ print(f"\n[Fine-tuned Model]")
558
+ if result['steps']['finetuned']['sql']:
559
+ print(f" SQL: {result['steps']['finetuned']['sql']}")
560
+ else:
561
+ print(f" Error: {result['steps']['finetuned']['error']}")
562
+
563
+ # Show Gemini enhancement
564
+ if 'gemini_enhance' in result['steps']:
565
+ print(f"\n[Gemini Enhanced]")
566
+ print(f" SQL: {result['steps']['gemini_enhance']['sql']}")
567
+ if result['steps']['finetuned']['sql'] != result['steps']['gemini_enhance']['sql']:
568
+ print(f" ✨ Query was improved!")
569
+
570
+ # Show final
571
+ print(f"\n[Final SQL]")
572
+ print(f" {result['final_sql']}")
573
+
574
+ # Show explanation
575
+ if 'explanation' in result:
576
+ print(f"\n[Explanation]")
577
+ print(f" {result['explanation']}")
578
+
579
+ print("\n" + "=" * 60)
580
+ print("✓ Pipeline test complete")
581
+ print("=" * 60)
582
+
583
+ if __name__ == "__main__":
584
+ test_pipeline()
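
> Editor's note: a short usage sketch of the module above (assumes it is run from `src/` so `pipeline.integrated` is importable and the `.env` keys are configured):

```python
from pipeline.integrated import IntegratedPipeline, generate_sql

# One-liner: RAG -> fine-tuned model -> optional Gemini refinement
sql = generate_sql("Find all employees with salary above 50000", enhance=True)
print(sql)

# Full pipeline with validation and explanation
pipeline = IntegratedPipeline()
result = pipeline.run(
    "Count orders per customer in 2023",
    enhance=True, validate=True, explain=True, top_k=3,
)
print(result["final_sql"])
print(result.get("explanation"))
```
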
src/prompts/__init__.py ADDED
File without changes
src/prompts/prompt_builder.py ADDED
@@ -0,0 +1,440 @@
1
+ """
2
+ Prompt Builder for SQL Learning Assistant
3
+ Handles: Context Management, User Interaction Flows, Edge Cases
4
+ """
5
+
6
+ import re
7
+ import os
8
+ import sys
9
+ import json
10
+ from datetime import datetime
11
+
12
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+ from prompts.system_prompts import (
14
+ get_system_prompt,
15
+ get_prompt_template,
16
+ CLARIFICATION_PROMPT,
17
+ ERROR_RECOVERY_PROMPT
18
+ )
19
+
20
+ # =============================================================================
21
+ # OUTPUT DIRECTORIES
22
+ # =============================================================================
23
+
24
+ OUTPUT_DIR = "outputs/prompts"
25
+ LOGS_DIR = f"{OUTPUT_DIR}/logs"
26
+
27
+ def setup_directories():
28
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
29
+ os.makedirs(LOGS_DIR, exist_ok=True)
30
+
31
+ # =============================================================================
32
+ # CONTEXT MANAGEMENT
33
+ # =============================================================================
34
+
35
+ class ConversationContext:
36
+ """
37
+ Manages conversation history and context for multi-turn interactions.
38
+ """
39
+
40
+ def __init__(self, max_history=5):
41
+ self.history = []
42
+ self.max_history = max_history
43
+ self.current_tables = []
44
+ self.current_schema = {}
45
+ self.user_preferences = {}
46
+
47
+ def add_turn(self, question, sql_response, success=True):
48
+ """Add a conversation turn to history."""
49
+ self.history.append({
50
+ 'question': question,
51
+ 'sql': sql_response,
52
+ 'success': success,
53
+ 'timestamp': datetime.now().isoformat()
54
+ })
55
+
56
+ # Keep only recent history
57
+ if len(self.history) > self.max_history:
58
+ self.history = self.history[-self.max_history:]
59
+
60
+ def get_history_context(self):
61
+ """Format history for prompt injection."""
62
+ if not self.history:
63
+ return ""
64
+
65
+ context = "Previous conversation:\n"
66
+ for turn in self.history[-3:]: # Last 3 turns
67
+ context += f"Q: {turn['question']}\n"
68
+ context += f"SQL: {turn['sql']}\n\n"
69
+
70
+ return context
71
+
72
+ def set_schema(self, schema_dict):
73
+ """Set current database schema context."""
74
+ self.current_schema = schema_dict
75
+
76
+ def get_schema_context(self):
77
+ """Format schema for prompt injection."""
78
+ if not self.current_schema:
79
+ return ""
80
+
81
+ context = "Available tables and columns:\n"
82
+ for table, columns in self.current_schema.items():
83
+ context += f"- {table}: {', '.join(columns)}\n"
84
+
85
+ return context
86
+
87
+ def clear(self):
88
+ """Clear conversation history."""
89
+ self.history = []
90
+ self.current_tables = []
91
+ self.current_schema = {}
92
+
93
+ # =============================================================================
94
+ # QUERY ANALYSIS (For Specialized Flows)
95
+ # =============================================================================
96
+
97
+ def analyze_query_intent(question):
98
+ """
99
+ Analyze user question to determine query type and intent.
100
+ Returns: dict with query_type, keywords, entities
101
+ """
102
+ question_lower = question.lower()
103
+
104
+ # Detect query type
105
+ query_type = 'general'
106
+
107
+ # Aggregation patterns
108
+ agg_patterns = ['count', 'sum', 'average', 'avg', 'total', 'maximum', 'max',
109
+ 'minimum', 'min', 'how many', 'what is the total']
110
+ if any(p in question_lower for p in agg_patterns):
111
+ query_type = 'aggregation'
112
+
113
+ # Complex query patterns
114
+ complex_patterns = ['join', 'combine', 'merge', 'from multiple', 'across tables',
115
+ 'subquery', 'nested', 'with the highest', 'with the lowest']
116
+ if any(p in question_lower for p in complex_patterns):
117
+ query_type = 'complex'
118
+
119
+ # Modification patterns
120
+ mod_patterns = ['insert', 'add new', 'update', 'change', 'modify', 'delete', 'remove']
121
+ if any(p in question_lower for p in mod_patterns):
122
+ query_type = 'modification'
123
+
124
+ # Simple patterns (if nothing else matched)
125
+ simple_patterns = ['show', 'list', 'get', 'find', 'select', 'display']
126
+ if query_type == 'general' and any(p in question_lower for p in simple_patterns):
127
+ query_type = 'simple'
128
+
129
+ # Extract potential keywords
130
+ keywords = []
131
+ sql_keywords = ['where', 'group by', 'order by', 'having', 'limit', 'join',
132
+ 'distinct', 'between', 'like', 'in']
133
+ for kw in sql_keywords:
134
+ if kw in question_lower:
135
+ keywords.append(kw.upper())
136
+
137
+ return {
138
+ 'query_type': query_type,
139
+ 'keywords': keywords,
140
+ 'question_length': len(question.split())
141
+ }
142
+
143
+ # =============================================================================
144
+ # EDGE CASE HANDLING
145
+ # =============================================================================
146
+
147
+ def detect_edge_cases(question):
148
+ """
149
+ Detect potential edge cases in user question.
150
+ Returns: list of edge case types detected
151
+ """
152
+ edge_cases = []
153
+ question_lower = question.lower()
154
+
155
+ # Empty or too short
156
+ if len(question.strip()) < 5:
157
+ edge_cases.append('too_short')
158
+
159
+ # Too vague
160
+ vague_patterns = ['something', 'stuff', 'things', 'data', 'information']
161
+ if any(p in question_lower for p in vague_patterns) and len(question.split()) < 5:
162
+ edge_cases.append('too_vague')
163
+
164
+ # Multiple questions
165
+ if question.count('?') > 1:
166
+ edge_cases.append('multiple_questions')
167
+
168
+ # Contains SQL (user pasted SQL instead of question)
169
+ sql_patterns = ['SELECT', 'INSERT', 'UPDATE', 'DELETE', 'FROM', 'WHERE']
170
+ if sum(1 for p in sql_patterns if p in question.upper()) >= 2:
171
+ edge_cases.append('contains_sql')
172
+
173
+ # Potentially dangerous operations
174
+ dangerous_patterns = ['drop table', 'truncate', 'delete all', 'remove all']
175
+ if any(p in question_lower for p in dangerous_patterns):
176
+ edge_cases.append('dangerous_operation')
177
+
178
+ # Non-SQL question
179
+ non_sql_patterns = ['weather', 'hello', 'how are you', 'thank', 'bye']
180
+ if any(p in question_lower for p in non_sql_patterns):
181
+ edge_cases.append('not_sql_related')
182
+
183
+ return edge_cases
184
+
185
+ def handle_edge_case(edge_case_type, question):
186
+ """
187
+ Generate appropriate response for edge cases.
188
+ Returns: (should_continue, message)
189
+ """
190
+ responses = {
191
+ 'too_short': (
192
+ False,
193
+ "Your question is too short. Please provide more details about what data you want to retrieve."
194
+ ),
195
+ 'too_vague': (
196
+ False,
197
+ "Your question is a bit vague. Could you specify:\n- Which table(s) to query?\n- What columns to retrieve?\n- Any conditions to filter by?"
198
+ ),
199
+ 'multiple_questions': (
200
+ False,
201
+ "I detected multiple questions. Please ask one question at a time for accurate SQL generation."
202
+ ),
203
+ 'contains_sql': (
204
+ False,
205
+ "It looks like you've pasted SQL code. Please describe what you want in natural language, and I'll generate the SQL for you."
206
+ ),
207
+ 'dangerous_operation': (
208
+ False,
209
+ "⚠️ This appears to be a destructive operation (DROP/TRUNCATE/DELETE ALL). Please confirm you want to proceed or rephrase your question."
210
+ ),
211
+ 'not_sql_related': (
212
+ False,
213
+ "I'm an SQL assistant. Please ask me questions about querying databases, and I'll help generate SQL queries."
214
+ )
215
+ }
216
+
217
+ return responses.get(edge_case_type, (True, ""))
218
+
219
+ # =============================================================================
220
+ # PROMPT BUILDER CLASS
221
+ # =============================================================================
222
+
223
+ class PromptBuilder:
224
+ """
225
+ Main class for building prompts with context management.
226
+ """
227
+
228
+ def __init__(self):
229
+ self.context = ConversationContext()
230
+ self.log_file = None
231
+ setup_directories()
232
+
233
+ def build_prompt(self, question, rag_context="", include_history=True):
234
+ """
235
+ Build complete prompt for SQL generation.
236
+
237
+ Args:
238
+ question: User's natural language question
239
+ rag_context: Retrieved examples from RAG
240
+ include_history: Whether to include conversation history
241
+
242
+ Returns:
243
+ dict with 'success', 'prompt' or 'error'
244
+ """
245
+ # Check for edge cases
246
+ edge_cases = detect_edge_cases(question)
247
+
248
+ if edge_cases:
249
+ should_continue, message = handle_edge_case(edge_cases[0], question)
250
+ if not should_continue:
251
+ return {
252
+ 'success': False,
253
+ 'error': message,
254
+ 'edge_case': edge_cases[0]
255
+ }
256
+
257
+ # Analyze query intent
258
+ intent = analyze_query_intent(question)
259
+
260
+ # Get appropriate system prompt
261
+ system_prompt = get_system_prompt(intent['query_type'])
262
+
263
+ # Build context parts
264
+ context_parts = []
265
+
266
+ # Add schema context if available
267
+ schema_context = self.context.get_schema_context()
268
+ if schema_context:
269
+ context_parts.append(schema_context)
270
+
271
+ # Add conversation history
272
+ if include_history:
273
+ history_context = self.context.get_history_context()
274
+ if history_context:
275
+ context_parts.append(history_context)
276
+
277
+ # Add RAG context
278
+ if rag_context:
279
+ context_parts.append(rag_context)
280
+
281
+ # Build final prompt
282
+ if rag_context:
283
+ template = get_prompt_template('rag')
284
+ prompt = template.format(
285
+ context=rag_context,
286
+ question=question
287
+ )
288
+ else:
289
+ template = get_prompt_template('zero_shot')
290
+ prompt = template.format(question=question)
291
+
292
+ # Combine everything
293
+ full_prompt = f"{system_prompt}\n\n"
294
+ if context_parts:
295
+ full_prompt += "\n".join(context_parts) + "\n\n"
296
+ full_prompt += prompt
297
+
298
+ # Log the prompt
299
+ self._log_prompt(question, intent, full_prompt)
300
+
301
+ return {
302
+ 'success': True,
303
+ 'prompt': full_prompt,
304
+ 'system_prompt': system_prompt,
305
+ 'query_type': intent['query_type'],
306
+ 'keywords': intent['keywords']
307
+ }
308
+
309
+ def add_response(self, question, sql_response, success=True):
310
+ """Add a completed interaction to history."""
311
+ self.context.add_turn(question, sql_response, success)
312
+
313
+ def set_schema(self, schema_dict):
314
+ """Set database schema for context."""
315
+ self.context.set_schema(schema_dict)
316
+
317
+ def clear_context(self):
318
+ """Clear all context."""
319
+ self.context.clear()
320
+
321
+ def _log_prompt(self, question, intent, prompt):
322
+ """Log prompt for debugging/analysis."""
323
+ log_entry = {
324
+ 'timestamp': datetime.now().isoformat(),
325
+ 'question': question,
326
+ 'intent': intent,
327
+ 'prompt_length': len(prompt)
328
+ }
329
+
330
+ log_file = f"{LOGS_DIR}/prompt_log.jsonl"
331
+ with open(log_file, 'a') as f:
332
+ f.write(json.dumps(log_entry) + '\n')
333
+
334
+ # =============================================================================
335
+ # USER INTERACTION FLOWS
336
+ # =============================================================================
337
+
338
+ def get_clarification_questions(question, intent):
339
+ """
340
+ Generate clarification questions for ambiguous queries.
341
+ """
342
+ clarifications = []
343
+
344
+ # Generic clarifications based on query type
345
+ if intent['query_type'] == 'aggregation':
346
+ clarifications.append("Which column should be aggregated?")
347
+ clarifications.append("Should results be grouped by any column?")
348
+
349
+ if intent['query_type'] == 'complex':
350
+ clarifications.append("Which tables need to be joined?")
351
+ clarifications.append("What is the relationship between the tables?")
352
+
353
+ # Check for missing specifics
354
+ if 'table' not in question.lower():
355
+ clarifications.append("Which table(s) should be queried?")
356
+
357
+ if not any(word in question.lower() for word in ['all', 'specific', 'where', 'filter']):
358
+ clarifications.append("Do you want all records or filtered results?")
359
+
360
+ return clarifications
361
+
362
+ def create_error_recovery_prompt(original_question, error_message):
363
+ """
364
+ Create prompt for recovering from errors.
365
+ """
366
+ return ERROR_RECOVERY_PROMPT.format(
367
+ error=error_message,
368
+ question=original_question
369
+ )
370
+
371
+ # =============================================================================
372
+ # TEST
373
+ # =============================================================================
374
+
375
+ def test_prompt_builder():
376
+ """Test the prompt builder functionality."""
377
+
378
+ print("=" * 60)
379
+ print("TESTING PROMPT BUILDER")
380
+ print("=" * 60)
381
+
382
+ builder = PromptBuilder()
383
+
384
+ # Test 1: Normal question
385
+ print("\n[TEST 1] Normal Question")
386
+ print("-" * 40)
387
+ result = builder.build_prompt(
388
+ "Find all employees with salary above 50000",
389
+ rag_context="Example 1:\nQ: Get workers earning more than 40000\nSQL: SELECT * FROM employees WHERE salary > 40000"
390
+ )
391
+ print(f"Success: {result['success']}")
392
+ print(f"Query Type: {result.get('query_type')}")
393
+ print(f"Prompt Length: {len(result.get('prompt', ''))}")
394
+
395
+ # Test 2: Edge case - too short
396
+ print("\n[TEST 2] Edge Case - Too Short")
397
+ print("-" * 40)
398
+ result = builder.build_prompt("SQL")
399
+ print(f"Success: {result['success']}")
400
+ print(f"Error: {result.get('error', 'None')}")
401
+
402
+ # Test 3: Edge case - contains SQL
403
+ print("\n[TEST 3] Edge Case - Contains SQL")
404
+ print("-" * 40)
405
+ result = builder.build_prompt("SELECT * FROM users WHERE id = 1")
406
+ print(f"Success: {result['success']}")
407
+ print(f"Error: {result.get('error', 'None')}")
408
+
409
+ # Test 4: Edge case - dangerous operation
410
+ print("\n[TEST 4] Edge Case - Dangerous Operation")
411
+ print("-" * 40)
412
+ result = builder.build_prompt("Drop table users")
413
+ print(f"Success: {result['success']}")
414
+ print(f"Error: {result.get('error', 'None')}")
415
+
416
+ # Test 5: Aggregation query
417
+ print("\n[TEST 5] Aggregation Query")
418
+ print("-" * 40)
419
+ result = builder.build_prompt("Count total orders by customer")
420
+ print(f"Success: {result['success']}")
421
+ print(f"Query Type: {result.get('query_type')}")
422
+
423
+ # Test 6: Context management
424
+ print("\n[TEST 6] Context Management")
425
+ print("-" * 40)
426
+ builder.set_schema({
427
+ 'employees': ['id', 'name', 'salary', 'dept_id'],
428
+ 'departments': ['id', 'name', 'location']
429
+ })
430
+ builder.add_response("Show all employees", "SELECT * FROM employees", success=True)
431
+ result = builder.build_prompt("Now filter by department")
432
+ print(f"Success: {result['success']}")
433
+ print(f"Has History: {'Previous conversation' in result.get('prompt', '')}")
434
+
435
+ print("\n" + "=" * 60)
436
+ print("✓ All tests complete")
437
+ print("=" * 60)
438
+
439
+ if __name__ == "__main__":
440
+ test_prompt_builder()
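
> Editor's note: usage sketch for the builder and its standalone helpers (not part of the commit), assuming imports from `src/`:

```python
from prompts.prompt_builder import PromptBuilder, analyze_query_intent, detect_edge_cases

builder = PromptBuilder()
builder.set_schema({"employees": ["id", "name", "salary", "dept_id"]})
builder.add_response("Show all employees", "SELECT * FROM employees")

result = builder.build_prompt("Count employees per department")
if result["success"]:
    print(result["query_type"])       # expected: 'aggregation'
    print(result["prompt"][:200])
else:
    print("Rejected:", result["error"], result.get("edge_case"))

# The helpers can also be called directly:
print(detect_edge_cases("Drop table users"))          # ['dangerous_operation']
print(analyze_query_intent("Join orders and users"))  # {'query_type': 'complex', ...}
```
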
src/prompts/system_prompts.py ADDED
@@ -0,0 +1,162 @@
1
+ """
2
+ System Prompts for SQL Learning Assistant
3
+ Systematic prompting strategies for different use cases.
4
+ """
5
+
6
+ # =============================================================================
7
+ # BASE SYSTEM PROMPT
8
+ # =============================================================================
9
+
10
+ BASE_SYSTEM_PROMPT = """You are an expert SQL assistant. Your task is to generate accurate SQL queries based on natural language questions.
11
+
12
+ Rules:
13
+ 1. Generate ONLY the SQL query, no explanations unless asked
14
+ 2. Use standard SQL syntax
15
+ 3. Be precise and efficient in your queries
16
+ 4. If the question is ambiguous, make reasonable assumptions
17
+ 5. Always use proper SQL formatting
18
+ """
19
+
20
+ # =============================================================================
21
+ # SPECIALIZED PROMPTS BY USE CASE
22
+ # =============================================================================
23
+
24
+ # For simple SELECT queries
25
+ SIMPLE_QUERY_PROMPT = """You are an SQL assistant specializing in simple queries.
26
+
27
+ Your task: Convert the natural language question into a basic SQL SELECT query.
28
+
29
+ Guidelines:
30
+ - Use simple SELECT, FROM, WHERE clauses
31
+ - Avoid complex joins unless necessary
32
+ - Keep queries straightforward and readable
33
+ """
34
+
35
+ # For complex queries with JOINs
36
+ COMPLEX_QUERY_PROMPT = """You are an SQL assistant specializing in complex queries.
37
+
38
+ Your task: Convert the natural language question into an SQL query that may involve:
39
+ - Multiple JOINs (INNER, LEFT, RIGHT)
40
+ - Subqueries
41
+ - Multiple conditions
42
+ - Aggregations with GROUP BY
43
+
44
+ Guidelines:
45
+ - Use appropriate JOIN types
46
+ - Structure subqueries clearly
47
+ - Use aliases for readability
48
+ """
49
+
50
+ # For aggregation queries
51
+ AGGREGATION_PROMPT = """You are an SQL assistant specializing in aggregation queries.
52
+
53
+ Your task: Convert the natural language question into an SQL query using aggregate functions.
54
+
55
+ Guidelines:
56
+ - Use COUNT, SUM, AVG, MAX, MIN appropriately
57
+ - Include GROUP BY when aggregating
58
+ - Use HAVING for aggregate conditions
59
+ - Consider ORDER BY for ranked results
60
+ """
61
+
62
+ # For data modification (if needed)
63
+ MODIFICATION_PROMPT = """You are an SQL assistant for data modification queries.
64
+
65
+ Your task: Convert the natural language request into INSERT, UPDATE, or DELETE statements.
66
+
67
+ Guidelines:
68
+ - Be cautious with DELETE and UPDATE
69
+ - Always include WHERE clause for UPDATE/DELETE
70
+ - Validate data types for INSERT
71
+ """
72
+
73
+ # =============================================================================
74
+ # PROMPT TEMPLATES WITH CONTEXT
75
+ # =============================================================================
76
+
77
+ RAG_CONTEXT_TEMPLATE = """You are an expert SQL assistant.
78
+
79
+ Here are similar examples to help you:
80
+
81
+ {context}
82
+
83
+ Based on these examples, generate the SQL query for:
84
+ Question: {question}
85
+
86
+ SQL:"""
87
+
88
+ FEW_SHOT_TEMPLATE = """You are an expert SQL assistant. Learn from these examples:
89
+
90
+ {examples}
91
+
92
+ Now generate SQL for this question:
93
+ Question: {question}
94
+
95
+ SQL:"""
96
+
97
+ ZERO_SHOT_TEMPLATE = """You are an expert SQL assistant.
98
+
99
+ Generate the SQL query for:
100
+ Question: {question}
101
+
102
+ SQL:"""
103
+
104
+ # =============================================================================
105
+ # ERROR HANDLING PROMPTS
106
+ # =============================================================================
107
+
108
+ CLARIFICATION_PROMPT = """I need more information to generate the SQL query.
109
+
110
+ Original question: {question}
111
+
112
+ Please clarify:
113
+ {clarification_points}
114
+ """
115
+
116
+ ERROR_RECOVERY_PROMPT = """I encountered an issue with the previous query.
117
+
118
+ Error: {error}
119
+ Original question: {question}
120
+
121
+ Let me try a different approach:
122
+ """
123
+
124
+ # =============================================================================
125
+ # PROMPT SELECTOR
126
+ # =============================================================================
127
+
128
+ def get_system_prompt(query_type='general'):
129
+ """
130
+ Get appropriate system prompt based on query type.
131
+
132
+ Args:
133
+ query_type: 'simple', 'complex', 'aggregation', 'modification', 'general'
134
+
135
+ Returns:
136
+ System prompt string
137
+ """
138
+ prompts = {
139
+ 'simple': SIMPLE_QUERY_PROMPT,
140
+ 'complex': COMPLEX_QUERY_PROMPT,
141
+ 'aggregation': AGGREGATION_PROMPT,
142
+ 'modification': MODIFICATION_PROMPT,
143
+ 'general': BASE_SYSTEM_PROMPT
144
+ }
145
+ return prompts.get(query_type, BASE_SYSTEM_PROMPT)
146
+
147
+ def get_prompt_template(template_type='rag'):
148
+ """
149
+ Get prompt template by type.
150
+
151
+ Args:
152
+ template_type: 'rag', 'few_shot', 'zero_shot'
153
+
154
+ Returns:
155
+ Template string
156
+ """
157
+ templates = {
158
+ 'rag': RAG_CONTEXT_TEMPLATE,
159
+ 'few_shot': FEW_SHOT_TEMPLATE,
160
+ 'zero_shot': ZERO_SHOT_TEMPLATE
161
+ }
162
+ return templates.get(template_type, RAG_CONTEXT_TEMPLATE)
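
> Editor's note: a small sketch of selecting a system prompt and filling the RAG template defined above (example strings are illustrative):

```python
from prompts.system_prompts import get_system_prompt, get_prompt_template

system = get_system_prompt("aggregation")   # unknown types fall back to BASE_SYSTEM_PROMPT
template = get_prompt_template("rag")       # RAG_CONTEXT_TEMPLATE with {context} and {question}

prompt = template.format(
    context="Q: Count users by country\nSQL: SELECT country, COUNT(*) FROM users GROUP BY country",
    question="Count orders by status",
)
print(system + "\n\n" + prompt)
```
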
src/rag/__init__.py ADDED
File without changes
src/rag/embeddings.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Embedding Module for RAG System
3
+ Uses FREE sentence-transformers (no API costs).
4
+ Gemini is ONLY used for final SQL generation.
5
+ """
6
+
7
+ from sentence_transformers import SentenceTransformer
8
+ import os
9
+
10
+ # =============================================================================
11
+ # FREE LOCAL EMBEDDING MODEL
12
+ # =============================================================================
13
+
14
+ # Using all-MiniLM-L6-v2: fast, good quality, 384 dimensions
15
+ MODEL_NAME = "all-MiniLM-L6-v2"
16
+
17
+ # Global model instance (loaded once)
18
+ _model = None
19
+
20
+ def get_model():
21
+ """Get or load the embedding model."""
22
+ global _model
23
+ if _model is None:
24
+ print(f" Loading embedding model: {MODEL_NAME}")
25
+ _model = SentenceTransformer(MODEL_NAME)
26
+ return _model
27
+
28
+ # =============================================================================
29
+ # EMBEDDING FUNCTIONS
30
+ # =============================================================================
31
+
32
+ def get_embedding(text):
33
+ """Get embedding for a single text."""
34
+ try:
35
+ model = get_model()
36
+ embedding = model.encode(text, convert_to_numpy=True)
37
+ return embedding.tolist()
38
+ except Exception as e:
39
+ print(f"Error getting embedding: {e}")
40
+ return None
41
+
42
+ def get_embeddings_batch(texts):
43
+ """Get embeddings for multiple texts at once (efficient)."""
44
+ try:
45
+ model = get_model()
46
+ embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
47
+ return [emb.tolist() for emb in embeddings]
48
+ except Exception as e:
49
+ print(f"Error in batch embedding: {e}")
50
+ return [None] * len(texts)
51
+
52
+ # =============================================================================
53
+ # TEST
54
+ # =============================================================================
55
+
56
+ def test_embedding():
57
+ """Test embedding functionality."""
58
+ print("=" * 50)
59
+ print("TESTING EMBEDDINGS (FREE - No API)")
60
+ print("=" * 50)
61
+
62
+ test_texts = [
63
+ "Find all employees with salary greater than 50000",
64
+ "Show customers who ordered last month",
65
+ "Count products by category"
66
+ ]
67
+
68
+ print(f"\nModel: {MODEL_NAME}")
69
+ print(f"Testing with {len(test_texts)} texts...\n")
70
+
71
+ # Single embedding
72
+ emb = get_embedding(test_texts[0])
73
+ if emb:
74
+ print(f"✓ Single embedding works")
75
+ print(f" Dimension: {len(emb)}")
76
+
77
+ # Batch embedding
78
+ embs = get_embeddings_batch(test_texts)
79
+ if embs and embs[0]:
80
+ print(f"✓ Batch embedding works")
81
+ print(f" Got {len(embs)} embeddings")
82
+
83
+ print("\n✓ All tests passed (FREE - No Gemini used)")
84
+ return True
85
+
86
+ if __name__ == "__main__":
87
+ test_embedding()
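
> Editor's note: sketch (not part of the commit) of ranking candidates by cosine similarity with the local embeddings above; candidate strings are illustrative:

```python
import numpy as np
from rag.embeddings import get_embedding, get_embeddings_batch

query = "Find all employees with salary greater than 50000"
candidates = [
    "Show workers earning more than 40000",
    "Count products by category",
]

q = np.array(get_embedding(query))
cands = np.array(get_embeddings_batch(candidates))

# Cosine similarity: dot(a, b) / (||a|| * ||b||)
sims = cands @ q / (np.linalg.norm(cands, axis=1) * np.linalg.norm(q))
for text, score in sorted(zip(candidates, sims), key=lambda x: -x[1]):
    print(f"{score:.3f}  {text}")
```
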
src/rag/knowledge_base.py ADDED
@@ -0,0 +1,415 @@
1
+ """
2
+ Knowledge Base Builder for RAG System
3
+ Includes: Chunking Strategies, Vector Storage
4
+ """
5
+
6
+ import os
7
+ import pandas as pd
8
+ import chromadb
9
+ import json
10
+ import re
11
+ from datetime import datetime
12
+ import sys
13
+
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+ from rag.embeddings import get_embeddings_batch
16
+
17
+ # =============================================================================
18
+ # CONFIGURATION
19
+ # =============================================================================
20
+
21
+ CHROMA_DIR = "chromadb_data"
22
+ COLLECTION_NAME = "sql_knowledge"
23
+ OUTPUT_DIR = "outputs/rag"
24
+ STATS_DIR = f"{OUTPUT_DIR}/stats"
25
+ REPORT_DIR = f"{OUTPUT_DIR}/reports"
26
+
27
+ def setup_directories():
28
+ """Create necessary directories."""
29
+ for d in [CHROMA_DIR, OUTPUT_DIR, STATS_DIR, REPORT_DIR]:
30
+ os.makedirs(d, exist_ok=True)
31
+
32
+ # =============================================================================
33
+ # CHUNKING STRATEGIES
34
+ # =============================================================================
35
+
36
+ def chunk_by_sql_clauses(sql):
37
+ """
38
+ Chunking Strategy 1: Split SQL by clauses.
39
+ Identifies SELECT, FROM, WHERE, GROUP BY, ORDER BY, etc.
40
+ """
41
+ clauses = []
42
+
43
+ # Common SQL clause patterns
44
+ patterns = [
45
+ (r'\bSELECT\b.*?(?=\bFROM\b|$)', 'SELECT'),
46
+ (r'\bFROM\b.*?(?=\bWHERE\b|\bGROUP\b|\bORDER\b|\bLIMIT\b|$)', 'FROM'),
47
+ (r'\bWHERE\b.*?(?=\bGROUP\b|\bORDER\b|\bLIMIT\b|$)', 'WHERE'),
48
+ (r'\bGROUP BY\b.*?(?=\bHAVING\b|\bORDER\b|\bLIMIT\b|$)', 'GROUP BY'),
49
+ (r'\bHAVING\b.*?(?=\bORDER\b|\bLIMIT\b|$)', 'HAVING'),
50
+ (r'\bORDER BY\b.*?(?=\bLIMIT\b|$)', 'ORDER BY'),
51
+ (r'\bLIMIT\b.*', 'LIMIT'),
52
+ ]
53
+
54
+ sql_upper = sql.upper()
55
+ for pattern, clause_name in patterns:
56
+ match = re.search(pattern, sql_upper, re.IGNORECASE | re.DOTALL)
57
+ if match:
58
+ clauses.append(clause_name)
59
+
60
+ return clauses
61
+
62
+ def chunk_by_complexity(question, sql):
63
+ """
64
+ Chunking Strategy 2: Categorize by query complexity.
65
+ """
66
+ sql_upper = sql.upper()
67
+
68
+ # Determine complexity level
69
+ complexity_score = 0
70
+
71
+ # Check for complex features
72
+ if 'JOIN' in sql_upper:
73
+ complexity_score += 2
74
+ if 'SUBQUERY' in sql_upper or sql_upper.count('SELECT') > 1:
75
+ complexity_score += 2
76
+ if 'GROUP BY' in sql_upper:
77
+ complexity_score += 1
78
+ if 'HAVING' in sql_upper:
79
+ complexity_score += 1
80
+ if 'ORDER BY' in sql_upper:
81
+ complexity_score += 1
82
+ if any(agg in sql_upper for agg in ['COUNT', 'SUM', 'AVG', 'MAX', 'MIN']):
83
+ complexity_score += 1
84
+ if 'UNION' in sql_upper:
85
+ complexity_score += 2
86
+
87
+ # Categorize
88
+ if complexity_score <= 1:
89
+ return 'simple'
90
+ elif complexity_score <= 3:
91
+ return 'intermediate'
92
+ else:
93
+ return 'complex'
94
+
95
+ def extract_sql_keywords(sql):
96
+ """
97
+ Chunking Strategy 3: Extract SQL keywords for metadata.
98
+ """
99
+ sql_upper = sql.upper()
100
+
101
+ keywords = []
102
+
103
+ # Operations
104
+ if 'SELECT' in sql_upper:
105
+ keywords.append('SELECT')
106
+ if 'INSERT' in sql_upper:
107
+ keywords.append('INSERT')
108
+ if 'UPDATE' in sql_upper:
109
+ keywords.append('UPDATE')
110
+ if 'DELETE' in sql_upper:
111
+ keywords.append('DELETE')
112
+
113
+ # Joins
114
+ if 'INNER JOIN' in sql_upper:
115
+ keywords.append('INNER JOIN')
116
+ elif 'LEFT JOIN' in sql_upper:
117
+ keywords.append('LEFT JOIN')
118
+ elif 'RIGHT JOIN' in sql_upper:
119
+ keywords.append('RIGHT JOIN')
120
+ elif 'JOIN' in sql_upper:
121
+ keywords.append('JOIN')
122
+
123
+ # Clauses
124
+ for clause in ['WHERE', 'GROUP BY', 'HAVING', 'ORDER BY', 'LIMIT']:
125
+ if clause in sql_upper:
126
+ keywords.append(clause)
127
+
128
+ # Aggregations
129
+ for agg in ['COUNT', 'SUM', 'AVG', 'MAX', 'MIN']:
130
+ if agg in sql_upper:
131
+ keywords.append(agg)
132
+
133
+ # Subqueries
134
+ if sql_upper.count('SELECT') > 1:
135
+ keywords.append('SUBQUERY')
136
+
137
+ return keywords
138
+
139
+ def calculate_chunk_size(text):
140
+ """Calculate appropriate chunk size category."""
141
+ word_count = len(text.split())
142
+
143
+ if word_count <= 10:
144
+ return 'short'
145
+ elif word_count <= 25:
146
+ return 'medium'
147
+ else:
148
+ return 'long'
149
+
150
+ # =============================================================================
151
+ # DOCUMENT PREPARATION WITH CHUNKING
152
+ # =============================================================================
153
+
154
+ def prepare_documents_with_chunking(datasets):
155
+ """
156
+ Prepare documents with chunking metadata.
157
+ Each document gets rich metadata for filtering/ranking.
158
+ """
159
+ documents = []
160
+ metadatas = []
161
+ ids = []
162
+
163
+ idx = 0
164
+ for source, df in datasets.items():
165
+ for _, row in df.iterrows():
166
+ question = str(row['question'])
167
+ sql = str(row['sql'])
168
+
169
+ # Apply chunking strategies
170
+ sql_clauses = chunk_by_sql_clauses(sql)
171
+ complexity = chunk_by_complexity(question, sql)
172
+ keywords = extract_sql_keywords(sql)
173
+ q_size = calculate_chunk_size(question)
174
+ sql_size = calculate_chunk_size(sql)
175
+
176
+ # Create rich metadata
177
+ metadata = {
178
+ 'sql': sql,
179
+ 'source': source,
180
+ 'question': question,
181
+ # Chunking metadata
182
+ 'complexity': complexity,
183
+ 'sql_clauses': ','.join(sql_clauses),
184
+ 'keywords': ','.join(keywords),
185
+ 'question_size': q_size,
186
+ 'sql_size': sql_size,
187
+ 'keyword_count': len(keywords),
188
+ 'clause_count': len(sql_clauses),
189
+ }
190
+
191
+ documents.append(question)
192
+ metadatas.append(metadata)
193
+ ids.append(f"doc_{idx}")
194
+ idx += 1
195
+
196
+ return documents, metadatas, ids
197
+
198
+ # =============================================================================
199
+ # CHROMADB CLIENT
200
+ # =============================================================================
201
+
202
+ def get_chroma_client():
203
+ """Get ChromaDB persistent client."""
204
+ return chromadb.PersistentClient(path=CHROMA_DIR)
205
+
206
+ def get_or_create_collection(client):
207
+ """Get or create the SQL knowledge collection."""
208
+ return client.get_or_create_collection(
209
+ name=COLLECTION_NAME,
210
+ metadata={"description": "SQL question-answer pairs with chunking metadata"}
211
+ )
212
+
213
+ # =============================================================================
214
+ # DATA LOADING
215
+ # =============================================================================
216
+
217
+ def load_datasets(data_dir="data"):
218
+ """Load ALL CSV datasets."""
219
+ datasets = {}
220
+
221
+ files = {
222
+ 'train': 'train.csv',
223
+ 'validation': 'validation.csv',
224
+ 'test': 'test.csv'
225
+ # 'synthetic': 'synthetic.csv'
226
+ }
227
+
228
+ for name, filename in files.items():
229
+ filepath = os.path.join(data_dir, filename)
230
+ if os.path.exists(filepath):
231
+ df = pd.read_csv(filepath)
232
+ datasets[name] = df
233
+ print(f" Loaded {name}: {len(df):,} rows")
234
+ else:
235
+ print(f" Skipped {name}: file not found")
236
+
237
+ return datasets
238
+
239
+ # =============================================================================
240
+ # KNOWLEDGE BASE BUILDING
241
+ # =============================================================================
242
+
243
+ def build_knowledge_base(data_dir="data", batch_size=500):
244
+ """Build knowledge base with chunking strategies."""
245
+
246
+ print("=" * 50)
247
+ print("BUILDING RAG KNOWLEDGE BASE")
248
+ print("With Chunking Strategies")
249
+ print("=" * 50)
250
+
251
+ setup_directories()
252
+
253
+ # Step 1: Load data
254
+ print(f"\n[1/5] Loading datasets...")
255
+ datasets = load_datasets(data_dir)
256
+
257
+ if not datasets:
258
+ print("ERROR: No datasets found!")
259
+ return None
260
+
261
+ total_rows = sum(len(df) for df in datasets.values())
262
+ print(f" Total rows: {total_rows:,}")
263
+
264
+ # Step 2: Prepare documents with chunking
265
+ print(f"\n[2/5] Applying chunking strategies...")
266
+ documents, metadatas, ids = prepare_documents_with_chunking(datasets)
267
+ print(f" Total documents: {len(documents):,}")
268
+
269
+ # Show chunking stats
270
+ complexities = [m['complexity'] for m in metadatas]
271
+ print(f" Complexity distribution:")
272
+ print(f" Simple: {complexities.count('simple'):,}")
273
+ print(f" Intermediate: {complexities.count('intermediate'):,}")
274
+ print(f" Complex: {complexities.count('complex'):,}")
275
+
276
+ # Step 3: Initialize ChromaDB
277
+ print(f"\n[3/5] Initializing ChromaDB...")
278
+ client = get_chroma_client()
279
+
280
+ try:
281
+ client.delete_collection(COLLECTION_NAME)
282
+ print(" Deleted existing collection")
283
+ except Exception:
284
+ pass
285
+
286
+ collection = get_or_create_collection(client)
287
+ print(f" Collection: {COLLECTION_NAME}")
288
+
289
+ # Step 4: Generate embeddings and store
290
+ print(f"\n[4/5] Generating embeddings...")
291
+
292
+ total_added = 0
293
+
294
+ for i in range(0, len(documents), batch_size):
295
+ batch_docs = documents[i:i + batch_size]
296
+ batch_meta = metadatas[i:i + batch_size]
297
+ batch_ids = ids[i:i + batch_size]
298
+
299
+ embeddings = get_embeddings_batch(batch_docs)
300
+
301
+ if embeddings and embeddings[0] is not None:
302
+ collection.add(
303
+ documents=batch_docs,
304
+ metadatas=batch_meta,
305
+ ids=batch_ids,
306
+ embeddings=embeddings
307
+ )
308
+ total_added += len(batch_docs)
309
+
310
+ progress = min(i + batch_size, len(documents))
311
+ pct = (progress / len(documents)) * 100
312
+ print(f" Progress: {progress:,}/{len(documents):,} ({pct:.1f}%)")
313
+
314
+ # Step 5: Save statistics
315
+ print(f"\n[5/5] Saving statistics...")
316
+ stats = {
317
+ 'total_documents': total_added,
318
+ 'sources': {name: len(df) for name, df in datasets.items()},
319
+ 'collection_name': COLLECTION_NAME,
320
+ 'embedding_model': 'all-MiniLM-L6-v2',
321
+ 'chunking_strategies': [
322
+ 'sql_clause_extraction',
323
+ 'complexity_classification',
324
+ 'keyword_extraction',
325
+ 'size_categorization'
326
+ ],
327
+ 'complexity_distribution': {
328
+ 'simple': complexities.count('simple'),
329
+ 'intermediate': complexities.count('intermediate'),
330
+ 'complex': complexities.count('complex')
331
+ },
332
+ 'created_at': datetime.now().isoformat()
333
+ }
334
+
335
+ with open(f'{STATS_DIR}/knowledge_base_stats.json', 'w') as f:
336
+ json.dump(stats, f, indent=2)
337
+
338
+ generate_report(stats)
339
+
340
+ print("\n" + "=" * 50)
341
+ print("COMPLETE")
342
+ print("=" * 50)
343
+ print(f" Documents indexed: {total_added:,}")
344
+ print(f" Storage: {CHROMA_DIR}/")
345
+
346
+ return collection
347
+
348
+ # =============================================================================
349
+ # REPORT GENERATION
350
+ # =============================================================================
351
+
352
+ def generate_report(stats):
353
+ """Generate knowledge base report."""
354
+
355
+ report = f"""# RAG Knowledge Base Report
356
+
357
+ **Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
358
+
359
+ ## Overview
360
+
361
+ | Metric | Value |
362
+ |--------|-------|
363
+ | Total Documents | {stats['total_documents']:,} |
364
+ | Collection Name | {stats['collection_name']} |
365
+ | Embedding Model | {stats['embedding_model']} |
366
+
367
+ ## Data Sources
368
+
369
+ | Source | Documents |
370
+ |--------|-----------|
371
+ """
372
+
373
+ for source, count in stats['sources'].items():
374
+ report += f"| {source} | {count:,} |\n"
375
+
376
+ report += f"""
377
+ ## Chunking Strategies
378
+
379
+ 1. **SQL Clause Extraction**: Identifies SELECT, FROM, WHERE, GROUP BY, etc.
380
+ 2. **Complexity Classification**: Categorizes as simple/intermediate/complex
381
+ 3. **Keyword Extraction**: Extracts SQL operations (JOIN, COUNT, etc.)
382
+ 4. **Size Categorization**: Classifies question/SQL length
383
+
384
+ ## Complexity Distribution
385
+
386
+ | Level | Count |
387
+ |-------|-------|
388
+ | Simple | {stats['complexity_distribution']['simple']:,} |
389
+ | Intermediate | {stats['complexity_distribution']['intermediate']:,} |
390
+ | Complex | {stats['complexity_distribution']['complex']:,} |
391
+
392
+ ## Document Metadata Structure
393
+
394
+ Each document contains:
395
+ - `sql`: The SQL query
396
+ - `source`: Origin dataset
397
+ - `question`: Original question
398
+ - `complexity`: simple/intermediate/complex
399
+ - `sql_clauses`: Comma-separated clauses
400
+ - `keywords`: SQL keywords found
401
+ - `question_size`: short/medium/long
402
+ - `sql_size`: short/medium/long
403
+ """
404
+
405
+ with open(f'{REPORT_DIR}/knowledge_base_report.md', 'w') as f:
406
+ f.write(report)
407
+
408
+ print(f" Report saved to {REPORT_DIR}/")
409
+
410
+ # =============================================================================
411
+ # ENTRY POINT
412
+ # =============================================================================
413
+
414
+ if __name__ == "__main__":
415
+ build_knowledge_base(data_dir="data", batch_size=500)
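For reference, a minimal sketch of how the chunking metadata stored by build_knowledge_base could be used at query time. The path "chromadb_data", the collection name "sql_knowledge", and the complexity field come from this repo; the sample question is hypothetical, and the snippet assumes the collection was built with all-MiniLM-L6-v2 (384-dim) embeddings so that Chroma's default query embedding is dimensionally compatible:

import chromadb

client = chromadb.PersistentClient(path="chromadb_data")
collection = client.get_collection("sql_knowledge")

# Restrict retrieval to examples tagged 'simple' by chunk_by_complexity()
results = collection.query(
    query_texts=["List all customers from Canada"],  # hypothetical question
    n_results=3,
    where={"complexity": "simple"},
)
for question, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(question, "->", meta["sql"])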
src/rag/retriever.py ADDED
@@ -0,0 +1,234 @@
1
+ """
2
+ Retriever Module for RAG System
3
+ Loads from: Local ChromaDB OR HuggingFace Hub
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+
12
+ # Try new imports first, fall back to old
13
+ try:
14
+ from langchain_huggingface import HuggingFaceEmbeddings
15
+ from langchain_chroma import Chroma
16
+ except ImportError:
17
+ from langchain_community.vectorstores import Chroma
18
+ from langchain_community.embeddings import HuggingFaceEmbeddings
19
+
20
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21
+
22
+ # =============================================================================
23
+ # CONFIGURATION
24
+ # =============================================================================
25
+
26
+ LOCAL_CHROMADB_DIR = "chromadb_data"
27
+ HF_CHROMADB_ID = os.getenv("HF_CHROMADB_ID", None)
28
+ COLLECTION_NAME = "sql_knowledge"
29
+ EMBEDDING_MODEL = "all-MiniLM-L6-v2"
30
+
31
+ # =============================================================================
32
+ # CHROMADB LOADER
33
+ # =============================================================================
34
+
35
+ def ensure_chromadb_exists():
36
+ """Ensure ChromaDB data exists - download from HF if needed."""
37
+
38
+ # Check if local has actual ChromaDB files (not just empty folder)
39
+ if os.path.exists(LOCAL_CHROMADB_DIR):
40
+ local_files = os.listdir(LOCAL_CHROMADB_DIR) if os.path.isdir(LOCAL_CHROMADB_DIR) else []
41
+ # ChromaDB creates files like chroma.sqlite3 or folders
42
+ has_chroma_files = any('chroma' in f.lower() or 'sqlite' in f.lower() for f in local_files) or len(local_files) > 2
43
+
44
+ if has_chroma_files:
45
+ print(f"📁 Using local ChromaDB: {LOCAL_CHROMADB_DIR}")
46
+ return LOCAL_CHROMADB_DIR
47
+ else:
48
+ print(f"⚠️ ChromaDB folder exists but is empty or incomplete")
49
+
50
+ # Download from HuggingFace
51
+ if HF_CHROMADB_ID:
52
+ print(f"☁️ Downloading ChromaDB from HuggingFace: {HF_CHROMADB_ID}")
53
+ from huggingface_hub import snapshot_download
54
+
55
+ # Create folder if not exists
56
+ os.makedirs(LOCAL_CHROMADB_DIR, exist_ok=True)
57
+
58
+ snapshot_download(
59
+ repo_id=HF_CHROMADB_ID,
60
+ repo_type="dataset",
61
+ local_dir=LOCAL_CHROMADB_DIR
62
+ )
63
+ print("✓ ChromaDB downloaded!")
64
+ return LOCAL_CHROMADB_DIR
65
+
66
+ # Need to build it from data
67
+ print("⚠️ ChromaDB not found and no HF_CHROMADB_ID set. Building from data...")
68
+ from rag.knowledge_base import build_knowledge_base
69
+ build_knowledge_base(data_dir="data", batch_size=500)
70
+ return LOCAL_CHROMADB_DIR
71
+
72
+ # =============================================================================
73
+ # LANGCHAIN EMBEDDINGS
74
+ # =============================================================================
75
+
76
+ def get_embeddings():
77
+ """Get HuggingFace embeddings for LangChain."""
78
+ return HuggingFaceEmbeddings(
79
+ model_name=EMBEDDING_MODEL,
80
+ model_kwargs={'device': 'cpu'},
81
+ encode_kwargs={'normalize_embeddings': True}
82
+ )
83
+
84
+ # =============================================================================
85
+ # RANKING FUNCTIONS
86
+ # =============================================================================
87
+
88
+ def calculate_relevance_score(result, query):
89
+ """Calculate enhanced relevance score."""
90
+ base_score = result.get('score', 0.5)
91
+ boost = 0.0
92
+
93
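+ # Boost for word overlap between the query and the stored question (at most +0.25)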
+ query_words = set(query.lower().split())
94
+ question_words = set(result.get('question', '').lower().split())
95
+ overlap = len(query_words & question_words)
96
+ if overlap > 0:
97
+ boost += 0.05 * min(overlap, 5)
98
+
99
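+ # Prefer simple examples for short queries and complex examples for long queries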
+ query_length = len(query.split())
100
+ if query_length <= 8 and result.get('complexity') == 'simple':
101
+ boost += 0.1
102
+ elif query_length > 15 and result.get('complexity') == 'complex':
103
+ boost += 0.1
104
+
105
+ return base_score + boost
106
+
107
+ def rerank_results(results, query):
108
+ """Re-rank results using enhanced relevance scoring."""
109
+ for r in results:
110
+ r['relevance_score'] = calculate_relevance_score(r, query)
111
+ results.sort(key=lambda x: x['relevance_score'], reverse=True)
112
+ return results
113
+
114
+ # =============================================================================
115
+ # FILTERING FUNCTIONS
116
+ # =============================================================================
117
+
118
+ def filter_by_threshold(results, min_score=0.0):
119
+ return [r for r in results if r.get('score', 0) >= min_score]
120
+
121
+ def filter_by_complexity(results, complexity=None):
122
+ if complexity is None:
123
+ return results
124
+ return [r for r in results if r.get('complexity') == complexity]
125
+
126
+ # =============================================================================
127
+ # SQL RETRIEVER CLASS
128
+ # =============================================================================
129
+
130
+ class SQLRetriever:
131
+ """LangChain-based retriever with local/HuggingFace support."""
132
+
133
+ def __init__(self):
134
+ """Initialize the retriever."""
135
+ print("Initializing SQL Retriever...")
136
+
137
+ # Ensure ChromaDB exists
138
+ chromadb_path = ensure_chromadb_exists()
139
+
140
+ # Load embeddings
141
+ self.embeddings = get_embeddings()
142
+
143
+ # Load ChromaDB
144
+ self.vectorstore = Chroma(
145
+ collection_name=COLLECTION_NAME,
146
+ persist_directory=chromadb_path,
147
+ embedding_function=self.embeddings
148
+ )
149
+
150
+ self.doc_count = self.vectorstore._collection.count()
151
+ print(f"✓ Loaded {self.doc_count:,} documents from {chromadb_path}")
152
+
153
+ def retrieve(self, query, top_k=5, min_score=None, complexity=None, rerank=True):
154
+ """Retrieve similar questions with filtering and ranking."""
155
+
156
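+ # Over-fetch candidates (up to 3x top_k, capped at 50) so filtering and re-ranking still leave top_k results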
+ fetch_k = min(top_k * 3, 50)
157
+ docs_with_scores = self.vectorstore.similarity_search_with_score(query, k=fetch_k)
158
+
159
+ # Format results
160
+ formatted = []
161
+ for doc, score in docs_with_scores:
162
+ formatted.append({
163
+ 'question': doc.page_content,
164
+ 'sql': doc.metadata.get('sql', ''),
165
+ 'source': doc.metadata.get('source', 'unknown'),
166
+ 'complexity': doc.metadata.get('complexity', 'unknown'),
167
+ 'keywords': doc.metadata.get('keywords', ''),
168
+ 'sql_clauses': doc.metadata.get('sql_clauses', ''),
169
+ 'distance': score,
170
+ 'score': 1 - score if score <= 1 else 1 / (1 + score)
171
+ })
172
+
173
+ # Apply filters
174
+ if min_score is not None:
175
+ formatted = filter_by_threshold(formatted, min_score)
176
+
177
+ if complexity is not None:
178
+ formatted = filter_by_complexity(formatted, complexity)
179
+
180
+ # Apply re-ranking
181
+ if rerank:
182
+ formatted = rerank_results(formatted, query)
183
+
184
+ return formatted[:top_k]
185
+
186
+ def retrieve_as_context(self, query, top_k=5):
187
+ """Retrieve and format as context for LLM prompt."""
188
+ results = self.retrieve(query, top_k=top_k)
189
+
190
+ if not results:
191
+ return ""
192
+
193
+ context = "Similar SQL examples:\n\n"
194
+ for i, r in enumerate(results, 1):
195
+ context += f"Example {i}:\n"
196
+ context += f"Question: {r['question']}\n"
197
+ context += f"SQL: {r['sql']}\n\n"
198
+
199
+ return context
200
+
201
+ def get_stats(self):
202
+ """Get retriever statistics."""
203
+ return {
204
+ 'total_documents': self.doc_count,
205
+ 'collection_name': COLLECTION_NAME,
206
+ 'embedding_model': EMBEDDING_MODEL,
207
+ }
208
+
209
+ # =============================================================================
210
+ # TEST
211
+ # =============================================================================
212
+
213
+ def test_retriever():
214
+ """Test retriever."""
215
+ print("=" * 60)
216
+ print("TESTING SQL RETRIEVER")
217
+ print("=" * 60)
218
+
219
+ retriever = SQLRetriever()
220
+
221
+ query = "Find all employees with salary above 50000"
222
+ results = retriever.retrieve(query, top_k=3)
223
+
224
+ print(f"\nQuery: {query}\n")
225
+ for i, r in enumerate(results, 1):
226
+ print(f"Result {i}: (score: {r['score']:.3f})")
227
+ print(f" Q: {r['question'][:60]}...")
228
+ print(f" SQL: {r['sql'][:60]}...")
229
+ print()
230
+
231
+ print("✓ Test complete")
232
+
233
+ if __name__ == "__main__":
234
+ test_retriever()
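A minimal usage sketch of the retriever above; the question text, score threshold, and complexity filter are illustrative values for the parameters exposed by SQLRetriever.retrieve():

from rag.retriever import SQLRetriever

retriever = SQLRetriever()

# Keep only reasonably similar, intermediate-complexity matches (values are illustrative)
results = retriever.retrieve(
    "total revenue per product category",
    top_k=3,
    min_score=0.3,
    complexity="intermediate",
)

# Or get a ready-to-paste prompt context block
context = retriever.retrieve_as_context("total revenue per product category", top_k=3)
print(context)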
src/requirements.txt ADDED
@@ -0,0 +1,23 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu130
2
+
3
+
4
+ streamlit
5
+ chromadb
6
+ google-generativeai
7
+ python-dotenv
8
+ pandas
9
+ datasets
10
+ transformers
11
+ peft
12
+ accelerate
13
+ bitsandbytes
14
+ torch
15
+ torchvision
16
+ torchaudio
17
+ sentencepiece
18
+ huggingface_hub
19
+ matplotlib
20
+ sentence-transformers
21
+ langchain
22
+ langchain-community
23
+ langchain-chroma
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
src/synthetic/__init__.py ADDED
File without changes
src/synthetic/generate_data.py ADDED
@@ -0,0 +1,401 @@
1
+ """
2
+ Synthetic Data Generation for SQL Learning Assistant
3
+
4
+ Covers:
5
+ 1. Create synthetic datasets for training/testing
6
+ 2. Implement data augmentation techniques
7
+ 3. Ensure diversity and quality of generated data
8
+ 4. Address privacy and ethical considerations
9
+ """
10
+
11
+ import pandas as pd
12
+ import random
13
+ import re
14
+ import hashlib
15
+ import json
16
+ from collections import Counter
17
+ from datetime import datetime
18
+ import matplotlib.pyplot as plt
19
+ import os
20
+ import sys
21
+
22
+ # Add parent directory to path for imports
23
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
24
+ from synthetic.synonyms import SYNONYMS, get_synonym, has_synonym
25
+
26
+ # =============================================================================
27
+ # OUTPUT DIRECTORIES
28
+ # =============================================================================
29
+
30
+ OUTPUT_DIR = "outputs/synthetic"
31
+ VIZ_DIR = f"{OUTPUT_DIR}/visualizations"
32
+ REPORT_DIR = f"{OUTPUT_DIR}/reports"
33
+ STATS_DIR = f"{OUTPUT_DIR}/stats"
34
+
35
+ def setup_directories():
36
+ """Create output directories."""
37
+ for d in [OUTPUT_DIR, VIZ_DIR, REPORT_DIR, STATS_DIR]:
38
+ os.makedirs(d, exist_ok=True)
39
+
40
+ # =============================================================================
41
+ # SENTENCE VARIATIONS
42
+ # =============================================================================
43
+
44
+ PREFIXES = ["", "Can you ", "Please ", "I want to ", "I need to ",
45
+ "Could you ", "Help me ", "Show me how to "]
46
+
47
+ SUFFIXES = ["", "?", " please", " for me", " please?"]
48
+
49
+ # =============================================================================
50
+ # AUGMENTATION TECHNIQUES
51
+ # =============================================================================
52
+
53
+ def replace_synonyms(text, prob=0.4):
54
+ """Technique 1: Replace words with synonyms."""
55
+ words = text.split()
56
+ result = []
57
+ for word in words:
58
+ clean = re.sub(r'[^\w]', '', word).lower()
59
+ if has_synonym(clean) and random.random() < prob:
60
+ syn = get_synonym(clean)
61
+ result.append(syn if word[-1] not in '.,?!' else syn + word[-1])
62
+ else:
63
+ result.append(word)
64
+ return ' '.join(result)
65
+
66
+ def random_insertion(text, prob=0.15):
67
+ """Technique 2: Insert contextual words."""
68
+ inserts = ["also", "specifically", "exactly", "just", "only"]
69
+ words = text.split()
70
+ if len(words) > 3 and random.random() < prob:
71
+ pos = random.randint(1, len(words) - 1)
72
+ words.insert(pos, random.choice(inserts))
73
+ return ' '.join(words)
74
+
75
+ def random_swap(text, prob=0.1):
76
+ """Technique 3: Swap adjacent words."""
77
+ words = text.split()
78
+ if len(words) > 4 and random.random() < prob:
79
+ pos = random.randint(1, len(words) - 3)
80
+ words[pos], words[pos + 1] = words[pos + 1], words[pos]
81
+ return ' '.join(words)
82
+
83
+ def structure_variation(text):
84
+ """Technique 4: Add prefixes and suffixes."""
85
+ prefix = random.choice(PREFIXES)
86
+ suffix = random.choice(SUFFIXES)
87
+ if prefix:
88
+ text = text[0].lower() + text[1:] if text else text
89
+ result = prefix + text + suffix
90
+ return result[0].upper() + result[1:] if result else result
91
+
92
+ def case_variation(text):
93
+ """Technique 5: Vary capitalization."""
94
+ r = random.random()
95
+ if r < 0.6:
96
+ return text[0].upper() + text[1:].lower() if text else text
97
+ elif r < 0.85:
98
+ return text.lower()
99
+ return text
100
+
101
+ def generate_variation(question):
102
+ """Apply all augmentation techniques."""
103
+ variation = question
104
+ variation = replace_synonyms(variation)
105
+ variation = random_insertion(variation)
106
+ variation = random_swap(variation)
107
+ variation = structure_variation(variation)
108
+ variation = case_variation(variation)
109
+ return variation
110
+
111
+ # =============================================================================
112
+ # QUALITY AND DIVERSITY
113
+ # =============================================================================
114
+
115
+ def diversity_score(original, variation):
116
+ """Calculate diversity between original and variation."""
117
+ orig_words = set(original.lower().split())
118
+ var_words = set(variation.lower().split())
119
+ if not orig_words or not var_words:
120
+ return 0
121
+ intersection = orig_words & var_words
122
+ union = orig_words | var_words
123
+ return 1 - (len(intersection) / len(union))
124
+
125
+ def quality_check(question, sql):
126
+ """Check if generated data passes quality standards."""
127
+ if not question or len(question.strip()) < 10:
128
+ return False
129
+ if not sql or len(sql.strip()) < 5:
130
+ return False
131
+ if not re.search(r'[a-zA-Z]', question):
132
+ return False
133
+ if len(question) > 500:
134
+ return False
135
+ return True
136
+
137
+ def remove_duplicates(data):
138
+ """Remove duplicate entries."""
139
+ seen = set()
140
+ unique = []
141
+ for item in data:
142
+ normalized = re.sub(r'[^\w\s]', '', item['question'].lower())
143
+ normalized = ' '.join(normalized.split())
144
+ h = hashlib.md5(normalized.encode()).hexdigest()
145
+ if h not in seen:
146
+ seen.add(h)
147
+ unique.append(item)
148
+ return unique
149
+
150
+ # =============================================================================
151
+ # PRIVACY (ETHICAL CONSIDERATIONS)
152
+ # =============================================================================
153
+
154
+ def anonymize(text):
155
+ """Remove sensitive information."""
156
+ text = re.sub(r'\b[\w.-]+@[\w.-]+\.\w+\b', '[EMAIL]', text)
157
+ text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]', text)
158
+ text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
159
+ return text
160
+
161
+ # =============================================================================
162
+ # STATISTICS
163
+ # =============================================================================
164
+
165
+ def calculate_stats(original_df, synthetic_df):
166
+ """Calculate dataset statistics."""
167
+ def get_stats(df, name):
168
+ questions = df['question'].tolist()
169
+ lengths = [len(q.split()) for q in questions]
170
+ return {
171
+ 'name': name,
172
+ 'samples': len(df),
173
+ 'avg_length': round(sum(lengths) / len(lengths), 2),
174
+ 'min_length': min(lengths),
175
+ 'max_length': max(lengths),
176
+ 'unique_words': len(set(' '.join(questions).lower().split()))
177
+ }
178
+
179
+ orig_stats = get_stats(original_df, 'Original')
180
+ synth_stats = get_stats(synthetic_df, 'Synthetic')
181
+
182
+ diversity_scores = synthetic_df['diversity_score'].tolist()
183
+ diversity_stats = {
184
+ 'avg': round(sum(diversity_scores) / len(diversity_scores), 4),
185
+ 'min': round(min(diversity_scores), 4),
186
+ 'max': round(max(diversity_scores), 4)
187
+ }
188
+
189
+ return {
190
+ 'original': orig_stats,
191
+ 'synthetic': synth_stats,
192
+ 'diversity': diversity_stats,
193
+ 'augmentation_factor': round(len(synthetic_df) / len(original_df), 2)
194
+ }
195
+
196
+ # =============================================================================
197
+ # VISUALIZATIONS
198
+ # =============================================================================
199
+
200
+ def create_visualizations(original_df, synthetic_df):
201
+ """Create and save visualizations."""
202
+ plt.style.use('seaborn-v0_8-whitegrid')
203
+
204
+ # 1. Dataset Size Comparison
205
+ fig, ax = plt.subplots(figsize=(8, 5))
206
+ sizes = [len(original_df), len(synthetic_df)]
207
+ bars = ax.bar(['Original', 'Synthetic'], sizes, color=['#3498db', '#2ecc71'])
208
+ for bar, size in zip(bars, sizes):
209
+ ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 20,
210
+ f'{size:,}', ha='center', fontweight='bold')
211
+ ax.set_ylabel('Samples')
212
+ ax.set_title('Dataset Size Comparison')
213
+ plt.savefig(f'{VIZ_DIR}/01_size_comparison.png', dpi=150, bbox_inches='tight')
214
+ plt.close()
215
+
216
+ # 2. Question Length Distribution
217
+ fig, axes = plt.subplots(1, 2, figsize=(12, 4))
218
+ orig_len = [len(q.split()) for q in original_df['question']]
219
+ synth_len = [len(q.split()) for q in synthetic_df['question']]
220
+
221
+ axes[0].hist(orig_len, bins=25, color='#3498db', alpha=0.7)
222
+ axes[0].set_title('Original - Question Length')
223
+ axes[0].set_xlabel('Words')
224
+
225
+ axes[1].hist(synth_len, bins=25, color='#2ecc71', alpha=0.7)
226
+ axes[1].set_title('Synthetic - Question Length')
227
+ axes[1].set_xlabel('Words')
228
+
229
+ plt.tight_layout()
230
+ plt.savefig(f'{VIZ_DIR}/02_length_distribution.png', dpi=150, bbox_inches='tight')
231
+ plt.close()
232
+
233
+ # 3. Diversity Score Distribution
234
+ fig, ax = plt.subplots(figsize=(8, 5))
235
+ ax.hist(synthetic_df['diversity_score'], bins=20, color='#9b59b6', alpha=0.7)
236
+ ax.axvline(synthetic_df['diversity_score'].mean(), color='red', linestyle='--',
237
+ label=f"Mean: {synthetic_df['diversity_score'].mean():.3f}")
238
+ ax.set_xlabel('Diversity Score')
239
+ ax.set_ylabel('Frequency')
240
+ ax.set_title('Diversity Score Distribution')
241
+ ax.legend()
242
+ plt.savefig(f'{VIZ_DIR}/03_diversity_distribution.png', dpi=150, bbox_inches='tight')
243
+ plt.close()
244
+
245
+ print(f" Visualizations saved to {VIZ_DIR}/")
246
+
247
+ # =============================================================================
248
+ # REPORT GENERATION
249
+ # =============================================================================
250
+
251
+ def generate_report(stats):
252
+ """Generate markdown report."""
253
+ report = f"""# Synthetic Data Generation Report
254
+
255
+ **Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
256
+
257
+ ## Dataset Statistics
258
+
259
+ | Metric | Original | Synthetic |
260
+ |--------|----------|-----------|
261
+ | Samples | {stats['original']['samples']:,} | {stats['synthetic']['samples']:,} |
262
+ | Avg Length | {stats['original']['avg_length']} | {stats['synthetic']['avg_length']} |
263
+ | Min Length | {stats['original']['min_length']} | {stats['synthetic']['min_length']} |
264
+ | Max Length | {stats['original']['max_length']} | {stats['synthetic']['max_length']} |
265
+ | Unique Words | {stats['original']['unique_words']:,} | {stats['synthetic']['unique_words']:,} |
266
+
267
+ ## Augmentation Results
268
+
269
+ - **Augmentation Factor:** {stats['augmentation_factor']}x
270
+ - **Avg Diversity Score:** {stats['diversity']['avg']}
271
+ - **Min Diversity Score:** {stats['diversity']['min']}
272
+ - **Max Diversity Score:** {stats['diversity']['max']}
273
+
274
+ ## Techniques Used
275
+
276
+ 1. Synonym Replacement (40% probability)
277
+ 2. Random Insertion (15% probability)
278
+ 3. Random Swap (10% probability)
279
+ 4. Structure Variation (prefix/suffix)
280
+ 5. Case Variation
281
+
282
+ ## Quality Controls
283
+
284
+ - Minimum question length: 10 characters
285
+ - Maximum question length: 500 characters
286
+ - Minimum diversity score: 0.1
287
+ - Duplicate removal via MD5 hashing
288
+
289
+ ## Privacy Measures
290
+
291
+ - Email anonymization
292
+ - Phone number anonymization
293
+ - SSN anonymization
294
+
295
+ ## Visualizations
296
+
297
+ - `01_size_comparison.png` - Dataset size comparison
298
+ - `02_length_distribution.png` - Question length distribution
299
+ - `03_diversity_distribution.png` - Diversity score distribution
300
+ """
301
+
302
+ with open(f'{REPORT_DIR}/synthetic_report.md', 'w') as f:
303
+ f.write(report)
304
+ print(f" Report saved to {REPORT_DIR}/synthetic_report.md")
305
+
306
+ # =============================================================================
307
+ # MAIN PIPELINE
308
+ # =============================================================================
309
+
310
+ def generate_synthetic_data(input_csv, output_csv, sample_size=500, variations=3, min_diversity=0.1):
311
+ """Main synthetic data generation pipeline."""
312
+
313
+ print("=" * 50)
314
+ print("SYNTHETIC DATA GENERATION")
315
+ print("=" * 50)
316
+
317
+ # Setup
318
+ setup_directories()
319
+
320
+ # Load data
321
+ print(f"\n[1/6] Loading {input_csv}...")
322
+ df = pd.read_csv(input_csv)
323
+ sample_df = df.sample(n=min(sample_size, len(df)), random_state=42)
324
+ print(f" Sampled {len(sample_df)} rows")
325
+
326
+ # Generate variations
327
+ print(f"\n[2/6] Generating variations...")
328
+ synthetic_data = []
329
+ skipped = 0
330
+
331
+ for _, row in sample_df.iterrows():
332
+ question = anonymize(str(row['question']))
333
+ sql = anonymize(str(row['sql']))
334
+
335
+ for _ in range(variations):
336
+ variation = generate_variation(question)
337
+ div_score = diversity_score(question, variation)
338
+
339
+ if div_score < min_diversity or not quality_check(variation, sql):
340
+ skipped += 1
341
+ continue
342
+
343
+ synthetic_data.append({
344
+ 'question': variation,
345
+ 'sql': sql,
346
+ 'original_question': question,
347
+ 'diversity_score': round(div_score, 3),
348
+ 'is_synthetic': True
349
+ })
350
+
351
+ print(f" Generated: {len(synthetic_data)}, Skipped: {skipped}")
352
+
353
+ # Remove duplicates
354
+ print(f"\n[3/6] Removing duplicates...")
355
+ before = len(synthetic_data)
356
+ synthetic_data = remove_duplicates(synthetic_data)
357
+ print(f" Removed {before - len(synthetic_data)} duplicates")
358
+
359
+ # Save data
360
+ print(f"\n[4/6] Saving data...")
361
+ synthetic_df = pd.DataFrame(synthetic_data)
362
+ synthetic_df.to_csv(output_csv, index=False)
363
+ print(f" Saved to {output_csv}")
364
+
365
+ # Calculate stats
366
+ print(f"\n[5/6] Calculating statistics...")
367
+ stats = calculate_stats(sample_df, synthetic_df)
368
+
369
+ # Save stats as JSON
370
+ with open(f'{STATS_DIR}/statistics.json', 'w') as f:
371
+ json.dump(stats, f, indent=2)
372
+ print(f" Stats saved to {STATS_DIR}/statistics.json")
373
+
374
+ # Generate visualizations and report
375
+ print(f"\n[6/6] Creating outputs...")
376
+ create_visualizations(sample_df, synthetic_df)
377
+ generate_report(stats)
378
+
379
+ # Summary
380
+ print("\n" + "=" * 50)
381
+ print("COMPLETE")
382
+ print("=" * 50)
383
+ print(f" Original: {stats['original']['samples']:,} samples")
384
+ print(f" Synthetic: {stats['synthetic']['samples']:,} samples")
385
+ print(f" Augmentation: {stats['augmentation_factor']}x")
386
+ print(f" Avg Diversity: {stats['diversity']['avg']}")
387
+
388
+ return synthetic_df
389
+
390
+ # =============================================================================
391
+ # ENTRY POINT
392
+ # =============================================================================
393
+
394
+ if __name__ == "__main__":
395
+ generate_synthetic_data(
396
+ input_csv="data/train.csv",
397
+ output_csv="data/synthetic.csv",
398
+ sample_size=52527,
399
+ variations=3,
400
+ min_diversity=0.1
401
+ )
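A quick sketch of how one augmented variation is produced with the functions above. Every technique is probabilistic, so output differs between runs; the result shown in the comment is only illustrative, and the SQL string is a made-up example:

from synthetic.generate_data import generate_variation, diversity_score, quality_check

question = "Find all employees with salary above 50000"
variation = generate_variation(question)  # e.g. "Can you show all staff with pay above 50000?"

print(variation)
print(diversity_score(question, variation))  # word-level Jaccard distance from the original
print(quality_check(variation, "SELECT * FROM employees WHERE salary > 50000"))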
src/synthetic/synonyms.py ADDED
@@ -0,0 +1,149 @@
1
+ """
2
+ Synonym Dictionary for SQL Question Augmentation
3
+ """
4
+
5
+ import random
6
+
7
+ # =============================================================================
8
+ # SYNONYMS BY CATEGORY
9
+ # =============================================================================
10
+
11
+ # Action Verbs
12
+ QUERY_VERBS = {
13
+ "find": ["get", "show", "display", "list", "retrieve", "fetch", "return", "select"],
14
+ "show": ["display", "list", "get", "find", "retrieve", "present"],
15
+ "get": ["find", "show", "display", "list", "retrieve", "fetch", "obtain"],
16
+ "list": ["show", "display", "get", "find", "enumerate"],
17
+ "retrieve": ["get", "fetch", "find", "obtain", "extract"],
18
+ "select": ["choose", "pick", "get", "retrieve", "find"],
19
+ "search": ["find", "look for", "look up", "query"],
20
+ }
21
+
22
+ CALCULATION_VERBS = {
23
+ "calculate": ["compute", "determine", "find", "figure out", "work out"],
24
+ "compute": ["calculate", "determine", "figure out"],
25
+ "count": ["tally", "enumerate", "number", "total up"],
26
+ "sum": ["total", "add up", "aggregate"],
27
+ "average": ["mean", "find the average of"],
28
+ }
29
+
30
+ MANIPULATION_VERBS = {
31
+ "sort": ["order", "arrange", "rank", "organize"],
32
+ "filter": ["narrow down", "limit", "restrict", "select"],
33
+ "group": ["categorize", "organize", "cluster", "aggregate"],
34
+ "join": ["combine", "merge", "connect", "link"],
35
+ "update": ["modify", "change", "edit", "alter"],
36
+ "delete": ["remove", "erase", "drop", "eliminate"],
37
+ "insert": ["add", "create", "put", "include"],
38
+ }
39
+
40
+ # Comparison Terms
41
+ COMPARISONS = {
42
+ "greater than": ["more than", "above", "exceeding", "over", "higher than"],
43
+ "less than": ["below", "under", "fewer than", "smaller than", "lower than"],
44
+ "equal to": ["equals", "is", "matching", "same as"],
45
+ "between": ["in the range of", "ranging from", "within"],
46
+ "contains": ["includes", "has", "with"],
47
+ }
48
+
49
+ # Aggregation Terms
50
+ AGGREGATIONS = {
51
+ "maximum": ["highest", "largest", "greatest", "max", "top"],
52
+ "minimum": ["lowest", "smallest", "least", "min", "bottom"],
53
+ "average": ["mean", "avg"],
54
+ "total": ["sum", "combined", "overall", "aggregate"],
55
+ "count": ["number of", "how many", "total number of"],
56
+ "distinct": ["unique", "different", "separate"],
57
+ }
58
+
59
+ # Business Entities
60
+ ENTITIES = {
61
+ "employees": ["workers", "staff", "personnel", "team members"],
62
+ "customers": ["clients", "users", "buyers", "patrons"],
63
+ "products": ["items", "goods", "merchandise"],
64
+ "orders": ["purchases", "transactions", "sales"],
65
+ "suppliers": ["vendors", "providers", "distributors"],
66
+ "company": ["firm", "organization", "business"],
67
+ "department": ["dept", "division", "section", "unit"],
68
+ "manager": ["supervisor", "boss", "lead", "head"],
69
+ }
70
+
71
+ # Financial Terms
72
+ FINANCIAL = {
73
+ "price": ["cost", "amount", "value", "rate"],
74
+ "salary": ["pay", "wage", "income", "earnings"],
75
+ "revenue": ["income", "earnings", "sales"],
76
+ "profit": ["earnings", "gain", "margin"],
77
+ "cost": ["price", "expense", "charge"],
78
+ }
79
+
80
+ # Time Terms
81
+ TIME_TERMS = {
82
+ "date": ["day", "time", "period"],
83
+ "year": ["annum", "calendar year"],
84
+ "month": ["period", "calendar month"],
85
+ "recent": ["latest", "newest", "most recent"],
86
+ "current": ["present", "existing", "active"],
87
+ "previous": ["prior", "former", "past", "earlier"],
88
+ "last": ["final", "most recent", "latest"],
89
+ "first": ["initial", "earliest", "beginning"],
90
+ }
91
+
92
+ # Quantifiers
93
+ QUANTIFIERS = {
94
+ "all": ["every", "each", "the entire", "complete"],
95
+ "some": ["a few", "certain", "several"],
96
+ "many": ["numerous", "multiple", "several"],
97
+ "few": ["some", "a small number of", "limited"],
98
+ "only": ["just", "solely", "exclusively"],
99
+ }
100
+
101
+ # Adjectives
102
+ ADJECTIVES = {
103
+ "highest": ["greatest", "maximum", "largest", "top"],
104
+ "lowest": ["smallest", "minimum", "least", "bottom"],
105
+ "active": ["current", "live", "enabled"],
106
+ "inactive": ["disabled", "dormant", "idle"],
107
+ "new": ["recent", "latest", "fresh"],
108
+ "old": ["previous", "former", "past"],
109
+ }
110
+
111
+ # =============================================================================
112
+ # COMBINED DICTIONARY
113
+ # =============================================================================
114
+
115
+ def get_all_synonyms():
116
+ """Combine all synonym dictionaries."""
117
+ all_synonyms = {}
118
+ for d in [QUERY_VERBS, CALCULATION_VERBS, MANIPULATION_VERBS,
119
+ COMPARISONS, AGGREGATIONS, ENTITIES, FINANCIAL,
120
+ TIME_TERMS, QUANTIFIERS, ADJECTIVES]:
121
+ all_synonyms.update(d)
122
+ return all_synonyms
123
+
124
+ SYNONYMS = get_all_synonyms()
125
+
126
+ # =============================================================================
127
+ # UTILITY FUNCTIONS
128
+ # =============================================================================
129
+
130
+ def get_synonym(word):
131
+ """Get a random synonym for a word."""
132
+ word_lower = word.lower()
133
+ if word_lower in SYNONYMS:
134
+ return random.choice(SYNONYMS[word_lower])
135
+ return word
136
+
137
+ def has_synonym(word):
138
+ """Check if a word has synonyms."""
139
+ return word.lower() in SYNONYMS
140
+
141
+ def print_stats():
142
+ """Print synonym statistics."""
143
+ total_words = len(SYNONYMS)
144
+ total_synonyms = sum(len(v) for v in SYNONYMS.values())
145
+ print(f"Total words: {total_words}")
146
+ print(f"Total synonyms: {total_synonyms}")
147
+
148
+ if __name__ == "__main__":
149
+ print_stats()
src/tests/test_finetuned.py ADDED
File without changes
src/tests/test_rag.py ADDED
File without changes
src/tests/test_synthetic.py ADDED
File without changes