Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- Dockerfile +22 -0
- README.md +26 -6
- app.py +6 -0
- bronze.py +47 -0
- gold.py +50 -0
- requirements.txt +4 -0
- silver.py +51 -0
- train.py +74 -0
Dockerfile
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python slim image as a parent image for a smaller size
FROM python:3.12-slim

# Gradio listens on 127.0.0.1 unless told otherwise, which would make the
# exposed port unreachable from outside the container. Bind to all interfaces.
ENV GRADIO_SERVER_NAME=0.0.0.0

# Set the working directory inside the container
WORKDIR /app

# Copy the requirements file first to leverage Docker's layer caching
COPY requirements.txt .

# Install the Python dependencies
# NOTE(review): app.py imports gradio, but requirements.txt does not list it -
# confirm it gets installed, otherwise the CMD below fails at import time.
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of your application's code, including the trained models
# The .dockerignore file will ensure unnecessary files are excluded
COPY . .

# Expose the port that Gradio runs on
EXPOSE 7860

# The command to run when the container starts
# This will launch your Gradio application
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,12 +1,32 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Credit Card Fraud Detection with DuckDB
|
| 3 |
+
emoji: 💳
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: "4.44.0"
|
| 8 |
+
python_version: "3.10"
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Credit Card Fraud Detection with DuckDB and Medallion Architecture
|
| 14 |
+
|
| 15 |
+
This project demonstrates an end-to-end pipeline for credit card fraud detection. It uses DuckDB to process data in a Medallion Architecture (Bronze, Silver, Gold) and trains a Random Forest model to identify fraudulent transactions.
|
| 16 |
+
|
| 17 |
+
## Project Structure
|
| 18 |
+
|
| 19 |
+
- `data/`: Contains the raw CSV datasets (`fraudTrain.csv`, `fraudTest.csv`).
|
| 20 |
+
- `src/`: Contains the Python scripts for the data pipeline and model training.
|
| 21 |
+
- `bronze.py`: Ingests raw data into the bronze layer.
|
| 22 |
+
- `silver.py`: Cleans and transforms data for the silver layer.
|
| 23 |
+
- `gold.py`: Creates aggregated features for the gold (analytics) layer.
|
| 24 |
+
- `train.py`: Trains a `RandomForestClassifier` on the gold data and saves the model.
|
| 25 |
+
- `models/`: Directory where the trained model is saved.
|
| 26 |
+
- `requirements.txt`: Lists the required Python packages.
|
| 27 |
+
|
| 28 |
+
## How to Run
|
| 29 |
+
|
| 30 |
+
1. **Install dependencies:**
|
| 31 |
+
```bash
|
| 32 |
+
pip install -r requirements.txt
```
|
app.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
def status():
    """Return the static readiness message displayed by the Gradio interface."""
    return "Credit Card Fraud Detection Pipeline is ready. Run training using src/train.py"


if __name__ == "__main__":
    # Start the web UI only when this file is executed as a script
    # (``python app.py``), so that importing the module has no side effects.
    gr.Interface(fn=status, inputs=[], outputs="text").launch()
|
bronze.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import duckdb
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
def setup_bronze_layer():
    """
    Build the bronze layer by loading the raw CSV files into DuckDB.

    Opens the DuckDB database at ``../data/fraud_detection.duckdb``, makes
    sure the ``bronze`` schema exists, (re)creates
    ``bronze.bronze_transactions`` from ``fraudTrain.csv``, appends the rows
    of ``fraudTest.csv`` and prints the resulting row count.

    All paths are relative to the current working directory, so the script
    must be started from a directory whose parent contains ``data/``.

    Returns:
        None. The result is the table written to the database; progress is
        reported on stdout.

    Raises:
        Any error raised by DuckDB (missing file, schema mismatch, ...) is
        propagated unchanged; the connection is closed first.
    """
    db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
    train_file = os.path.join('..', 'data', 'fraudTrain.csv')
    test_file = os.path.join('..', 'data', 'fraudTest.csv')

    con = duckdb.connect(database=db_path, read_only=False)
    try:
        # Create schema for raw data
        con.execute("CREATE SCHEMA IF NOT EXISTS bronze;")

        print("Ingesting data into bronze_transactions table...")

        # The table is created from the training file; read_csv_auto infers
        # the column names and types from the CSV itself.
        con.execute(f"""
            CREATE OR REPLACE TABLE bronze.bronze_transactions AS
            SELECT * FROM read_csv_auto('{train_file}');
        """)

        # Append the test file to the same table. INSERT ... SELECT * matches
        # columns by position, so this assumes both CSVs are inferred with the
        # same column layout.
        con.execute(f"""
            INSERT INTO bronze.bronze_transactions
            SELECT * FROM read_csv_auto('{test_file}');
        """)

        print("Data ingestion complete.")

        # Verify the data is loaded
        record_count = con.execute("SELECT COUNT(*) FROM bronze.bronze_transactions;").fetchone()[0]
        print(f"Total records in bronze_transactions: {record_count}")
    finally:
        # Release the database file even when one of the statements fails.
        con.close()


if __name__ == "__main__":
    setup_bronze_layer()
|
gold.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import duckdb
|
| 2 |
+
import os
|
| 3 |
+
from silver import setup_silver_layer
|
| 4 |
+
|
| 5 |
+
def setup_gold_layer():
    """
    Build the gold layer: the silver table plus window-function features.

    Runs ``setup_silver_layer()`` first, then opens the DuckDB database at
    ``../data/fraud_detection.duckdb`` (relative to the current working
    directory), makes sure the ``gold`` schema exists and (re)creates
    ``gold.gold_transactions`` with every silver column plus:

    * ``avg_merch_spend`` - average ``amt`` over all rows of the same merchant
    * ``prev_trans_amt``  - ``amt`` of the card's previous transaction
      (0 when there is none)
    * ``next_trans_amt``  - ``amt`` of the card's next transaction
      (0 when there is none)

    Finally prints the table description and the row count.

    Returns:
        None. The result is the table written to the database.

    Raises:
        Any error raised by DuckDB is propagated unchanged; the connection is
        closed first.
    """
    # Ensure the silver layer exists before proceeding
    setup_silver_layer()

    db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
    con = duckdb.connect(database=db_path, read_only=False)
    try:
        # Create schema for gold data
        con.execute("CREATE SCHEMA IF NOT EXISTS gold;")

        print("Creating aggregated features for the gold layer...")

        # NOTE(review): next_trans_amt is taken from the *following* row of the
        # same card, i.e. it looks ahead in time. Confirm that this value is
        # meant to be available to the model.
        con.execute("""
            CREATE OR REPLACE TABLE gold.gold_transactions AS
            SELECT
                *,
                -- Average transaction amount for the merchant
                AVG(amt) OVER (PARTITION BY merchant) AS avg_merch_spend,
                -- Lag feature: amount of the previous transaction for the card
                LAG(amt, 1, 0) OVER (PARTITION BY cc_num ORDER BY trans_date_time) AS prev_trans_amt,
                -- Lead feature: amount of the next transaction for the card
                LEAD(amt, 1, 0) OVER (PARTITION BY cc_num ORDER BY trans_date_time) AS next_trans_amt
            FROM silver.silver_transactions;
        """)

        print("Gold layer setup complete.")

        # Verify the new columns in the gold table
        print("Columns in gold.gold_transactions:")
        print(con.execute("DESCRIBE gold.gold_transactions;").fetchall())

        record_count = con.execute("SELECT COUNT(*) FROM gold.gold_transactions;").fetchone()[0]
        print(f"Total records in gold_transactions: {record_count}")
    finally:
        # Release the database file even when one of the statements fails.
        con.close()


if __name__ == "__main__":
    # For direct execution, this will now run the full pipeline up to gold
    print("Setting up gold layer (which includes bronze and silver)...")
    setup_gold_layer()
    print("Gold layer setup finished.")
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
duckdb
|
| 2 |
+
pandas
|
| 3 |
+
scikit-learn
|
| 4 |
+
joblib
|
silver.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import duckdb
|
| 2 |
+
import os
|
| 3 |
+
from bronze import setup_bronze_layer
|
| 4 |
+
|
| 5 |
+
def setup_silver_layer():
    """
    Build the silver layer: the bronze table plus derived columns.

    Runs ``setup_bronze_layer()`` first, then opens the DuckDB database at
    ``../data/fraud_detection.duckdb`` (relative to the current working
    directory), makes sure the ``silver`` schema exists and (re)creates
    ``silver.silver_transactions`` with every bronze column plus:

    * ``trans_date_time`` - alias of ``trans_date_trans_time``
    * ``age``             - transaction year minus birth year
    * ``trans_hour``      - hour of day of the transaction

    Finally prints the table description and the row count.

    Returns:
        None. The result is the table written to the database.

    Raises:
        Any error raised by DuckDB is propagated unchanged; the connection is
        closed first.
    """
    # Ensure the bronze layer exists before proceeding
    setup_bronze_layer()

    db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
    con = duckdb.connect(database=db_path, read_only=False)
    try:
        # Create schema for silver data
        con.execute("CREATE SCHEMA IF NOT EXISTS silver;")

        print("Transforming data for the silver layer...")

        # NOTE: `age` is a difference of calendar years, so it is one higher
        # than the cardholder's actual age whenever the birthday has not yet
        # occurred in the year of the transaction.
        con.execute("""
            CREATE OR REPLACE TABLE silver.silver_transactions AS
            SELECT
                *,
                -- The column is already a timestamp, so just alias it
                trans_date_trans_time AS trans_date_time,
                -- Calculate age of the cardholder at the time of transaction
                date_part('year', trans_date_trans_time) - date_part('year', dob) AS age,
                -- Extract hour of day from transaction time
                date_part('hour', trans_date_trans_time) AS trans_hour
            FROM bronze.bronze_transactions;
        """)

        print("Silver layer setup complete.")

        # Verify the new columns in the silver table
        print("Columns in silver.silver_transactions:")
        print(con.execute("DESCRIBE silver.silver_transactions;").fetchall())

        record_count = con.execute("SELECT COUNT(*) FROM silver.silver_transactions;").fetchone()[0]
        print(f"Total records in silver_transactions: {record_count}")
    finally:
        # Release the database file even when one of the statements fails.
        con.close()


if __name__ == "__main__":
    # For direct execution, this will now run the full pipeline up to silver
    print("Setting up silver layer (which includes bronze)...")
    setup_silver_layer()
    print("Silver layer setup finished.")
|
train.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import duckdb
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sklearn.model_selection import train_test_split
|
| 5 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 6 |
+
from sklearn.metrics import classification_report, confusion_matrix
|
| 7 |
+
import joblib
|
| 8 |
+
from gold import setup_gold_layer
|
| 9 |
+
|
| 10 |
+
def train_model():
    """
    Train a RandomForestClassifier on the gold layer and save it to disk.

    Steps:
      1. Run ``setup_gold_layer()`` to (re)build the data pipeline.
      2. Load ``gold.gold_transactions`` into a pandas DataFrame.
      3. Use every column except the listed identifiers, raw timestamps and
         the target ``is_fraud`` as features; one-hot encode ``merchant``,
         ``category``, ``gender`` and ``job``.
      4. Make a stratified 70/30 train/test split, fit a 100-tree random
         forest and print the classification report and confusion matrix
         for the held-out 30%.
      5. Save the model and the list of training columns under ``../models``.

    All paths are relative to the current working directory.

    Returns:
        None. The artefacts are written to ``../models`` and the metrics are
        printed to stdout.

    Raises:
        Errors from DuckDB, pandas, scikit-learn or joblib are propagated
        unchanged; the database connection is closed first.
    """
    # Ensure the full data pipeline has been run
    setup_gold_layer()

    db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
    con = duckdb.connect(database=db_path, read_only=False)
    try:
        print("Loading data from gold.gold_transactions...")
        # Load the entire table into a pandas DataFrame
        df = con.execute("SELECT * FROM gold.gold_transactions").fetchdf()
    finally:
        # Release the database file even when the query fails.
        con.close()

    print("Preparing data for training...")

    # Define features (X) and target (y)
    # Exclude identifiers, raw timestamps, and the target variable itself
    features = [col for col in df.columns if col not in [
        'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'dob',
        'trans_num', 'trans_date_trans_time', 'trans_date_time', 'is_fraud'
    ]]

    X = df[features]
    y = df['is_fraud']

    # One-hot encode categorical features
    categorical_features = ['merchant', 'category', 'gender', 'job']
    X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

    # Keep the encoded column list so that inputs at prediction time can be
    # aligned to the layout the model was trained on.
    train_cols = X.columns

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    print("Training RandomForestClassifier model...")
    # Initialize and train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    print("Evaluating model performance...")
    # Make predictions and evaluate
    y_pred = model.predict(X_test)

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Save the trained model and the column list
    model_path = os.path.join('..', 'models')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(model_path, exist_ok=True)

    joblib.dump(model, os.path.join(model_path, 'fraud_detection_model.joblib'))
    joblib.dump(train_cols, os.path.join(model_path, 'model_columns.joblib'))

    print(f"Model saved to {model_path}")


if __name__ == "__main__":
    # The train_model function now handles the full pipeline run and training
    train_model()
|