| | import pandas as pd |
| | import io |
| | import re |
| | import yaml |
| | from typing import List, Optional |
| | from fastapi import FastAPI, File, UploadFile, HTTPException, Query |
| | from fastapi.responses import JSONResponse |
| | import uvicorn |
| | from fastapi.middleware.cors import CORSMiddleware |
| |
|
app = FastAPI()

# Column normalization/typing rules, loaded once at import time.
# Explicit UTF-8 avoids platform-dependent default encodings (e.g. cp1252
# on Windows) mangling accented synonyms in the YAML file.
with open("column_config.yaml", encoding="utf-8") as f:
    COLUMN_CONFIG = yaml.safe_load(f)
| |
|
| | |
def detect_column_type(dtype):
    """Map a pandas dtype onto a coarse UI column type.

    Returns "datetime" for any datetime64 variant, "number" for numeric
    dtypes, and "text" for everything else.
    """
    checks = (
        (pd.api.types.is_datetime64_any_dtype, "datetime"),
        (pd.api.types.is_numeric_dtype, "number"),
    )
    for predicate, label in checks:
        if predicate(dtype):
            return label
    return "text"
| |
|
| | |
def normalize_column_names(column_names: List[str], config: Optional[dict] = None) -> List[str]:
    """Map raw header names onto canonical config column names.

    Each raw name is sanitized (runs of non-word characters -> "_",
    lowercased, outer underscores stripped) and looked up against the
    sanitized canonical name and synonyms of every configured column.
    A match yields the canonical config key; otherwise the sanitized
    name itself is kept.

    Args:
        column_names: Raw header names as read from the uploaded file.
        config: Optional config mapping with the same shape as
            COLUMN_CONFIG. Defaults to the module-level COLUMN_CONFIG.

    Returns:
        Normalized names, same order and length as the input.
    """
    cfg = COLUMN_CONFIG if config is None else config

    def sanitize(name: str) -> str:
        return re.sub(r'[\W]+', '_', name.strip()).lower().strip('_')

    # Build the synonym -> canonical lookup once instead of re-sanitizing
    # every synonym for every input column (was O(columns * synonyms)).
    # setdefault preserves the original first-match-wins semantics when the
    # same sanitized synonym appears under two canonical columns.
    synonym_map = {}
    for canonical, col_cfg in cfg['columns'].items():
        for candidate in [canonical] + col_cfg.get('synonyms', []):
            synonym_map.setdefault(sanitize(candidate), canonical)

    return [synonym_map.get(sanitize(raw), sanitize(raw)) for raw in column_names]
| |
|
| | |
def _coerce_types(df: pd.DataFrame) -> None:
    """Coerce each configured column to its declared type, in one pass.

    The original pipeline ran three overlapping coercion passes; the net
    effect was a single coercion per column, except that the config's
    datetime 'format' was never actually applied to the raw strings
    (the first pass had already parsed them). Here the format is honored.
    """
    for col in df.columns:
        spec = COLUMN_CONFIG['columns'].get(col)
        if spec is None:
            continue
        col_type = spec.get('type', 'text')
        if col_type == 'datetime':
            # Unparseable values become NaT; 'format' may be None (flexible parse).
            df[col] = pd.to_datetime(df[col], errors='coerce', format=spec.get('format'))
        elif col_type == 'numeric':
            df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)
        elif col_type == 'categorical':
            # Values outside the allowed set become missing.
            allowed = spec.get('allowed', [])
            df[col] = df[col].where(df[col].isin(allowed))
        elif col_type == 'text':
            # NOTE(review): assumes text columns hold strings/NaN — a purely
            # numeric column declared 'text' raises on .str, same as before.
            df[col] = df[col].str.strip().str.lower()


def _clip_numeric_outliers(df: pd.DataFrame) -> None:
    """Clip configured numeric columns to Tukey fences (Q1/Q3 ± 1.5·IQR)."""
    for col in df.columns:
        spec = COLUMN_CONFIG['columns'].get(col)
        if spec is not None and spec.get('type') == 'numeric':
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            df[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize headers, coerce types, clip outliers and dedupe.

    Steps, driven by COLUMN_CONFIG['columns']:
      1. Rename headers to canonical config names.
      2. Coerce each configured column to its declared type.
      3. Clip numeric outliers to the 1.5*IQR fences.
      4. Drop duplicate rows.

    Note: mutates the frame it is given; callers that need the raw frame
    must pass a copy. Missing values remain NaN/NaT (pd.NA, if present,
    is mapped to None as before).
    """
    df.columns = normalize_column_names(df.columns)
    _coerce_types(df)
    _clip_numeric_outliers(df)
    df.drop_duplicates(inplace=True)
    return df.replace({pd.NA: None})
| |
|
| | |
def process_file(file: UploadFile, sheet_name: Optional[str] = None):
    """Parse an uploaded CSV/XLSX into a DataFrame and clean it.

    Args:
        file: The uploaded file; the extension decides the parser.
        sheet_name: Optional Excel sheet name; defaults to the first sheet.

    Returns:
        (raw_df, cleaned_df) — the frame as parsed and the cleaned version.
        The raw frame is copied before cleaning because clean_data mutates
        its argument in place (previously both tuple members were the same
        mutated object).

    Raises:
        HTTPException: 400 for unsupported extensions, 500 for parse errors.
    """
    try:
        content = file.file.read()
        # lower() so "DATA.CSV" is accepted; guard against a missing filename.
        extension = (file.filename or '').rsplit('.', 1)[-1].lower()
        if extension == 'csv':
            df = pd.read_csv(io.BytesIO(content))
        elif extension == 'xlsx':
            df = pd.read_excel(io.BytesIO(content),
                               sheet_name=sheet_name if sheet_name is not None else 0)
        else:
            raise HTTPException(400, "Formato de arquivo não suportado")
        return df, clean_data(df.copy())
    except HTTPException:
        # Don't collapse the deliberate 400 above into a generic 500.
        raise
    except Exception as e:
        raise HTTPException(500, f"Erro ao processar o arquivo: {str(e)}")
| |
|
| | |
def _json_safe(val):
    """Convert a pandas cell value into a JSON-serializable scalar.

    Timestamps become ISO-8601 strings, numpy scalars become native Python
    numbers, and NaN/NaT/NA become None — JSONResponse cannot encode the
    raw pandas/numpy objects directly.
    """
    if pd.isna(val):
        return None
    if isinstance(val, pd.Timestamp):
        return val.isoformat()
    if hasattr(val, 'item'):  # numpy scalar -> native Python type
        return val.item()
    return val


@app.post("/process-file")
async def process_file_endpoint(file: UploadFile = File(...), sheet_name: Optional[str] = Query(None)):
    """Clean an uploaded CSV/XLSX and return column metadata plus row cells."""
    try:
        _, df = process_file(file, sheet_name)

        columns = [
            {"name": col, "type": detect_column_type(df[col].dtype)}
            for col in df.columns
        ]

        rows = []
        for idx, row in df.iterrows():
            cells = {}
            for col, val in row.items():
                safe = _json_safe(val)
                cells[col] = {
                    "value": safe,
                    "displayValue": "" if safe is None else str(safe),
                    "columnId": col,
                }
            rows.append({"id": str(idx), "cells": cells})

        return JSONResponse(
            content={
                "data": {
                    "columns": columns,
                    "rows": rows,
                },
                "metadata": {
                    "totalRows": len(df),
                    "processedAt": pd.Timestamp.now().isoformat(),
                },
            })
    except HTTPException:
        # Preserve status codes raised downstream (e.g. process_file's 400)
        # instead of rewrapping everything as a 500.
        raise
    except Exception as e:
        raise HTTPException(500, f"Erro: {str(e)}")
| |
|
| | |
# CORS is wide open for development.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# suspect — the CORS spec forbids credentialed requests against a literal
# wildcard origin. Confirm against Starlette's CORSMiddleware documentation
# and pin explicit origins before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
| |
|
# Dev entry point: serve on all interfaces with auto-reload enabled.
# NOTE(review): the import string "app:app" assumes this file is named
# app.py — verify it matches the actual module name.
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)