Test-Prompt / backend /utils /data_processing.py
abhiman181025's picture
First commit
1314bf5
import pandas as pd
import os
from pathlib import Path
from typing import Dict, List, Tuple, Union, Any
def extract_file_dict(folder_path: List[Union[str, Path]]) -> Dict[str, Path]:
    """
    Build a lookup from bare filename to the uploaded file entry.

    Each entry may be a plain string path, a ``Path``, or any object that
    exposes the path through a ``.name`` attribute (for example a temporary
    file wrapper). The entry itself is stored unchanged as the value.

    Args:
        folder_path: Uploaded file entries (strings, ``Path`` objects, or
            objects with a ``.name`` attribute).

    Returns:
        Dictionary mapping filename (last path component) to the original
        entry. If two entries share a filename, the later one wins.
    """
    file_dict = {}
    for file in folder_path:
        # A plain string has no ``.name``; use it directly as the path.
        raw_path = file if isinstance(file, str) else str(file.name)
        # basename honours the platform's separators instead of only "/".
        filename = os.path.basename(raw_path)
        file_dict[filename] = file
    return file_dict
def validate_data(file_dict: Dict[str, Path]) -> Tuple[Union[bool, str], str]:
    """
    Check that an uploaded file set is usable and report how to process it.

    Args:
        file_dict: Dictionary of filename to path mappings

    Returns:
        Tuple of (status, message) where status is one of:
        - True: images plus exactly one CSV with the required columns
        - False: no images, CSV unreadable, or a required column is missing
        - "no_csv": images present but no CSV file
        - "multiple_csv": images present but more than one CSV file
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')

    # Single pass over the filenames: collect CSVs and note whether any
    # image is present (extension checks are case-insensitive).
    csv_names = []
    has_image = False
    for name in file_dict:
        lowered = name.lower()
        if lowered.endswith('.csv'):
            csv_names.append(name)
        if lowered.endswith(image_suffixes):
            has_image = True

    if not has_image:
        return False, "No image files found in the folder or subfolders"

    # Without exactly one CSV the caller falls back to file-based processing.
    csv_count = len(csv_names)
    if csv_count == 0:
        return "no_csv", "No CSV file found. Will extract data from file paths and names."
    if csv_count > 1:
        return "multiple_csv", "Multiple CSV files found. Will extract data from file paths and names."

    # Required columns, checked in this order, with the message for each.
    required_columns = (
        ('Ground Truth', "CSV file does not contain 'Ground Truth' column"),
        ('Image Name', "CSV file does not contain 'Image Name' column"),
    )
    try:
        columns = pd.read_csv(file_dict[csv_names[0]]).columns
        for column, complaint in required_columns:
            if column not in columns:
                return False, complaint
    except Exception as e:
        # Any read/parse failure is reported as a validation failure.
        return False, f"Error reading CSV file: {str(e)}"

    return True, "Data validation successful"
def extract_binary_output(
    model_output: str,
    ground_truth: str = "",
    all_ground_truths: Union[List[str], None] = None
) -> str:
    """
    Pick the ground-truth keyword that appears in the model's first line.

    Keywords are the unique, stripped, lowercased ground-truth values.
    Matching is a case-insensitive substring search against the first
    non-blank line of the model output. When several keywords match, a
    keyword that is only a fragment of a longer matched keyword is
    discarded (so "category" beats "cat"); among the rest the
    alphabetically first one is returned.

    Args:
        model_output: The model's text response. ``None`` is treated as
            empty text.
        ground_truth: Current item's ground truth. Currently unused; kept
            so existing callers keep working.
        all_ground_truths: All ground truth values from which the unique
            keywords are derived. ``None``, NaN and blank values are ignored.

    Returns:
        The matched keyword (lowercase), or the literal string
        "Enter the output manually" when no keyword is found.
    """
    if all_ground_truths is None:
        all_ground_truths = []

    keyword_set = set()
    for gt in all_ground_truths:
        # Skip missing values. A float NaN (NaN != NaN) would otherwise
        # become the keyword "nan" and match unrelated text.
        if gt is None or (isinstance(gt, float) and gt != gt):
            continue
        keyword = str(gt).strip().lower()
        # An empty keyword is a substring of every line, so it must never
        # be allowed into the candidate set.
        if keyword:
            keyword_set.add(keyword)
    unique_keywords = sorted(keyword_set)

    text = "" if model_output is None else str(model_output)
    # Ignore leading blank lines so the first line with content is used.
    first_line = text.lstrip().split("\n", 1)[0].lower()

    print(f"DEBUG: Unique keywords extracted: {unique_keywords}")
    print(f"DEBUG: Model output: {text[:100]}...")  # First 100 chars

    matches = [keyword for keyword in unique_keywords if keyword in first_line]
    # Drop matches that are merely part of a longer matched keyword.
    specific = [
        keyword
        for keyword in matches
        if not any(keyword != other and keyword in other for other in matches)
    ]
    if specific:
        return specific[0]
    return "Enter the output manually"