Spaces:

donato11
/

Capibara

Sleeping

App Files Files Community

donato11 committed on Nov 14, 2025

Commit

6be796b

1 Parent(s): 9535b59

Changed models hyperparameters for baseline

Browse files

Files changed (2) hide show

syntetic_issue_report_data_generation/config.py +17 -15
syntetic_issue_report_data_generation/modeling/train.py +4 -3

syntetic_issue_report_data_generation/config.py CHANGED Viewed

@@ -4,12 +4,13 @@ from dotenv import load_dotenv
 # Load environment variables from .env file if it exists
 load_dotenv()
-# --- DIRECTORY PATHS ---
 PROJ_ROOT = Path(__file__).resolve().parents[1]
 DATA_DIR = PROJ_ROOT / "data"
 RAW_DATA_DIR = DATA_DIR / "raw"
 INTERIM_DATA_DIR = DATA_DIR / "interim"
 PROCESSED_DATA_DIR = DATA_DIR / "processed"
 EXTERNAL_DATA_DIR = DATA_DIR / "external"
@@ -21,42 +22,42 @@ FIGURES_DIR = REPORTS_DIR / "figures"
 DATASET_CONFIGs = {
     'nasa_cfs_train': {
         'data_path': 'issue-report-classification/nasa/cfs_train.csv',
-        'label_col': 'label',
-        'title_col': 'title',
-        'body_col': 'body'
     },
-    'nasa_fprime_train': {
         'data_path': 'issue-report-classification/nasa/fprime_train.csv',
         'label_col': 'label',
         'title_col': 'title',
         'body_col': 'body'
     },
-    'nasa_train': {
         'data_path': 'issue-report-classification/nasa/nasa_train_sample.csv',
         'label_col': 'label',
         'title_col': None,
         'body_col': 'text'
     },
-    'nlbse23_train': {
         'data_path': 'issue-report-classification/nlbse23/nlbse23-issue-classification-train.csv',
         'label_col': 'labels',
         'title_col': 'title',
         'body_col': 'body'
-    },
     'nlbse24_train': {
         'data_path': 'issue-report-classification/nlbse24/issues_train.csv',
         'label_col': 'label',
         'title_col': 'title',
         'body_col': 'body'
-    },
     'pySenti4SD_train': {
         'data_path': 'pySenti4SD/test_stackoverflow.csv',
         'label_col': 'Polarity',
         'title_col': None,
         'body_col': 'Text',
         'sep': ';'
-    },
-    'nasa_cfs_test': {
         'data_path': 'issue-report-classification/nasa/cfs_test.csv',
         'label_col': 'label',
         'title_col': 'title',
@@ -129,7 +130,7 @@ MODEL_CONFIGS = {
             "per_device_train_batch_size": 16,
             "per_device_eval_batch_size": 32,
             "gradient_accumulation_steps": 4,
-            "num_train_epochs": 3,
             "learning_rate": 2e-5,
             "weight_decay": 0.01,
             "warmup_steps": 500,
@@ -139,8 +140,9 @@ MODEL_CONFIGS = {
         "model_checkpoint": "roberta-base",
         "params": {
             "per_device_train_batch_size": 16,
-            "per_device_eval_batch_size": 16,
-            "num_train_epochs": 3,
             "learning_rate": 2e-5,
             "weight_decay": 0.01,
             "warmup_steps": 500,
@@ -150,4 +152,4 @@ MODEL_CONFIGS = {
 # --- IMPOSTAZIONI MLFLOW ---
 MLFLOW_TRACKING_URI = "https://dagshub.com/se4ai2526-uniba/Capibara.mlflow"
-MLFLOW_EXPERIMENT_NAME = "Tests"

 # Load environment variables from .env file if it exists
 load_dotenv()
+# DIRECTORY PATHS
 PROJ_ROOT = Path(__file__).resolve().parents[1]
 DATA_DIR = PROJ_ROOT / "data"
 RAW_DATA_DIR = DATA_DIR / "raw"
 INTERIM_DATA_DIR = DATA_DIR / "interim"
+SOFT_CLEANED_DATA_DIR = DATA_DIR / "soft_cleaned"
 PROCESSED_DATA_DIR = DATA_DIR / "processed"
 EXTERNAL_DATA_DIR = DATA_DIR / "external"
 DATASET_CONFIGs = {
     'nasa_cfs_train': {
         'data_path': 'issue-report-classification/nasa/cfs_train.csv',
+        'label_col': 'label',
+        'title_col': 'title',
+        'body_col': 'body'
     },
+    'nasa_fprime_train': {
         'data_path': 'issue-report-classification/nasa/fprime_train.csv',
         'label_col': 'label',
         'title_col': 'title',
         'body_col': 'body'
     },
+    'nasa_train': {
         'data_path': 'issue-report-classification/nasa/nasa_train_sample.csv',
         'label_col': 'label',
         'title_col': None,
         'body_col': 'text'
     },
+    'nlbse23_train': {
         'data_path': 'issue-report-classification/nlbse23/nlbse23-issue-classification-train.csv',
         'label_col': 'labels',
         'title_col': 'title',
         'body_col': 'body'
+    },
     'nlbse24_train': {
         'data_path': 'issue-report-classification/nlbse24/issues_train.csv',
         'label_col': 'label',
         'title_col': 'title',
         'body_col': 'body'
+    },
     'pySenti4SD_train': {
         'data_path': 'pySenti4SD/test_stackoverflow.csv',
         'label_col': 'Polarity',
         'title_col': None,
         'body_col': 'Text',
         'sep': ';'
+    },
+    'nasa_cfs_test': {
         'data_path': 'issue-report-classification/nasa/cfs_test.csv',
         'label_col': 'label',
         'title_col': 'title',
             "per_device_train_batch_size": 16,
             "per_device_eval_batch_size": 32,
             "gradient_accumulation_steps": 4,
+            "num_train_epochs": 15,
             "learning_rate": 2e-5,
             "weight_decay": 0.01,
             "warmup_steps": 500,
         "model_checkpoint": "roberta-base",
         "params": {
             "per_device_train_batch_size": 16,
+            "per_device_eval_batch_size": 32,
+            "gradient_accumulation_steps": 4,
+            "num_train_epochs": 15,
             "learning_rate": 2e-5,
             "weight_decay": 0.01,
             "warmup_steps": 500,
 # --- IMPOSTAZIONI MLFLOW ---
 MLFLOW_TRACKING_URI = "https://dagshub.com/se4ai2526-uniba/Capibara.mlflow"
+MLFLOW_EXPERIMENT_NAME = "Baseline_Transformers"

syntetic_issue_report_data_generation/modeling/train.py CHANGED Viewed

@@ -20,7 +20,8 @@ from syntetic_issue_report_data_generation.config import (
     MODEL_CONFIGS,
     MLFLOW_TRACKING_URI,
     MLFLOW_EXPERIMENT_NAME,
-    INTERIM_DATA_DIR
 )
@@ -93,7 +94,7 @@ def load_and_prepare_data(train_config, test_config=None, test_size=0.2, max_tra
     print(f"Loading train data from: {train_config['data_path']}")
     # Get train configuration
-    train_path = INTERIM_DATA_DIR / train_config['data_path']
     train_label_col = train_config['label_col']
     train_title_col = train_config.get('title_col')
     train_body_col = train_config['body_col']
@@ -109,7 +110,7 @@ def load_and_prepare_data(train_config, test_config=None, test_size=0.2, max_tra
     # Handle test data
     if test_config:
         print(f"Loading test data from: {test_config['data_path']}")
-        test_path = INTERIM_DATA_DIR / test_config['data_path']
         test_label_col = test_config['label_col']
         test_title_col = test_config.get('title_col')
         test_body_col = test_config['body_col']

     MODEL_CONFIGS,
     MLFLOW_TRACKING_URI,
     MLFLOW_EXPERIMENT_NAME,
+    INTERIM_DATA_DIR,
+    SOFT_CLEANED_DATA_DIR
 )
     print(f"Loading train data from: {train_config['data_path']}")
     # Get train configuration
+    train_path = SOFT_CLEANED_DATA_DIR / train_config['data_path']
     train_label_col = train_config['label_col']
     train_title_col = train_config.get('title_col')
     train_body_col = train_config['body_col']
     # Handle test data
     if test_config:
         print(f"Loading test data from: {test_config['data_path']}")
+        test_path = SOFT_CLEANED_DATA_DIR / test_config['data_path']
         test_label_col = test_config['label_col']
         test_title_col = test_config.get('title_col')
         test_body_col = test_config['body_col']