File size: 4,522 Bytes
5b9fa06 c393453 5b9fa06 6be796b 77a8bee 5b9fa06 37ebc62 5b9fa06 0964d15 5b9fa06 39c39d5 5b9fa06 60d5f26 bde0b52 37ebc62 bde0b52 fd01bb3 60d5f26 bde0b52 37ebc62 bde0b52 60d5f26 bde0b52 37ebc62 bde0b52 60d5f26 bde0b52 37ebc62 bde0b52 aae5e27 bde0b52 37ebc62 bde0b52 37ebc62 282911e bde0b52 ed95dc6 bde0b52 37ebc62 bde0b52 fd01bb3 ed95dc6 bde0b52 37ebc62 bde0b52 ed95dc6 bde0b52 37ebc62 bde0b52 ed95dc6 bde0b52 37ebc62 bde0b52 ed95dc6 bde0b52 37ebc62 bde0b52 37ebc62 ed95dc6 bde0b52 ed95dc6 60d5f26 ed95dc6 257335f ed95dc6 551710c c393453 ed95dc6 257335f ed95dc6 551710c ed95dc6 551710c c393453 ed95dc6 c393453 ed95dc6 c393453 1c4d1f2 5b9fa06 1c4d1f2 fd01bb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from a .env file, if one exists.
# This call sits at module level, so it executes at import time, ahead of
# the constants defined below.
load_dotenv()
# DIRECTORY PATHS
# Every location is derived from PROJ_ROOT, which is the second ancestor of
# this file's resolved path (parents[0] is the containing directory,
# parents[1] the directory above it).
PROJ_ROOT = Path(__file__).resolve().parents[1]

# Data tree
DATA_DIR = PROJ_ROOT.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
INTERIM_DATA_DIR = DATA_DIR.joinpath("interim")
ISSUE_REPORT_DIR = INTERIM_DATA_DIR.joinpath("issue-report-classification")
SOFT_CLEANED_DATA_DIR = ISSUE_REPORT_DIR.joinpath("soft-cleaned")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
EXTERNAL_DATA_DIR = DATA_DIR.joinpath("external")
SAMPLES_DIR = DATA_DIR.joinpath("samples")
EMBEDDING_DIR = DATA_DIR.joinpath("embeddings")

# Model and report locations
MODELS_DIR = PROJ_ROOT.joinpath("models")
REPORTS_DIR = PROJ_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")

# RANDOM SEED
RANDOM_SEED = 42
# DATASETS CONFIGURATION
# Maps a dataset identifier to the settings describing its CSV file:
#   data_path - path of the CSV file (presumably relative to one of the data
#               directories; confirm against the loading code)
#   label_col - name of the label column
#   title_col - name of the title column, or None when none is configured
#   body_col  - name of the column holding the text body
#   sep       - optional; presumably the CSV field delimiter (only the
#               pySenti4SD entries set it) -- TODO confirm
DATASET_CONFIGs = {
    # --- Training splits ---
    "nasa_cfs_train": {
        "data_path": "nasa/cfs_train.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "nasa_fprime_train": {
        "data_path": "nasa/fprime_train.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
    "nasa_train": {
        "data_path": "nasa/nasa_train_sample.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "text",
    },
    "nlbse23_train": {
        "data_path": "nlbse23/nlbse23-issue-classification-train.csv",
        "label_col": "labels",
        "title_col": "title",
        "body_col": "body",
    },
    "nlbse24_train": {
        "data_path": "nlbse24/issues_train.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "pySenti4SD_train": {
        # NOTE(review): this is the same file as "pySenti4SD_test" below.
        # Confirm the training entry is not meant to point at a different CSV.
        "data_path": "pySenti4SD/test_stackoverflow.csv",
        "label_col": "Polarity",
        "title_col": None,
        "body_col": "Text",
        "sep": ";",
    },
    # --- Test splits ---
    "nasa_cfs_test": {
        "data_path": "nasa/cfs_test.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "nasa_fprime_test": {
        "data_path": "nasa/fprime_test.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
    "nasa_test": {
        "data_path": "nasa/nasa_test_sample.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "text",
    },
    "nlbse23_test": {
        "data_path": "nlbse23/nlbse23-issue-classification-test.csv",
        "label_col": "labels",
        "title_col": "title",
        "body_col": "body",
    },
    "nlbse24_test": {
        "data_path": "nlbse24/issues_test.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "pySenti4SD_test": {
        "data_path": "pySenti4SD/test_stackoverflow.csv",
        "label_col": "Polarity",
        "title_col": None,
        "body_col": "Text",
        "sep": ";",
    },
    # --- Miscellaneous ---
    "test": {
        "data_path": "test/test.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
}

# Upper-case alias matching the naming of the other constants in this module
# (e.g. MODEL_CONFIGS). Both names are bound to the same dict object; the
# original mixed-case name is kept so existing imports continue to work.
DATASET_CONFIGS = DATASET_CONFIGs
# MODELS CONFIGURATION
# Hyperparameter templates. Each model entry below receives its own copy, so
# the "params" dicts of different models are never the same object.
_SETFIT_PARAMS = {
    "batch_size": 16,
    "num_epochs": 1,
    "num_iterations": 20,
    "learning_rate": 2e-5,
}
_TRANSFORMER_PARAMS = {
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 32,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 10,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "warmup_steps": 500,
}

MODEL_CONFIGS = {
    # SetFit Models
    "setfit-minilm": {
        "model_checkpoint": "sentence-transformers/all-MiniLM-L6-v2",
        "params": dict(_SETFIT_PARAMS),
    },
    "setfit-distilroberta": {
        "model_checkpoint": "sentence-transformers/all-distilroberta-v1",
        "params": dict(_SETFIT_PARAMS),
    },
    # Standard Transformers Models
    "modernbert-base": {
        "model_checkpoint": "answerdotai/ModernBERT-base",
        "params": dict(_TRANSFORMER_PARAMS),
    },
    "roberta-base": {
        "model_checkpoint": "roberta-base",
        # Same template with only the epoch count overridden; overriding an
        # existing key keeps its position, so key order is unchanged.
        "params": {**_TRANSFORMER_PARAMS, "num_train_epochs": 15},
    },
}
# --- MLFLOW SETTINGS ---
# Tracking URI and experiment name constants for MLflow.
MLFLOW_TRACKING_URI = "https://dagshub.com/se4ai2526-uniba/Capibara.mlflow"
MLFLOW_EXPERIMENT_NAME = "Baselines_SetFit"
|