File size: 4,522 Bytes
5b9fa06
c393453
5b9fa06
 
 
 
 
6be796b
77a8bee
5b9fa06
 
 
 
37ebc62
 
5b9fa06
 
0964d15
 
5b9fa06
 
 
 
 
 
39c39d5
5b9fa06
60d5f26
bde0b52
37ebc62
bde0b52
fd01bb3
 
60d5f26
bde0b52
37ebc62
bde0b52
 
 
60d5f26
bde0b52
37ebc62
bde0b52
 
 
60d5f26
bde0b52
37ebc62
bde0b52
 
 
aae5e27
bde0b52
37ebc62
bde0b52
37ebc62
 
282911e
bde0b52
 
 
 
 
 
ed95dc6
bde0b52
37ebc62
bde0b52
fd01bb3
 
ed95dc6
bde0b52
37ebc62
bde0b52
 
 
ed95dc6
bde0b52
37ebc62
bde0b52
 
 
ed95dc6
bde0b52
37ebc62
bde0b52
 
 
ed95dc6
bde0b52
37ebc62
bde0b52
37ebc62
 
ed95dc6
bde0b52
 
 
 
 
 
 
 
 
 
 
 
ed95dc6
60d5f26
 
ed95dc6
 
 
257335f
 
ed95dc6
 
 
551710c
 
c393453
ed95dc6
257335f
 
ed95dc6
551710c
ed95dc6
551710c
 
c393453
ed95dc6
 
 
 
 
 
 
 
 
 
 
 
c393453
ed95dc6
 
 
 
 
 
 
 
 
 
 
c393453
1c4d1f2
 
5b9fa06
1c4d1f2
 
fd01bb3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from pathlib import Path

from dotenv import load_dotenv

# Load environment variables from a .env file when one is found (python-dotenv).
# This is an import-time side effect: it runs whenever this module is imported.
load_dotenv()

# PROJECT LAYOUT
# Every location below is an absolute path derived from PROJ_ROOT, which is
# the directory one level above the folder that contains this file.
PROJ_ROOT = Path(__file__).resolve().parents[1]

# Data tree: <root>/data and its immediate children.
DATA_DIR = PROJ_ROOT.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
EXTERNAL_DATA_DIR = DATA_DIR.joinpath("external")
SAMPLES_DIR = DATA_DIR.joinpath("samples")
EMBEDDING_DIR = DATA_DIR.joinpath("embeddings")

# Interim data: <root>/data/interim/issue-report-classification/soft-cleaned.
INTERIM_DATA_DIR = DATA_DIR.joinpath("interim")
ISSUE_REPORT_DIR = INTERIM_DATA_DIR.joinpath("issue-report-classification")
SOFT_CLEANED_DATA_DIR = ISSUE_REPORT_DIR.joinpath("soft-cleaned")

# Model artifacts: <root>/models.
MODELS_DIR = PROJ_ROOT.joinpath("models")

# Reports and the figures nested inside them.
REPORTS_DIR = PROJ_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")

# Fixed seed constant (presumably for reproducible sampling/training; confirm
# against the call sites).
RANDOM_SEED = 42

# DATASETS CONFIGURATION
# Maps a dataset identifier to the settings describing its CSV file:
#   - "data_path": path of the CSV file (presumably relative to one of the data
#     directories; confirm against the loader).
#   - "label_col": name of the column holding the label.
#   - "title_col": name of the title column, or None where none is configured.
#   - "body_col":  name of the column holding the main text.
#   - "sep":       optional; presumably the CSV field delimiter (only set for
#     the pySenti4SD entries); confirm against the loader.
DATASET_CONFIGs = {
    "nasa_cfs_train": {
        "data_path": "nasa/cfs_train.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "nasa_fprime_train": {
        "data_path": "nasa/fprime_train.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
    "nasa_train": {
        "data_path": "nasa/nasa_train_sample.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "text",
    },
    "nlbse23_train": {
        "data_path": "nlbse23/nlbse23-issue-classification-train.csv",
        "label_col": "labels",
        "title_col": "title",
        "body_col": "body",
    },
    "nlbse24_train": {
        "data_path": "nlbse24/issues_train.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    # NOTE(review): this "train" entry points at the same file as
    # "pySenti4SD_test" below (test_stackoverflow.csv). Confirm whether that is
    # intentional; if not, training and evaluation share the same data.
    "pySenti4SD_train": {
        "data_path": "pySenti4SD/test_stackoverflow.csv",
        "label_col": "Polarity",
        "title_col": None,
        "body_col": "Text",
        "sep": ";",
    },
    "nasa_cfs_test": {
        "data_path": "nasa/cfs_test.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "nasa_fprime_test": {
        "data_path": "nasa/fprime_test.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
    "nasa_test": {
        "data_path": "nasa/nasa_test_sample.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "text",
    },
    "nlbse23_test": {
        "data_path": "nlbse23/nlbse23-issue-classification-test.csv",
        "label_col": "labels",
        "title_col": "title",
        "body_col": "body",
    },
    "nlbse24_test": {
        "data_path": "nlbse24/issues_test.csv",
        "label_col": "label",
        "title_col": None,
        "body_col": "issue",
    },
    "pySenti4SD_test": {
        "data_path": "pySenti4SD/test_stackoverflow.csv",
        "label_col": "Polarity",
        "title_col": None,
        "body_col": "Text",
        "sep": ";",
    },
    "test": {
        "data_path": "test/test.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
    },
}

# UPPER_SNAKE_CASE alias, consistent with MODEL_CONFIGS and the other constants
# in this module. Both names refer to the same dict object; the mixed-case
# name is kept so existing imports keep working.
DATASET_CONFIGS = DATASET_CONFIGs

# MODELS CONFIGURATION
def _setfit_params():
    """Return a fresh dict of the hyperparameters shared by the SetFit entries."""
    return {
        "batch_size": 16,
        "num_epochs": 1,
        "num_iterations": 20,
        "learning_rate": 2e-5,
    }


def _transformer_params(num_train_epochs):
    """Return a fresh dict of the hyperparameters for a standard Transformers entry.

    Only the epoch count differs between those entries, so it is the sole
    argument. The key names match Hugging Face ``TrainingArguments`` fields;
    presumably they are forwarded there (confirm at the call site).
    """
    return {
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 32,
        "gradient_accumulation_steps": 4,
        "num_train_epochs": num_train_epochs,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "warmup_steps": 500,
    }


# Maps a model identifier to its checkpoint name and hyperparameters. Each
# entry gets its own "params" dict, so mutating one entry never affects another.
MODEL_CONFIGS = {
    # SetFit models
    "setfit-minilm": {
        "model_checkpoint": "sentence-transformers/all-MiniLM-L6-v2",
        "params": _setfit_params(),
    },
    "setfit-distilroberta": {
        "model_checkpoint": "sentence-transformers/all-distilroberta-v1",
        "params": _setfit_params(),
    },
    # Standard Transformers models
    "modernbert-base": {
        "model_checkpoint": "answerdotai/ModernBERT-base",
        "params": _transformer_params(num_train_epochs=10),
    },
    "roberta-base": {
        "model_checkpoint": "roberta-base",
        "params": _transformer_params(num_train_epochs=15),
    },
}

# --- MLFLOW SETTINGS ---
# Tracking URI constant; the URL points at a DagsHub-hosted ".mlflow" endpoint.
MLFLOW_TRACKING_URI = "https://dagshub.com/se4ai2526-uniba/Capibara.mlflow"
# Experiment name constant (presumably the MLflow experiment that runs are
# logged under; confirm at the call site).
MLFLOW_EXPERIMENT_NAME = "Baselines_SetFit"