donato11 committed on
Commit
6be796b
·
1 Parent(s): 9535b59

Changed model hyperparameters for baseline

Browse files
syntetic_issue_report_data_generation/config.py CHANGED
@@ -4,12 +4,13 @@ from dotenv import load_dotenv
4
  # Load environment variables from .env file if it exists
5
  load_dotenv()
6
 
7
- # --- DIRECTORY PATHS ---
8
  PROJ_ROOT = Path(__file__).resolve().parents[1]
9
 
10
  DATA_DIR = PROJ_ROOT / "data"
11
  RAW_DATA_DIR = DATA_DIR / "raw"
12
  INTERIM_DATA_DIR = DATA_DIR / "interim"
 
13
  PROCESSED_DATA_DIR = DATA_DIR / "processed"
14
  EXTERNAL_DATA_DIR = DATA_DIR / "external"
15
 
@@ -21,42 +22,42 @@ FIGURES_DIR = REPORTS_DIR / "figures"
21
  DATASET_CONFIGs = {
22
  'nasa_cfs_train': {
23
  'data_path': 'issue-report-classification/nasa/cfs_train.csv',
24
- 'label_col': 'label',
25
- 'title_col': 'title',
26
- 'body_col': 'body'
27
  },
28
- 'nasa_fprime_train': {
29
  'data_path': 'issue-report-classification/nasa/fprime_train.csv',
30
  'label_col': 'label',
31
  'title_col': 'title',
32
  'body_col': 'body'
33
  },
34
- 'nasa_train': {
35
  'data_path': 'issue-report-classification/nasa/nasa_train_sample.csv',
36
  'label_col': 'label',
37
  'title_col': None,
38
  'body_col': 'text'
39
  },
40
- 'nlbse23_train': {
41
  'data_path': 'issue-report-classification/nlbse23/nlbse23-issue-classification-train.csv',
42
  'label_col': 'labels',
43
  'title_col': 'title',
44
  'body_col': 'body'
45
- },
46
  'nlbse24_train': {
47
  'data_path': 'issue-report-classification/nlbse24/issues_train.csv',
48
  'label_col': 'label',
49
  'title_col': 'title',
50
  'body_col': 'body'
51
- },
52
  'pySenti4SD_train': {
53
  'data_path': 'pySenti4SD/test_stackoverflow.csv',
54
  'label_col': 'Polarity',
55
  'title_col': None,
56
  'body_col': 'Text',
57
  'sep': ';'
58
- },
59
- 'nasa_cfs_test': {
60
  'data_path': 'issue-report-classification/nasa/cfs_test.csv',
61
  'label_col': 'label',
62
  'title_col': 'title',
@@ -129,7 +130,7 @@ MODEL_CONFIGS = {
129
  "per_device_train_batch_size": 16,
130
  "per_device_eval_batch_size": 32,
131
  "gradient_accumulation_steps": 4,
132
- "num_train_epochs": 3,
133
  "learning_rate": 2e-5,
134
  "weight_decay": 0.01,
135
  "warmup_steps": 500,
@@ -139,8 +140,9 @@ MODEL_CONFIGS = {
139
  "model_checkpoint": "roberta-base",
140
  "params": {
141
  "per_device_train_batch_size": 16,
142
- "per_device_eval_batch_size": 16,
143
- "num_train_epochs": 3,
 
144
  "learning_rate": 2e-5,
145
  "weight_decay": 0.01,
146
  "warmup_steps": 500,
@@ -150,4 +152,4 @@ MODEL_CONFIGS = {
150
 
151
  # --- IMPOSTAZIONI MLFLOW ---
152
  MLFLOW_TRACKING_URI = "https://dagshub.com/se4ai2526-uniba/Capibara.mlflow"
153
- MLFLOW_EXPERIMENT_NAME = "Tests"
 
4
  # Load environment variables from .env file if it exists
5
  load_dotenv()
6
 
7
+ # DIRECTORY PATHS
8
  PROJ_ROOT = Path(__file__).resolve().parents[1]
9
 
10
  DATA_DIR = PROJ_ROOT / "data"
11
  RAW_DATA_DIR = DATA_DIR / "raw"
12
  INTERIM_DATA_DIR = DATA_DIR / "interim"
13
+ SOFT_CLEANED_DATA_DIR = DATA_DIR / "soft_cleaned"
14
  PROCESSED_DATA_DIR = DATA_DIR / "processed"
15
  EXTERNAL_DATA_DIR = DATA_DIR / "external"
16
 
 
22
  DATASET_CONFIGs = {
23
  'nasa_cfs_train': {
24
  'data_path': 'issue-report-classification/nasa/cfs_train.csv',
25
+ 'label_col': 'label',
26
+ 'title_col': 'title',
27
+ 'body_col': 'body'
28
  },
29
+ 'nasa_fprime_train': {
30
  'data_path': 'issue-report-classification/nasa/fprime_train.csv',
31
  'label_col': 'label',
32
  'title_col': 'title',
33
  'body_col': 'body'
34
  },
35
+ 'nasa_train': {
36
  'data_path': 'issue-report-classification/nasa/nasa_train_sample.csv',
37
  'label_col': 'label',
38
  'title_col': None,
39
  'body_col': 'text'
40
  },
41
+ 'nlbse23_train': {
42
  'data_path': 'issue-report-classification/nlbse23/nlbse23-issue-classification-train.csv',
43
  'label_col': 'labels',
44
  'title_col': 'title',
45
  'body_col': 'body'
46
+ },
47
  'nlbse24_train': {
48
  'data_path': 'issue-report-classification/nlbse24/issues_train.csv',
49
  'label_col': 'label',
50
  'title_col': 'title',
51
  'body_col': 'body'
52
+ },
53
  'pySenti4SD_train': {
54
  'data_path': 'pySenti4SD/test_stackoverflow.csv',
55
  'label_col': 'Polarity',
56
  'title_col': None,
57
  'body_col': 'Text',
58
  'sep': ';'
59
+ },
60
+ 'nasa_cfs_test': {
61
  'data_path': 'issue-report-classification/nasa/cfs_test.csv',
62
  'label_col': 'label',
63
  'title_col': 'title',
 
130
  "per_device_train_batch_size": 16,
131
  "per_device_eval_batch_size": 32,
132
  "gradient_accumulation_steps": 4,
133
+ "num_train_epochs": 15,
134
  "learning_rate": 2e-5,
135
  "weight_decay": 0.01,
136
  "warmup_steps": 500,
 
140
  "model_checkpoint": "roberta-base",
141
  "params": {
142
  "per_device_train_batch_size": 16,
143
+ "per_device_eval_batch_size": 32,
144
+ "gradient_accumulation_steps": 4,
145
+ "num_train_epochs": 15,
146
  "learning_rate": 2e-5,
147
  "weight_decay": 0.01,
148
  "warmup_steps": 500,
 
152
 
153
  # --- IMPOSTAZIONI MLFLOW ---
154
  MLFLOW_TRACKING_URI = "https://dagshub.com/se4ai2526-uniba/Capibara.mlflow"
155
+ MLFLOW_EXPERIMENT_NAME = "Baseline_Transformers"
syntetic_issue_report_data_generation/modeling/train.py CHANGED
@@ -20,7 +20,8 @@ from syntetic_issue_report_data_generation.config import (
20
  MODEL_CONFIGS,
21
  MLFLOW_TRACKING_URI,
22
  MLFLOW_EXPERIMENT_NAME,
23
- INTERIM_DATA_DIR
 
24
  )
25
 
26
 
@@ -93,7 +94,7 @@ def load_and_prepare_data(train_config, test_config=None, test_size=0.2, max_tra
93
  print(f"Loading train data from: {train_config['data_path']}")
94
 
95
  # Get train configuration
96
- train_path = INTERIM_DATA_DIR / train_config['data_path']
97
  train_label_col = train_config['label_col']
98
  train_title_col = train_config.get('title_col')
99
  train_body_col = train_config['body_col']
@@ -109,7 +110,7 @@ def load_and_prepare_data(train_config, test_config=None, test_size=0.2, max_tra
109
  # Handle test data
110
  if test_config:
111
  print(f"Loading test data from: {test_config['data_path']}")
112
- test_path = INTERIM_DATA_DIR / test_config['data_path']
113
  test_label_col = test_config['label_col']
114
  test_title_col = test_config.get('title_col')
115
  test_body_col = test_config['body_col']
 
20
  MODEL_CONFIGS,
21
  MLFLOW_TRACKING_URI,
22
  MLFLOW_EXPERIMENT_NAME,
23
+ INTERIM_DATA_DIR,
24
+ SOFT_CLEANED_DATA_DIR
25
  )
26
 
27
 
 
94
  print(f"Loading train data from: {train_config['data_path']}")
95
 
96
  # Get train configuration
97
+ train_path = SOFT_CLEANED_DATA_DIR / train_config['data_path']
98
  train_label_col = train_config['label_col']
99
  train_title_col = train_config.get('title_col')
100
  train_body_col = train_config['body_col']
 
110
  # Handle test data
111
  if test_config:
112
  print(f"Loading test data from: {test_config['data_path']}")
113
+ test_path = SOFT_CLEANED_DATA_DIR / test_config['data_path']
114
  test_label_col = test_config['label_col']
115
  test_title_col = test_config.get('title_col')
116
  test_body_col = test_config['body_col']