from pathlib import Path
import sys
import tempfile
from unittest.mock import patch

from datasets import Dataset
import numpy as np
import pandas as pd
import pytest

sys.path.insert(0, str(Path(__file__).parent.parent))

from syntetic_issue_report_data_generation.config import MODEL_CONFIGS, DATASET_CONFIGs
from syntetic_issue_report_data_generation.modeling.train import (
    init_parser,
    load_and_prepare_data,
    train_model_setfit,
    train_model_transformers,
)

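
# The fixtures below write small CSV datasets into a temporary directory and
# define minimal model configurations so the data-loading and training code
# paths can be exercised quickly. The training tests are marked with
# @pytest.mark.slow because they download and fine-tune tiny checkpoints;
# assuming the "slow" marker is registered in the project's pytest
# configuration, they can be deselected locally with `pytest -m "not slow"`.
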
@pytest.fixture
def temp_data_dir():
    """Create a temporary directory for test data files."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        yield Path(tmpdirname)


@pytest.fixture
def sample_train_data():
    """Create sample training data with balanced classes."""
    return pd.DataFrame(
        {
            "title": [
                "Bug in login",
                "Feature request",
                "Performance issue",
                "UI problem",
                "Bug in logout",
                "Add search",
                "Memory leak",
                "New API endpoint",
                "Crash on startup",
                "Enhancement needed",
            ],
            "body": [
                "Cannot login to system",
                "Add dark mode feature",
                "Slow loading times",
                "Button misaligned",
                "Cannot logout properly",
                "Need search functionality",
                "High memory usage",
                "REST API needed",
                "Application crashes",
                "Improve user experience",
            ],
            "label": [
                "bug",
                "enhancement",
                "bug",
                "bug",
                "bug",
                "enhancement",
                "bug",
                "enhancement",
                "bug",
                "enhancement",
            ],
        }
    )

@pytest.fixture
def sample_imbalanced_data():
    """Create sample data with imbalanced classes (for stratified sampling test)."""
    return pd.DataFrame(
        {
            "title": [
                "Bug 1",
                "Bug 2",
                "Bug 3",
                "Bug 4",
                "Bug 5",
                "Bug 6",
                "Bug 7",
                "Bug 8",
                "Enhancement 1",
                "Enhancement 2",
            ],
            "body": [
                "Bug body 1",
                "Bug body 2",
                "Bug body 3",
                "Bug body 4",
                "Bug body 5",
                "Bug body 6",
                "Bug body 7",
                "Bug body 8",
                "Enhancement body 1",
                "Enhancement body 2",
            ],
            "label": [
                "bug",
                "bug",
                "bug",
                "bug",
                "bug",
                "bug",
                "bug",
                "bug",
                "enhancement",
                "enhancement",
            ],
        }
    )

@pytest.fixture
def train_config_with_title(temp_data_dir, sample_train_data):
    """Create train config with title and body columns."""
    train_path = temp_data_dir / "train_with_title.csv"
    sample_train_data.to_csv(train_path, index=False)

    return {
        "data_path": "train_with_title.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
        "sep": ",",
    }


@pytest.fixture
def imbalanced_train_config(temp_data_dir, sample_imbalanced_data):
    """Create train config with imbalanced data."""
    train_path = temp_data_dir / "train_imbalanced.csv"
    sample_imbalanced_data.to_csv(train_path, index=False)

    return {
        "data_path": "train_imbalanced.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
        "sep": ",",
    }

@pytest.fixture
def minimal_train_data():
    """Create minimal training data for quick training tests."""
    return pd.DataFrame(
        {
            "title": [
                "Bug 1",
                "Bug 2",
                "Enhancement 1",
                "Enhancement 2",
                "Bug 3",
                "Enhancement 3",
            ],
            "body": [
                "Bug body 1",
                "Bug body 2",
                "Enh body 1",
                "Enh body 2",
                "Bug body 3",
                "Enh body 3",
            ],
            "label": ["bug", "bug", "enhancement", "enhancement", "bug", "enhancement"],
        }
    )


@pytest.fixture
def minimal_train_config(temp_data_dir, minimal_train_data):
    """Create train config with minimal data for fast training."""
    train_path = temp_data_dir / "minimal_train.csv"
    minimal_train_data.to_csv(train_path, index=False)

    return {
        "data_path": "minimal_train.csv",
        "label_col": "label",
        "title_col": "title",
        "body_col": "body",
        "sep": ",",
    }

@pytest.fixture
def minimal_model_config_setfit():
    """Create minimal SetFit model configuration for testing."""
    return {
        "model_checkpoint": "sentence-transformers/paraphrase-MiniLM-L3-v2",
        "params": {"num_epochs": 1, "batch_size": 4, "num_iterations": 5, "max_length": 64},
    }


@pytest.fixture
def minimal_model_config_transformers():
    """Create minimal Transformers model configuration for testing."""
    return {
        "model_checkpoint": "prajjwal1/bert-tiny",
        "params": {
            "num_train_epochs": 1,
            "per_device_train_batch_size": 2,
            "per_device_eval_batch_size": 2,
            "learning_rate": 5e-5,
            "warmup_steps": 0,
            "weight_decay": 0.01,
            "logging_steps": 1,
        },
    }

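
# Return contract relied on by the training tests below (as asserted in
# TestTrainingPipeline and TestOutputValidation):
#   train_model_setfit(config, train_ds, test_ds)
#       -> (model, metrics, y_true, y_pred, "setfit")
#   train_model_transformers(config, train_ds, test_ds)
#       -> ((model, tokenizer), metrics, y_true, y_pred, "transformers")
# where `metrics` is a dict of evaluation scores and y_true / y_pred are
# integer-encoded labels over the test split.
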
class TestDataLoadingAndPreparation:
    """Test class for data loading and preparation functionality."""

    def test_load_data_with_valid_config(self, train_config_with_title, temp_data_dir):
        """Verify that data loads correctly with valid train dataset configuration."""
        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                train_config_with_title, test_config=None, test_size=0.2
            )

            # Both splits are returned as Hugging Face Datasets.
            assert isinstance(train_ds, Dataset)
            assert isinstance(test_ds, Dataset)

            # Only the combined text column and the encoded label remain.
            assert set(train_ds.column_names) == {"text", "label"}
            assert set(test_ds.column_names) == {"text", "label"}

            assert len(train_ds) > 0
            assert len(test_ds) > 0

            # No rows are lost during the split.
            assert len(train_ds) + len(test_ds) == 10

            # The fitted label encoder is attached to both splits.
            assert hasattr(train_ds, "label_encoder")
            assert hasattr(test_ds, "label_encoder")

            assert all(isinstance(label, int) for label in train_ds["label"])
            assert all(isinstance(label, int) for label in test_ds["label"])

    def test_load_data_creates_holdout_split(self, train_config_with_title, temp_data_dir):
        """Verify holdout split is created when no test dataset is provided."""
        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                train_config_with_title, test_config=None, test_size=0.2
            )

            assert eval_strategy == "holdout"

            assert len(train_ds) > 0
            assert len(test_ds) > 0

            # A 0.2 split of the 10-row fixture yields 8 train / 2 test rows.
            total_samples = len(train_ds) + len(test_ds)
            assert total_samples == 10
            assert len(test_ds) == 2
            assert len(train_ds) == 8

            # The two splits must not share any examples.
            train_texts = set(train_ds["text"])
            test_texts = set(test_ds["text"])
            assert len(train_texts.intersection(test_texts)) == 0

    def test_label_encoding_consistency(self, train_config_with_title, temp_data_dir):
        """Verify labels are encoded consistently across train/test sets."""
        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                train_config_with_title, test_config=None, test_size=0.2
            )

            # Both splits share the same encoder instance.
            assert train_ds.label_encoder is test_ds.label_encoder

            assert all(isinstance(label, int) for label in train_ds["label"])
            assert all(isinstance(label, int) for label in test_ds["label"])

            train_label_classes = train_ds.label_encoder.classes_
            test_label_classes = test_ds.label_encoder.classes_
            assert list(train_label_classes) == list(test_label_classes)

            expected_classes = sorted(["bug", "enhancement"])
            actual_classes = sorted(train_ds.label_encoder.classes_)
            assert actual_classes == expected_classes

            # Encoded labels fall inside the [0, num_classes) range.
            num_classes = len(train_ds.label_encoder.classes_)
            assert num_classes == 2
            assert all(0 <= label < num_classes for label in train_ds["label"])
            assert all(0 <= label < num_classes for label in test_ds["label"])

            train_unique_labels = set(train_ds["label"])
            assert len(train_unique_labels) == 2

    def test_text_column_creation_with_title_and_body(
        self, train_config_with_title, temp_data_dir
    ):
        """Verify text column combines title and body correctly."""
        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                train_config_with_title, test_config=None, test_size=0.2
            )

            # Title and body are merged into a single text column.
            assert "text" in train_ds.column_names
            assert "text" in test_ds.column_names
            assert "title" not in train_ds.column_names
            assert "body" not in train_ds.column_names

            assert all(isinstance(text, str) and len(text) > 0 for text in train_ds["text"])
            assert all(isinstance(text, str) and len(text) > 0 for text in test_ds["text"])

            assert all(len(text.strip()) > 0 for text in train_ds["text"])
            assert all(len(text.strip()) > 0 for text in test_ds["text"])

            # Combined title + body should have a reasonable minimum length.
            for text in train_ds["text"]:
                assert len(text) >= 10

    def test_max_train_samples_stratified_sampling(self, imbalanced_train_config, temp_data_dir):
        """Verify stratified sampling works correctly when max_train_samples is specified."""
        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                imbalanced_train_config, test_config=None, test_size=0.2, max_train_samples=4
            )

            # The train split is capped at max_train_samples.
            assert len(train_ds) == 4

            unique_labels = set(train_ds["label"])
            assert len(unique_labels) == 2, "Stratified sampling should preserve both classes"

            # Count samples per original class name after subsampling.
            label_counts = {}
            for label in train_ds["label"]:
                label_name = train_ds.label_encoder.inverse_transform([label])[0]
                label_counts[label_name] = label_counts.get(label_name, 0) + 1

            # The minority class survives, and the majority class keeps at least as many samples.
            assert label_counts.get("enhancement", 0) >= 1
            assert label_counts.get("bug", 0) >= label_counts.get("enhancement", 0)

            assert len(test_ds) == 2

            assert len(train_ds) + len(test_ds) < 10

class TestConfiguration:
    """Test class for configuration and argument parsing functionality."""

    def test_parser_accepts_valid_arguments(self):
        """Verify parser accepts all valid combinations of arguments."""
        parser = init_parser()

        valid_dataset = list(DATASET_CONFIGs.keys())[0]
        valid_model = list(MODEL_CONFIGS.keys())[0]

        # Required arguments only: optional arguments fall back to their defaults.
        args = parser.parse_args(["--train-dataset", valid_dataset, "--model-name", valid_model])
        assert args.train_dataset == valid_dataset
        assert args.model_name == valid_model
        assert args.test_dataset is None
        assert args.test_size == 0.2
        assert args.max_train_samples is None
        assert args.use_setfit is False
        assert args.run_name is None

        # Use a second dataset for --test-dataset when one is available.
        if len(DATASET_CONFIGs.keys()) > 1:
            valid_test_dataset = list(DATASET_CONFIGs.keys())[1]
        else:
            valid_test_dataset = valid_dataset

        # All arguments provided at once.
        args = parser.parse_args(
            [
                "--train-dataset",
                valid_dataset,
                "--test-dataset",
                valid_test_dataset,
                "--model-name",
                valid_model,
                "--use-setfit",
                "--test-size",
                "0.3",
                "--max-train-samples",
                "100",
                "--run-name",
                "test_run",
            ]
        )
        assert args.train_dataset == valid_dataset
        assert args.test_dataset == valid_test_dataset
        assert args.model_name == valid_model
        assert args.use_setfit is True
        assert args.test_size == 0.3
        assert args.max_train_samples == 100
        assert args.run_name == "test_run"

        args = parser.parse_args(
            ["--train-dataset", valid_dataset, "--model-name", valid_model, "--use-setfit"]
        )
        assert args.use_setfit is True

        args = parser.parse_args(
            ["--train-dataset", valid_dataset, "--model-name", valid_model, "--test-size", "0.15"]
        )
        assert args.test_size == 0.15

        args = parser.parse_args(
            [
                "--train-dataset",
                valid_dataset,
                "--model-name",
                valid_model,
                "--max-train-samples",
                "500",
            ]
        )
        assert args.max_train_samples == 500

    def test_parser_rejects_invalid_dataset_names(self):
        """Verify parser rejects invalid dataset/model names and missing required arguments."""
        parser = init_parser()

        valid_model = list(MODEL_CONFIGS.keys())[0]

        # Unknown train dataset.
        with pytest.raises(SystemExit):
            parser.parse_args(
                ["--train-dataset", "invalid_dataset_name", "--model-name", valid_model]
            )

        # Unknown test dataset.
        valid_dataset = list(DATASET_CONFIGs.keys())[0]
        with pytest.raises(SystemExit):
            parser.parse_args(
                [
                    "--train-dataset",
                    valid_dataset,
                    "--test-dataset",
                    "invalid_test_dataset",
                    "--model-name",
                    valid_model,
                ]
            )

        # Unknown model name.
        with pytest.raises(SystemExit):
            parser.parse_args(
                ["--train-dataset", valid_dataset, "--model-name", "invalid_model_name"]
            )

        # Missing required --train-dataset.
        with pytest.raises(SystemExit):
            parser.parse_args(["--model-name", valid_model])

        # Missing required --model-name.
        with pytest.raises(SystemExit):
            parser.parse_args(["--train-dataset", valid_dataset])

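
# Illustrative CLI usage mirroring the flags exercised above (the exact entry
# point is an assumption; the flags come from init_parser):
#   python -m syntetic_issue_report_data_generation.modeling.train \
#       --train-dataset <dataset> --model-name <model> \
#       [--test-dataset <dataset>] [--use-setfit] [--test-size 0.2] \
#       [--max-train-samples 100] [--run-name <run>]
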
class TestTrainingPipeline:
    """Test class for the model training pipeline."""

    @pytest.mark.slow
    def test_setfit_training_completes(
        self, minimal_train_config, minimal_model_config_setfit, temp_data_dir
    ):
        """Verify SetFit training runs without errors (using minimal data)."""
        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                minimal_train_config,
                test_config=None,
                test_size=0.33,
            )

            # Mock mlflow so training does not log to a real tracking server.
            with patch("syntetic_issue_report_data_generation.modeling.train.mlflow"):
                result = train_model_setfit(minimal_model_config_setfit, train_ds, test_ds)

            assert result is not None

            model, metrics, y_true, y_pred, model_type = result

            assert model_type == "setfit"

            assert model is not None

            assert isinstance(metrics, dict)
            assert "accuracy" in metrics
            assert "f1_macro" in metrics
            assert "f1_weighted" in metrics

            # Metric values are valid proportions.
            assert 0 <= metrics["accuracy"] <= 1
            assert 0 <= metrics["f1_macro"] <= 1
            assert 0 <= metrics["f1_weighted"] <= 1

            assert y_true is not None
            assert y_pred is not None

            assert len(y_true) == len(test_ds)
            assert len(y_pred) == len(test_ds)

            # Predictions and labels stay within the encoded label range.
            num_classes = len(train_ds.label_encoder.classes_)
            assert all(0 <= pred < num_classes for pred in y_pred)
            assert all(0 <= true < num_classes for true in y_true)

    @pytest.mark.slow
    def test_transformers_training_completes(
        self, minimal_train_config, minimal_model_config_transformers, temp_data_dir
    ):
        """Verify Transformers training runs without errors (using minimal data)."""
        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                minimal_train_config,
                test_config=None,
                test_size=0.33,
            )

            # Mock mlflow so training does not log to a real tracking server.
            with patch("syntetic_issue_report_data_generation.modeling.train.mlflow"):
                result = train_model_transformers(
                    minimal_model_config_transformers, train_ds, test_ds
                )

            assert result is not None

            model_tuple, metrics, y_true, y_pred, model_type = result

            assert model_type == "transformers"

            # The transformers path returns (model, tokenizer) as the first element.
            assert model_tuple is not None
            assert isinstance(model_tuple, tuple)
            assert len(model_tuple) == 2
            model, tokenizer = model_tuple
            assert model is not None
            assert tokenizer is not None

            assert isinstance(metrics, dict)
            # Metric keys may be prefixed (e.g. "eval_accuracy"), so match by substring.
            assert any("accuracy" in key for key in metrics.keys())

            accuracy_key = [k for k in metrics.keys() if "accuracy" in k][0]
            accuracy = metrics[accuracy_key]
            assert 0 <= accuracy <= 1

            assert y_true is not None
            assert y_pred is not None

            assert len(y_true) == len(test_ds)
            assert len(y_pred) == len(test_ds)

            num_classes = len(train_ds.label_encoder.classes_)
            assert all(0 <= pred < num_classes for pred in y_pred)
            assert all(0 <= true < num_classes for true in y_true)

            assert all(isinstance(pred, (int, np.integer)) for pred in y_pred)
            assert all(isinstance(true, (int, np.integer)) for true in y_true)

class TestErrorHandling:
    """Test class for error handling functionality."""

    def test_missing_train_file_raises_error(self, temp_data_dir):
        """Verify appropriate error when train file doesn't exist."""
        missing_file_config = {
            "data_path": "non_existent_file.csv",
            "label_col": "label",
            "title_col": "title",
            "body_col": "body",
            "sep": ",",
        }

        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            with pytest.raises(SystemExit) as excinfo:
                load_and_prepare_data(missing_file_config, test_config=None, test_size=0.2)

            assert excinfo.value.code == 1

    def test_invalid_label_column(self, temp_data_dir):
        """Verify error handling when specified label column doesn't exist."""
        # The CSV uses "type" instead of the configured "label" column.
        sample_data = pd.DataFrame(
            {
                "title": ["Bug 1", "Enhancement 1"],
                "body": ["Bug body", "Enhancement body"],
                "type": ["bug", "enhancement"],
            }
        )

        train_path = temp_data_dir / "invalid_label_col.csv"
        sample_data.to_csv(train_path, index=False)

        invalid_label_config = {
            "data_path": "invalid_label_col.csv",
            "label_col": "label",
            "title_col": "title",
            "body_col": "body",
            "sep": ",",
        }

        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            with pytest.raises(SystemExit) as excinfo:
                load_and_prepare_data(invalid_label_config, test_config=None, test_size=0.2)

            assert excinfo.value.code == 1

    def test_invalid_body_column(self, temp_data_dir):
        """Verify error handling when specified body column doesn't exist."""
        # The CSV uses "description" instead of the configured "body" column.
        sample_data = pd.DataFrame(
            {
                "title": ["Bug 1", "Enhancement 1"],
                "description": ["Bug body", "Enhancement body"],
                "label": ["bug", "enhancement"],
            }
        )

        train_path = temp_data_dir / "invalid_body_col.csv"
        sample_data.to_csv(train_path, index=False)

        invalid_body_config = {
            "data_path": "invalid_body_col.csv",
            "label_col": "label",
            "title_col": "title",
            "body_col": "body",
            "sep": ",",
        }

        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            with pytest.raises(SystemExit) as excinfo:
                load_and_prepare_data(invalid_body_config, test_config=None, test_size=0.2)

            assert excinfo.value.code == 1

class TestEdgeCases:
    """Test class for edge case scenarios."""

    def test_very_small_dataset(self, temp_data_dir):
        """Verify data preparation with very small datasets (< 10 samples)."""
        very_small_data = pd.DataFrame(
            {
                "title": ["Bug 1", "Bug 2", "Bug 3", "Enh 1", "Enh 2", "Enh 3"],
                "body": [
                    "Small bug 1",
                    "Small bug 2",
                    "Small bug 3",
                    "Small enh 1",
                    "Small enh 2",
                    "Small enh 3",
                ],
                "label": ["bug", "bug", "bug", "enhancement", "enhancement", "enhancement"],
            }
        )

        train_path = temp_data_dir / "very_small.csv"
        very_small_data.to_csv(train_path, index=False)

        small_config = {
            "data_path": "very_small.csv",
            "label_col": "label",
            "title_col": "title",
            "body_col": "body",
            "sep": ",",
        }

        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                small_config,
                test_config=None,
                test_size=0.33,
            )

            assert isinstance(train_ds, Dataset)
            assert isinstance(test_ds, Dataset)

            assert len(train_ds) > 0
            assert len(test_ds) > 0

            assert len(train_ds) + len(test_ds) == 6

            # With only six rows the train split may not contain every class.
            train_unique_labels = set(train_ds["label"])
            assert len(train_unique_labels) >= 1

            num_classes = len(train_ds.label_encoder.classes_)
            assert num_classes == 2
            assert all(0 <= label < num_classes for label in train_ds["label"])
            assert all(0 <= label < num_classes for label in test_ds["label"])

            assert all(isinstance(text, str) and len(text) > 0 for text in train_ds["text"])
            assert all(isinstance(text, str) and len(text) > 0 for text in test_ds["text"])

class TestOutputValidation:
    """Test class for output validation functionality."""

    @pytest.mark.slow
    def test_predictions_match_label_space(
        self,
        minimal_train_config,
        minimal_model_config_setfit,
        minimal_model_config_transformers,
        temp_data_dir,
    ):
        """Verify predictions are within valid label space."""
        with patch(
            "syntetic_issue_report_data_generation.modeling.train.SOFT_CLEANED_DATA_DIR",
            temp_data_dir,
        ):
            train_ds, test_ds, eval_strategy = load_and_prepare_data(
                minimal_train_config,
                test_config=None,
                test_size=0.33,
            )

            num_classes = len(train_ds.label_encoder.classes_)
            valid_label_space = set(range(num_classes))

            # Mock mlflow so both training runs skip experiment logging.
            with patch("syntetic_issue_report_data_generation.modeling.train.mlflow"):
                model, metrics, y_true, y_pred, model_type = train_model_setfit(
                    minimal_model_config_setfit, train_ds, test_ds
                )

                assert all(
                    pred in valid_label_space for pred in y_pred
                ), f"SetFit predictions contain invalid labels. Valid: {valid_label_space}, Got: {set(y_pred)}"

                assert all(
                    true in valid_label_space for true in y_true
                ), f"True labels contain invalid values. Valid: {valid_label_space}, Got: {set(y_true)}"

                assert all(
                    0 <= pred < num_classes for pred in y_pred
                ), f"SetFit predictions out of range [0, {num_classes})"

                assert list(y_true) == list(
                    test_ds["label"]
                ), "True labels don't match original test dataset labels"

                (model_t, tokenizer), metrics_t, y_true_t, y_pred_t, model_type_t = (
                    train_model_transformers(minimal_model_config_transformers, train_ds, test_ds)
                )

                assert all(
                    pred in valid_label_space for pred in y_pred_t
                ), f"Transformers predictions contain invalid labels. Valid: {valid_label_space}, Got: {set(y_pred_t)}"

                assert all(
                    true in valid_label_space for true in y_true_t
                ), f"True labels contain invalid values. Valid: {valid_label_space}, Got: {set(y_true_t)}"

                assert all(
                    0 <= pred < num_classes for pred in y_pred_t
                ), f"Transformers predictions out of range [0, {num_classes})"

                assert list(y_true_t) == list(
                    test_ds["label"]
                ), "True labels don't match original test dataset labels"

                assert all(
                    isinstance(pred, (int, np.integer)) for pred in y_pred
                ), "SetFit predictions must be integers"
                assert all(
                    isinstance(pred, (int, np.integer)) for pred in y_pred_t
                ), "Transformers predictions must be integers"

                # Both models should produce at least one prediction.
                unique_preds = len(set(y_pred))
                unique_preds_t = len(set(y_pred_t))
                assert unique_preds >= 1, "SetFit made no predictions"
                assert unique_preds_t >= 1, "Transformers made no predictions"