"""Tests for the validation and cleaning routines of DataPreprocessing."""

import sysconfig
from pathlib import Path
from unittest.mock import patch

import pytest


@pytest.fixture(autouse=True, scope="session")
def patch_raw_data_dir():
    """Point ``config.RAW_DATA_DIR`` at a raw-data directory that exists.

    Prefers ``<cwd>/data/raw``; if the expected CSV fixtures are missing
    (e.g. the tests run against an installed copy of the package), falls
    back to the ``data/raw`` folder inside the current interpreter's
    site-packages directory.
    """
    # Import late so the module is resolved in the test environment.
    import syntetic_issue_report_data_generation.config as config_mod

    raw_dir = Path.cwd() / "data" / "raw"

    # Check whether the expected fixture files exist under the working dir.
    issues_test = (
        raw_dir / "issue-report-classification" / "nlbse24" / "issues_test.csv"
    )
    test_csv = raw_dir / "test" / "test.csv"
    if not (issues_test.exists() and test_csv.exists()):
        # Fallback: likely running from an installed package. Derive the
        # site-packages location from the interpreter instead of the old
        # hard-coded /opt/hostedtoolcache/Python/3.10.x path, which only
        # worked for one specific CI runner and Python version.
        # NOTE(review): assumes the data files are installed under
        # ``<site-packages>/data/raw`` — confirm against packaging config.
        raw_dir = Path(sysconfig.get_paths()["purelib"]) / "data" / "raw"

    with patch.object(config_mod, "RAW_DATA_DIR", raw_dir):
        yield


@pytest.fixture
def data_preprocessing():
    """
    Fixture used to load the dataset before running the test suite and to
    delete the DataPreprocessing object after the execution of the test suite
    """
    # Import after the RAW_DATA_DIR patch is applied.
    from syntetic_issue_report_data_generation.preprocessing import DataPreprocessing

    # Load the dataset before the tests.
    dp = DataPreprocessing("test")
    dp.load_dataset()
    dp.basic_stats()
    yield dp
    del dp


class TestDataValidation:
    """
    Test class to test the validation functions of the class DataPreprocessing
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """Reload the dataset after each test in the class.

        ``autouse=True`` is required: without it this fixture was never
        requested by any test and therefore never ran.
        """
        yield
        # Reset the dataset after each test.
        data_preprocessing.load_dataset()

    def test_columns_type_integrity(self, data_preprocessing):
        # Check that no column-type "unintegrity" is detected.
        res = data_preprocessing.check_dataset(checks=["column_types"])
        # All columns have a unique (consistent) type.
        assert res[0]["success"] and res[1]["success"] and res[2]["success"]
        assert (
            res[0]["num_of_failed_rows"] == 0
            and res[1]["num_of_failed_rows"] == 0
            and res[2]["num_of_failed_rows"] == 0
        )

    def test_missing_values(self, data_preprocessing):
        # Check that missing values are detected.
        res = data_preprocessing.check_dataset(checks=["missing_values"])
        # Only body and title have missing values (one row each).
        assert res[0]["success"] and not res[1]["success"] and not res[2]["success"]
        assert (
            res[0]["num_of_failed_rows"] == 0
            and res[1]["num_of_failed_rows"] == 1
            and res[2]["num_of_failed_rows"] == 1
        )

    def test_duplicates(self, data_preprocessing):
        # Check that duplicates are detected.
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]
        # Exactly two duplicate rows should be detected.
        assert not res["success"]
        assert res["num_of_failed_rows"] == 2

    def test_unmeaningful_bodies(self, data_preprocessing):
        # Check that unmeaningful bodies are detected.
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]
        # Some unmeaningful body should be detected.
        assert not res["success"]


class TestDataCleaning:
    """
    Test class to test the cleaning functions of the class DataPreprocessing
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """Reload the dataset after each test in the class.

        ``autouse=True`` is required: without it this fixture was never
        requested by any test and therefore never ran.
        """
        yield
        # Reset the dataset after each test.
        data_preprocessing.load_dataset()

    def test_clean_columns_integrity(self, data_preprocessing):
        # Clean the dataset columns integrity, then re-check.
        data_preprocessing.clean_columns_integrity()
        res = data_preprocessing.check_dataset(checks=["column_types"])
        # All columns have a unique (consistent) type.
        assert all(x["success"] for x in res)

    def test_clean_missing_values(self, data_preprocessing):
        # Clean the dataset missing values, then re-check.
        data_preprocessing.clean_missing_values()
        res = data_preprocessing.check_dataset(checks=["missing_values"])
        # No column should have missing values any more.
        assert all(x["success"] for x in res)

    def test_clean_duplicates(self, data_preprocessing):
        # Clean the dataset duplicates, then re-check.
        data_preprocessing.clean_duplicates()
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]
        # No duplicates should remain.
        assert res["success"]

    def test_clean_unmeaningful_bodies(self, data_preprocessing):
        # Clean the dataset unmeaningful bodies, then re-check.
        data_preprocessing.clean_unmeaningful_bodies()
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]
        # No unmeaningful bodies should remain.
        assert res["success"]

    def test_clean_bodies(self, data_preprocessing):
        # Clean the dataset bodies and inspect the first row.
        data_preprocessing.clean_bodies()
        df = data_preprocessing.get_dataset()
        print(df.iloc[0]["body"])
        assert (
            df.iloc[0]["body"]
            == "Here is a long body with URL and path and hex ."
        )