"""Tests for the validation and cleaning routines of DataPreprocessing."""

import sysconfig
from pathlib import Path
from unittest.mock import patch

import pytest


@pytest.fixture(autouse=True, scope="session")
def patch_raw_data_dir():
    """Point ``config.RAW_DATA_DIR`` at a raw-data directory that exists.

    Prefers ``<cwd>/data/raw``; if the expected CSV fixtures are missing
    (e.g. the tests run against an installed copy of the package), falls
    back to the ``data/raw`` folder inside the current interpreter's
    site-packages directory.
    """
    # Import late so the module is resolved in the test environment.
    import syntetic_issue_report_data_generation.config as config_mod

    raw_dir = Path.cwd() / "data" / "raw"

    # Check whether the expected fixture files exist under the working dir.
    issues_test = (
        raw_dir / "issue-report-classification" / "nlbse24" / "issues_test.csv"
    )
    test_csv = raw_dir / "test" / "test.csv"
    if not (issues_test.exists() and test_csv.exists()):
        # Fallback: likely running from an installed package. Derive the
        # site-packages location from the interpreter instead of the old
        # hard-coded /opt/hostedtoolcache/Python/3.10.x path, which only
        # worked for one specific CI runner and Python version.
        # NOTE(review): assumes the data files are installed under
        # ``<site-packages>/data/raw`` — confirm against packaging config.
        raw_dir = Path(sysconfig.get_paths()["purelib"]) / "data" / "raw"

    with patch.object(config_mod, "RAW_DATA_DIR", raw_dir):
        yield


@pytest.fixture
def data_preprocessing():
    """
    Fixture used to load the dataset before running the test suite and to
    delete the DataPreprocessing object after the execution of the test suite
    """
    # Import after the RAW_DATA_DIR patch is applied.
    from syntetic_issue_report_data_generation.preprocessing import DataPreprocessing

    # Load the dataset before the tests.
    dp = DataPreprocessing("test")
    dp.load_dataset()
    dp.basic_stats()
    yield dp
    del dp


class TestDataValidation:
    """
    Test class to test the validation functions of the class DataPreprocessing
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """Reload the dataset after each test in the class.

        ``autouse=True`` is required: without it this fixture was never
        requested by any test and therefore never ran.
        """
        yield
        # Reset the dataset after each test.
        data_preprocessing.load_dataset()

    def test_columns_type_integrity(self, data_preprocessing):
        # Check that no column-type "unintegrity" is detected.
        res = data_preprocessing.check_dataset(checks=["column_types"])
        # All columns have a unique (consistent) type.
        assert res[0]["success"] and res[1]["success"] and res[2]["success"]
        assert (
            res[0]["num_of_failed_rows"] == 0
            and res[1]["num_of_failed_rows"] == 0
            and res[2]["num_of_failed_rows"] == 0
        )

    def test_missing_values(self, data_preprocessing):
        # Check that missing values are detected.
        res = data_preprocessing.check_dataset(checks=["missing_values"])
        # Only body and title have missing values (one row each).
        assert res[0]["success"] and not res[1]["success"] and not res[2]["success"]
        assert (
            res[0]["num_of_failed_rows"] == 0
            and res[1]["num_of_failed_rows"] == 1
            and res[2]["num_of_failed_rows"] == 1
        )

    def test_duplicates(self, data_preprocessing):
        # Check that duplicates are detected.
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]
        # Exactly two duplicate rows should be detected.
        assert not res["success"]
        assert res["num_of_failed_rows"] == 2

    def test_unmeaningful_bodies(self, data_preprocessing):
        # Check that unmeaningful bodies are detected.
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]
        # Some unmeaningful body should be detected.
        assert not res["success"]


class TestDataCleaning:
    """
    Test class to test the cleaning functions of the class DataPreprocessing
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """Reload the dataset after each test in the class.

        ``autouse=True`` is required: without it this fixture was never
        requested by any test and therefore never ran.
        """
        yield
        # Reset the dataset after each test.
        data_preprocessing.load_dataset()

    def test_clean_columns_integrity(self, data_preprocessing):
        # Clean the dataset columns integrity, then re-check.
        data_preprocessing.clean_columns_integrity()
        res = data_preprocessing.check_dataset(checks=["column_types"])
        # All columns have a unique (consistent) type.
        assert all(x["success"] for x in res)

    def test_clean_missing_values(self, data_preprocessing):
        # Clean the dataset missing values, then re-check.
        data_preprocessing.clean_missing_values()
        res = data_preprocessing.check_dataset(checks=["missing_values"])
        # No column should have missing values any more.
        assert all(x["success"] for x in res)

    def test_clean_duplicates(self, data_preprocessing):
        # Clean the dataset duplicates, then re-check.
        data_preprocessing.clean_duplicates()
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]
        # No duplicates should remain.
        assert res["success"]

    def test_clean_unmeaningful_bodies(self, data_preprocessing):
        # Clean the dataset unmeaningful bodies, then re-check.
        data_preprocessing.clean_unmeaningful_bodies()
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]
        # No unmeaningful bodies should remain.
        assert res["success"]

    def test_clean_bodies(self, data_preprocessing):
        # Clean the dataset bodies and inspect the first row.
        data_preprocessing.clean_bodies()
        df = data_preprocessing.get_dataset()
        print(df.iloc[0]["body"])
        assert (
            df.iloc[0]["body"]
            == "Here is a long body with URL and path and hex ."
        )