import pytest

from syntetic_issue_report_data_generation.preprocessing import DataPreprocessing
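
# Interface exercised by this suite, inferred from the calls below (the exact
# signatures are assumptions about DataPreprocessing, not taken from its source):
#   DataPreprocessing(name), load_dataset(), basic_stats(), get_dataset(),
#   check_dataset(checks=[...]), clean_columns_integrity(), clean_missing_values(),
#   clean_duplicates(), clean_unmeaningful_bodies(), clean_bodies()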


@pytest.fixture(scope="session")
def data_preprocessing():
    """
    Fixture that loads the dataset before the test suite runs and deletes the
    DataPreprocessing object after the whole suite has executed. Session scope means
    every test shares the same instance; the dataset_reset fixtures reload the data
    after each test.
    """
    dp = DataPreprocessing("test")
    dp.load_dataset()
    dp.basic_stats()

    yield dp

    del dp
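

# The assertions below assume check_dataset() returns one result dict per
# expectation, in a list, each dict holding at least a "success" flag and a
# "num_of_failed_rows" count, e.g. (hypothetical values):
#
#     [{"success": True, "num_of_failed_rows": 0}, ...]
#
# This shape is inferred from the tests themselves, not from the implementation.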


class TestDataValidation:
    """
    Tests for the validation functions of the DataPreprocessing class.
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """
        Autouse fixture that reloads the dataset after each test in this class.
        """
        yield

        data_preprocessing.load_dataset()

    def test_columns_type_integrity(self, data_preprocessing):
        """All column-type checks pass with zero failed rows on the freshly loaded test dataset."""
        res = data_preprocessing.check_dataset(checks=["column_types"])

        assert res[0]["success"] and res[1]["success"] and res[2]["success"]
        assert (
            res[0]["num_of_failed_rows"] == 0
            and res[1]["num_of_failed_rows"] == 0
            and res[2]["num_of_failed_rows"] == 0
        )

    def test_missing_values(self, data_preprocessing):
        """The first missing-values check passes; the other two each report one failed row."""
        res = data_preprocessing.check_dataset(checks=["missing_values"])

        assert res[0]["success"] and not res[1]["success"] and not res[2]["success"]
        assert (
            res[0]["num_of_failed_rows"] == 0
            and res[1]["num_of_failed_rows"] == 1
            and res[2]["num_of_failed_rows"] == 1
        )

    def test_duplicates(self, data_preprocessing):
        """The duplicates check fails and reports two failed rows."""
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]

        assert not res["success"]
        assert res["num_of_failed_rows"] == 2

    def test_unmeaningful_bodies(self, data_preprocessing):
        """The unmeaningful-bodies check fails on the raw test dataset."""
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]

        assert not res["success"]


class TestDataCleaning:
    """
    Tests for the cleaning functions of the DataPreprocessing class.
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """
        Autouse fixture that reloads the dataset after each test in this class.
        """
        yield

        data_preprocessing.load_dataset()

    def test_clean_columns_integrity(self, data_preprocessing):
        """After clean_columns_integrity(), every column-type check passes."""
        data_preprocessing.clean_columns_integrity()

        res = data_preprocessing.check_dataset(checks=["column_types"])

        assert all(x["success"] for x in res)

    def test_clean_missing_values(self, data_preprocessing):
        """After clean_missing_values(), every missing-values check passes."""
        data_preprocessing.clean_missing_values()

        res = data_preprocessing.check_dataset(checks=["missing_values"])

        assert all(x["success"] for x in res)

    def test_clean_duplicates(self, data_preprocessing):
        """After clean_duplicates(), the duplicates check passes."""
        data_preprocessing.clean_duplicates()

        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]

        assert res["success"]

    def test_clean_unmeaningful_bodies(self, data_preprocessing):
        """After clean_unmeaningful_bodies(), the unmeaningful-bodies check passes."""
        data_preprocessing.clean_unmeaningful_bodies()

        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]

        assert res["success"]

    def test_clean_bodies(self, data_preprocessing):
        """clean_bodies() rewrites the first body with <URL>, <PATH> and <HEX> placeholders."""
        data_preprocessing.clean_bodies()

        df = data_preprocessing.get_dataset()

        assert (
            df.iloc[0]["body"]
            == "Here is a long body with URL <URL> and path <PATH> and hex <HEX>."
        )
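

# Illustrative sketch (not used by the tests above): one way a body-cleaning step
# could produce the <URL>/<PATH>/<HEX> placeholders asserted in test_clean_bodies.
# The helper name and regexes below are assumptions made for illustration, not the
# actual clean_bodies() implementation.
def _example_normalize_body(body: str) -> str:
    import re

    body = re.sub(r"https?://\S+", "<URL>", body)  # web links -> <URL>
    body = re.sub(r"(?:/[\w.\-]+){2,}", "<PATH>", body)  # unix-style paths -> <PATH>
    body = re.sub(r"\b0x[0-9a-fA-F]+\b", "<HEX>", body)  # hex literals -> <HEX>
    return body


# Example: _example_normalize_body("crash at 0xdeadbeef, see https://bugs.example.com/1")
# returns "crash at <HEX>, see <URL>".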