# Capibara/tests/test_data.py
import pytest
from syntetic_issue_report_data_generation.preprocessing import DataPreprocessing
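# Run this suite with: pytest tests/test_data.py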


@pytest.fixture(scope="session")
def data_preprocessing():
    """
    Fixture that loads the dataset once before the test suite runs and
    deletes the DataPreprocessing object after the whole suite has executed.
    """
    # load the dataset before the tests
    dp = DataPreprocessing("test")
    dp.load_dataset()
    dp.basic_stats()
    yield dp
    # drop the object after the suite
    del dp
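

# Each result returned by check_dataset() appears to be a mapping of the form
#   {"success": bool, "num_of_failed_rows": int}
# with one entry per checked column (e.g. three for "column_types"). This
# schema is inferred from the assertions below; the authoritative definition
# lives in DataPreprocessing.check_dataset.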


class TestDataValidation:
    """
    Tests for the validation functions of the DataPreprocessing class.
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """
        Fixture that reloads the dataset after each test in the class, so the
        session-scoped dataset is restored between tests.
        """
        yield
        # reset the dataset after each test
        data_preprocessing.load_dataset()

    def test_columns_type_integrity(self, data_preprocessing):
        # check that column-type violations are detected
        res = data_preprocessing.check_dataset(checks=["column_types"])
        # every column holds a single, consistent type
        assert all(x["success"] for x in res)
        assert all(x["num_of_failed_rows"] == 0 for x in res)

    def test_missing_values(self, data_preprocessing):
        # check that missing values are detected
        res = data_preprocessing.check_dataset(checks=["missing_values"])
        # only the body and title columns have missing values
        assert res[0]["success"] and not res[1]["success"] and not res[2]["success"]
        assert (
            res[0]["num_of_failed_rows"] == 0
            and res[1]["num_of_failed_rows"] == 1
            and res[2]["num_of_failed_rows"] == 1
        )

    def test_duplicates(self, data_preprocessing):
        # check that duplicates are detected
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]
        # exactly two duplicate rows should be detected
        assert not res["success"]
        assert res["num_of_failed_rows"] == 2

    def test_unmeaningful_bodies(self, data_preprocessing):
        # check that unmeaningful (non-informative) bodies are detected
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]
        # at least one unmeaningful body should be detected
        assert not res["success"]


class TestDataCleaning:
    """
    Tests for the cleaning functions of the DataPreprocessing class.
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """
        Fixture that reloads the dataset after each test in the class, so the
        session-scoped dataset is restored between tests.
        """
        yield
        # reset the dataset after each test
        data_preprocessing.load_dataset()

    def test_clean_columns_integrity(self, data_preprocessing):
        # repair the column-type integrity of the dataset
        data_preprocessing.clean_columns_integrity()
        # re-run the column-type check
        res = data_preprocessing.check_dataset(checks=["column_types"])
        # every column now holds a single, consistent type
        assert all(x["success"] for x in res)

    def test_clean_missing_values(self, data_preprocessing):
        # clean up the missing values in the dataset
        data_preprocessing.clean_missing_values()
        # re-run the missing-values check
        res = data_preprocessing.check_dataset(checks=["missing_values"])
        # no column should have missing values anymore
        assert all(x["success"] for x in res)

    def test_clean_duplicates(self, data_preprocessing):
        # remove the duplicates from the dataset
        data_preprocessing.clean_duplicates()
        # re-run the duplicates check
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]
        # no duplicates should be detected anymore
        assert res["success"]

    def test_clean_unmeaningful_bodies(self, data_preprocessing):
        # remove the unmeaningful bodies from the dataset
        data_preprocessing.clean_unmeaningful_bodies()
        # re-run the unmeaningful-bodies check
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]
        # no unmeaningful bodies should be detected anymore
        assert res["success"]

    def test_clean_bodies(self, data_preprocessing):
        # normalize the issue bodies
        data_preprocessing.clean_bodies()
        # the first body should have its URL, path and hex string replaced by
        # the <URL>, <PATH> and <HEX> placeholders
        df = data_preprocessing.get_dataset()
        print(df.iloc[0]["body"])  # shown by pytest when the assertion fails
        assert (
            df.iloc[0]["body"]
            == "Here is a long body with URL <URL> and path <PATH> and hex <HEX>."
        )
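
    # A hedged end-to-end sketch, not part of the original suite: it chains
    # the cleaning steps exercised above and then re-runs every check in a
    # single call. It assumes the cleaning methods can be applied in this
    # order and that check_dataset accepts several check names at once, as
    # its `checks` list parameter suggests.
    def test_full_cleaning_pipeline(self, data_preprocessing):
        # apply all cleaning steps in sequence
        data_preprocessing.clean_columns_integrity()
        data_preprocessing.clean_missing_values()
        data_preprocessing.clean_duplicates()
        data_preprocessing.clean_unmeaningful_bodies()
        # every check should now succeed
        res = data_preprocessing.check_dataset(
            checks=[
                "column_types",
                "missing_values",
                "duplicates",
                "unmeaningful_bodies",
            ]
        )
        assert all(x["success"] for x in res)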