# Capibara/tests/test_data.py
import pytest
from syntetic_issue_report_data_generation.preprocessing import DataPreprocessing
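# Run this suite with: pytest tests/test_data.py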


@pytest.fixture(scope="session")
def data_preprocessing():
    """
    Fixture that loads the dataset once before the test suite runs and
    deletes the DataPreprocessing object after the whole suite has executed.
    """
    # load the dataset before the tests
    dp = DataPreprocessing("test")
    dp.load_dataset()
    dp.basic_stats()
    yield dp
    # drop the object after the suite
    del dp
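

# Each result returned by check_dataset() appears to be a mapping of the form
#   {"success": bool, "num_of_failed_rows": int}
# with one entry per checked column (e.g. three for "column_types"). This
# schema is inferred from the assertions below; the authoritative definition
# lives in DataPreprocessing.check_dataset.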


class TestDataValidation:
    """
    Tests for the validation functions of the DataPreprocessing class.
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """
        Fixture that reloads the dataset after each test in the class, so the
        session-scoped dataset is restored between tests.
        """
        yield
        # reset the dataset after each test
        data_preprocessing.load_dataset()

    def test_columns_type_integrity(self, data_preprocessing):
        # check that column-type violations are detected
        res = data_preprocessing.check_dataset(checks=["column_types"])
        # every column holds a single, consistent type
        assert all(x["success"] for x in res)
        assert all(x["num_of_failed_rows"] == 0 for x in res)

    def test_missing_values(self, data_preprocessing):
        # check that missing values are detected
        res = data_preprocessing.check_dataset(checks=["missing_values"])
        # only the body and title columns have missing values
        assert res[0]["success"] and not res[1]["success"] and not res[2]["success"]
        assert (
            res[0]["num_of_failed_rows"] == 0
            and res[1]["num_of_failed_rows"] == 1
            and res[2]["num_of_failed_rows"] == 1
        )

    def test_duplicates(self, data_preprocessing):
        # check that duplicates are detected
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]
        # exactly two duplicate rows should be detected
        assert not res["success"]
        assert res["num_of_failed_rows"] == 2

    def test_unmeaningful_bodies(self, data_preprocessing):
        # check that unmeaningful (non-informative) bodies are detected
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]
        # at least one unmeaningful body should be detected
        assert not res["success"]


class TestDataCleaning:
    """
    Tests for the cleaning functions of the DataPreprocessing class.
    """

    @pytest.fixture(autouse=True)
    def dataset_reset(self, data_preprocessing):
        """
        Fixture that reloads the dataset after each test in the class, so the
        session-scoped dataset is restored between tests.
        """
        yield
        # reset the dataset after each test
        data_preprocessing.load_dataset()

    def test_clean_columns_integrity(self, data_preprocessing):
        # repair the column-type integrity of the dataset
        data_preprocessing.clean_columns_integrity()
        # re-run the column-type check
        res = data_preprocessing.check_dataset(checks=["column_types"])
        # every column now holds a single, consistent type
        assert all(x["success"] for x in res)

    def test_clean_missing_values(self, data_preprocessing):
        # clean up the missing values in the dataset
        data_preprocessing.clean_missing_values()
        # re-run the missing-values check
        res = data_preprocessing.check_dataset(checks=["missing_values"])
        # no column should have missing values anymore
        assert all(x["success"] for x in res)

    def test_clean_duplicates(self, data_preprocessing):
        # remove the duplicates from the dataset
        data_preprocessing.clean_duplicates()
        # re-run the duplicates check
        res = data_preprocessing.check_dataset(checks=["duplicates"])[0]
        # no duplicates should be detected anymore
        assert res["success"]

    def test_clean_unmeaningful_bodies(self, data_preprocessing):
        # remove the unmeaningful bodies from the dataset
        data_preprocessing.clean_unmeaningful_bodies()
        # re-run the unmeaningful-bodies check
        res = data_preprocessing.check_dataset(checks=["unmeaningful_bodies"])[0]
        # no unmeaningful bodies should be detected anymore
        assert res["success"]

    def test_clean_bodies(self, data_preprocessing):
        # normalize the issue bodies
        data_preprocessing.clean_bodies()
        # the first body should have its URL, path and hex string replaced by
        # the <URL>, <PATH> and <HEX> placeholders
        df = data_preprocessing.get_dataset()
        print(df.iloc[0]["body"])  # shown by pytest when the assertion fails
        assert (
            df.iloc[0]["body"]
            == "Here is a long body with URL <URL> and path <PATH> and hex <HEX>."
        )
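
    # A hedged end-to-end sketch, not part of the original suite: it chains
    # the cleaning steps exercised above and then re-runs every check in a
    # single call. It assumes the cleaning methods can be applied in this
    # order and that check_dataset accepts several check names at once, as
    # its `checks` list parameter suggests.
    def test_full_cleaning_pipeline(self, data_preprocessing):
        # apply all cleaning steps in sequence
        data_preprocessing.clean_columns_integrity()
        data_preprocessing.clean_missing_values()
        data_preprocessing.clean_duplicates()
        data_preprocessing.clean_unmeaningful_bodies()
        # every check should now succeed
        res = data_preprocessing.check_dataset(
            checks=[
                "column_types",
                "missing_values",
                "duplicates",
                "unmeaningful_bodies",
            ]
        )
        assert all(x["success"] for x in res)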