Spaces:

donato11
/

Capibara

Running

App Files Files Community

Capibara / tests /test_clustering.py

MattiaColucci

add test class for clustering. Add the possibility to import clusters in the clustering sampling.

a1f096f 20 days ago

raw

history blame contribute delete

5.1 kB

	import pytest
	import pandas as pd
	import numpy as np
	import os

	from syntetic_issue_report_data_generation.utils.clustering_sampling import ClusteringSampling
	from syntetic_issue_report_data_generation.utils.text_encoder import Encoder
	from syntetic_issue_report_data_generation.config import RANDOM_SEED, DATASET_CONFIGs, RAW_DATA_DIR, EMBEDDING_DIR
	from sklearn.datasets import make_blobs

	@pytest.fixture
	def dataset():
	    """
	    Fixture returning the 'nlbse24_test' dataset as a pandas DataFrame.

	    The file name and the optional column separator (',' when the entry has
	    no 'sep' key) are read from the 'nlbse24_test' entry of DATASET_CONFIGs.
	    The file is looked up under RAW_DATA_DIR and decoded as UTF-8.
	    """
	    # Look the configuration entry up once and reuse it below
	    config = DATASET_CONFIGs["nlbse24_test"]

	    csv_path = RAW_DATA_DIR / f"{config['data_path']}"
	    separator = config.get("sep", ",")

	    return pd.read_csv(csv_path, sep=separator, encoding="utf-8")

	@pytest.fixture
	def clustering_sampling():
	    """
	    Fixture building the ClusteringSampling instance exercised by the tests.

	    The instance is created with 5 clusters, 20 samples and the 'medoid' method.
	    """
	    # Hyperparameters kept together so the tested configuration is easy to spot
	    hyperparameters = {
	        "n_clusters": 5,
	        "n_samples": 20,
	        "method": "medoid",
	    }
	    return ClusteringSampling(**hyperparameters)

	class TestClustering:
	    """
	    Tests for the clustering behaviour of the class ClusteringSampling.

	    The model under test comes from the `clustering_sampling` fixture; per the
	    original author, it uses the best hyperparameter configuration found in the
	    notebook 'clustering-sampling-techniques-comparison.ipynb'.
	    """

	    def test_minimum_functionality(self, dataset: pd.DataFrame, clustering_sampling: ClusteringSampling):
	        """
	        Check that the clustering model works on a small synthetic dataset.

	        The data are 200 two-dimensional points generated by make_blobs around
	        5 centers, seeded with RANDOM_SEED. The test passes when the reported
	        'silhouette' is above 0.5 and the reported 'mse' is below 1.5.
	        """
	        # Basic dataset from sklearn make_blobs (the labels are not needed)
	        points, _ = make_blobs(n_samples=200, n_features=2, centers=5, random_state=RANDOM_SEED)

	        # Run the model, then score the produced sample against the input
	        result = clustering_sampling.forward(points)
	        metrics = clustering_sampling.evaluate(result["sample"], points)

	        # Clustering quality thresholds
	        assert metrics["silhouette"] > 0.5
	        assert metrics["mse"] < 1.5

	    def test_dataset(self, dataset: pd.DataFrame, clustering_sampling: ClusteringSampling):
	        """
	        Check the clustering model on the embeddings of the test dataset.

	        The embeddings are loaded from the .npy file under EMBEDDING_DIR when
	        that file exists; otherwise they are computed from the issue bodies
	        with Encoder, which is asked to save them at the same path.
	        """
	        # Issue bodies, each one converted to a string
	        body_column = DATASET_CONFIGs["nlbse24_test"]["body_col"]
	        bodies = list(map(str, dataset[body_column].tolist()))

	        embeddings_path = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"

	        # Reuse the stored embeddings when available, encode the bodies otherwise
	        if os.path.exists(embeddings_path):
	            embeddings = np.load(embeddings_path)
	        else:
	            embeddings = Encoder().encode(bodies, save=True, save_path=embeddings_path)

	        # Run the model, then score the produced sample against the embeddings
	        result = clustering_sampling.forward(embeddings)
	        metrics = clustering_sampling.evaluate(result["sample"], embeddings)

	        # Clustering quality thresholds
	        assert metrics["silhouette"] > 0.5
	        assert metrics["mse"] < 1.5

	class TestSampling:
	    """
	    Test class to test the sampling functions of the class ClusteringSampling.

	    The model is tested on its best hyperparameters configuration found in the
	    notebook 'clustering-sampling-techniques-comparison.ipynb'.
	    """

	    def test_minimum_functionality(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
	        """
	        Test method that checks if the sampling method works with a basic dataset.

	        The data are 200 two-dimensional points drawn from a standard normal
	        distribution. The test passes when 'distributions_mean_norm_sim' and
	        'distributions_std_norm_sim' are both below 0.5 and 'kst' equals False.
	        """
	        # Create a basic dataset from a normal distribution. A generator seeded
	        # with RANDOM_SEED is used instead of the global NumPy random state so
	        # that every run evaluates exactly the same points: with unseeded data
	        # the assertions below could pass or fail from one run to the next.
	        # NOTE(review): assumes RANDOM_SEED is an int (or None) -- confirm in config.
	        rng = np.random.default_rng(RANDOM_SEED)
	        X = rng.normal(0, 1, (200, 2))

	        # Train the model
	        sample_dict = clustering_sampling.forward(X)

	        # Evaluate the model
	        evaluation = clustering_sampling.evaluate(sample_dict["sample"], X)

	        # Check if the sampling metrics are satisfied
	        assert evaluation["distributions_mean_norm_sim"] < 0.5
	        assert evaluation["distributions_std_norm_sim"] < 0.5
	        # NOTE(review): 'kst' is presumably the outcome of a Kolmogorov-Smirnov
	        # test -- confirm against ClusteringSampling.evaluate. The comparison is
	        # kept as '== False' so the accepted values stay exactly as before.
	        assert evaluation["kst"] == False

	    def test_dataset(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
	        """
	        Test the sampling method on the test dataset.

	        The embeddings of the issue bodies are loaded from the .npy file under
	        EMBEDDING_DIR when it exists; otherwise they are computed with Encoder,
	        which is asked to save them at the same path.
	        """
	        # Extract the bodies of the dataset
	        bodies = [str(x) for x in dataset[DATASET_CONFIGs["nlbse24_test"]["body_col"]].tolist()]

	        # Extract embeddings of the bodies
	        embeddings_path = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"

	        if not os.path.exists(embeddings_path):
	            encoder = Encoder()
	            embeddings = encoder.encode(bodies, save=True, save_path=embeddings_path)
	        else:
	            embeddings = np.load(embeddings_path)

	        # Train the model
	        sample_dict = clustering_sampling.forward(embeddings)

	        # Evaluate the model
	        evaluation = clustering_sampling.evaluate(sample_dict["sample"], embeddings)

	        # Check if the sampling metrics are satisfied
	        assert evaluation["distributions_mean_norm_sim"] < 0.5
	        assert evaluation["distributions_std_norm_sim"] < 0.5
	        # Same comparison form as in test_minimum_functionality of this class
	        assert evaluation["kst"] == False