Capibara / tests /test_clustering.py
MattiaColucci's picture
add test class for clustering. Add the possibility to import clusters in the clustering sampling.
a1f096f
import pytest
import pandas as pd
import numpy as np
import os
from syntetic_issue_report_data_generation.utils.clustering_sampling import ClusteringSampling
from syntetic_issue_report_data_generation.utils.text_encoder import Encoder
from syntetic_issue_report_data_generation.config import RANDOM_SEED, DATASET_CONFIGs, RAW_DATA_DIR, EMBEDDING_DIR
from sklearn.datasets import make_blobs
@pytest.fixture
def dataset():
    """
    Fixture that reads the 'nlbse24_test' dataset into a DataFrame for the clustering tests
    """
    # every reading option comes from the 'nlbse24_test' entry of the dataset configs
    test_config = DATASET_CONFIGs["nlbse24_test"]
    csv_path = RAW_DATA_DIR / f"{test_config['data_path']}"
    # fall back to a comma when the config entry does not define a separator
    separator = test_config.get('sep', ',')
    return pd.read_csv(csv_path, sep=separator, encoding="utf-8")
@pytest.fixture
def clustering_sampling():
    """
    Fixture that builds the ClusteringSampling instance exercised by the tests
    """
    # constructor arguments of the model under test
    settings = {"n_clusters": 5, "n_samples": 20, "method": "medoid"}
    return ClusteringSampling(**settings)
class TestClustering:
    """
    Tests for the clustering functions of the class ClusteringSampling.
    The model is tested on its best hyperparameters configuration found in the notebook 'clustering-sampling-techniques-comparison.ipynb'.
    """

    def test_minimum_functionality(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Check that the clustering model works on a basic synthetic dataset.
        The 'dataset' fixture is requested but not used by this test.
        """
        # Build a basic dataset with sklearn make_blobs (the generated labels are not needed)
        points, _ = make_blobs(n_samples=200, n_features=2, centers=5, random_state=RANDOM_SEED)
        # Feed the points to the model
        result = clustering_sampling.forward(points)
        # Evaluate the returned sample together with the original points
        metrics = clustering_sampling.evaluate(result["sample"], points)
        # The clustering metrics must stay within the expected thresholds
        assert metrics["silhouette"] > 0.5
        assert metrics["mse"] < 1.5

    def test_dataset(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Check the clustering model on the embeddings of the test dataset
        """
        # Collect the issue bodies as plain strings
        body_col = DATASET_CONFIGs["nlbse24_test"]["body_col"]
        issue_bodies = list(map(str, dataset[body_col].tolist()))
        # Reuse the embeddings stored on disk, computing and saving them only when the file is missing
        cache_file = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"
        if os.path.exists(cache_file):
            vectors = np.load(cache_file)
        else:
            vectors = Encoder().encode(issue_bodies, save=True, save_path=cache_file)
        # Feed the embeddings to the model
        result = clustering_sampling.forward(vectors)
        # Evaluate the returned sample together with the embeddings
        metrics = clustering_sampling.evaluate(result["sample"], vectors)
        # The clustering metrics must stay within the expected thresholds
        assert metrics["silhouette"] > 0.5
        assert metrics["mse"] < 1.5
class TestSampling:
    """
    Test class to test the sampling functions of the class ClusteringSampling.
    The model is tested on its best hyperparameters configuration found in the notebook 'clustering-sampling-techniques-comparison.ipynb'.
    """

    def test_minimum_functionality(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Test method that checks if the sampling method works with a basic dataset.
        The 'dataset' fixture is requested but not used by this test.
        """
        # Create a basic dataset from a standard normal distribution.
        # A generator seeded with RANDOM_SEED keeps the drawn points identical across runs,
        # so the threshold checks below do not depend on an arbitrary random draw.
        rng = np.random.default_rng(RANDOM_SEED)
        X = rng.normal(0, 1, (200, 2))
        # Feed the points to the model
        sample_dict = clustering_sampling.forward(X)
        # Evaluate the returned sample together with the original points
        evaluation = clustering_sampling.evaluate(sample_dict["sample"], X)
        # Check if the sampling metrics are satisfied
        assert evaluation["distributions_mean_norm_sim"] < 0.5
        assert evaluation["distributions_std_norm_sim"] < 0.5
        # Equality (not identity) is kept on purpose: it also holds for a boolean-like
        # value that is not the built-in False object (e.g. a numpy boolean)
        assert evaluation["kst"] == False

    def test_dataset(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Test the sampling method on the test dataset
        """
        # Extract the bodies of the dataset as plain strings
        bodies = [str(x) for x in dataset[DATASET_CONFIGs["nlbse24_test"]["body_col"]].tolist()]
        # Load the embeddings of the bodies from disk, computing and saving them when the file is missing
        embeddings_path = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"
        if not os.path.exists(embeddings_path):
            encoder = Encoder()
            embeddings = encoder.encode(bodies, save=True, save_path=embeddings_path)
        else:
            embeddings = np.load(embeddings_path)
        # Feed the embeddings to the model
        sample_dict = clustering_sampling.forward(embeddings)
        # Evaluate the returned sample together with the embeddings
        evaluation = clustering_sampling.evaluate(sample_dict["sample"], embeddings)
        # Check if the sampling metrics are satisfied
        assert evaluation["distributions_mean_norm_sim"] < 0.5
        assert evaluation["distributions_std_norm_sim"] < 0.5
        # Equality (not identity) is kept on purpose: it also holds for a boolean-like
        # value that is not the built-in False object (e.g. a numpy boolean)
        assert evaluation["kst"] == False