add test class for clustering. Add the possibility to import clusters in the clustering sampling.
a1f096f
| import pytest | |
| import pandas as pd | |
| import numpy as np | |
| import os | |
| from syntetic_issue_report_data_generation.utils.clustering_sampling import ClusteringSampling | |
| from syntetic_issue_report_data_generation.utils.text_encoder import Encoder | |
| from syntetic_issue_report_data_generation.config import RANDOM_SEED, DATASET_CONFIGs, RAW_DATA_DIR, EMBEDDING_DIR | |
| from sklearn.datasets import make_blobs | |
@pytest.fixture
def dataset():
    """
    Fixture that loads the dataset used to test the clustering functions.

    The CSV location and separator are read from the 'nlbse24_test' entry of
    DATASET_CONFIGs; the separator falls back to ',' when the entry has none.

    Returns:
        The test dataset as a pandas DataFrame.
    """
    config = DATASET_CONFIGs["nlbse24_test"]
    # load the test dataset
    return pd.read_csv(
        RAW_DATA_DIR / f"{config['data_path']}",
        sep=config.get("sep", ","),
        encoding="utf-8",
    )
@pytest.fixture
def clustering_sampling():
    """
    Fixture that creates the ClusteringSampling model under test.

    Returns:
        A ClusteringSampling instance built with n_clusters=5, n_samples=20
        and method='medoid'.
    """
    return ClusteringSampling(n_clusters=5, n_samples=20, method='medoid')
class TestClustering:
    """
    Tests for the clustering functions of the class ClusteringSampling.

    The model under test uses the best hyperparameter configuration found in the
    notebook 'clustering-sampling-techniques-comparison.ipynb'.
    """

    def test_minimum_functionality(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Check that the clustering model works on a basic synthetic dataset
        """
        # Build a basic dataset with sklearn's make_blobs (labels are not needed)
        points, _ = make_blobs(n_samples=200, n_features=2, centers=5, random_state=RANDOM_SEED)

        # Run the model, then evaluate the returned sample against the full data
        result = clustering_sampling.forward(points)
        metrics = clustering_sampling.evaluate(result["sample"], points)

        # The clustering metrics must satisfy the expected thresholds
        assert metrics["silhouette"] > 0.5
        assert metrics["mse"] < 1.5

    def test_dataset(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Check the clustering model on the embeddings of the test dataset
        """
        # Issue bodies, each converted to a string
        body_column = DATASET_CONFIGs["nlbse24_test"]["body_col"]
        bodies = list(map(str, dataset[body_column].tolist()))

        # Reuse the stored embeddings when the file exists, otherwise encode and save them
        embeddings_path = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"
        if os.path.exists(embeddings_path):
            embeddings = np.load(embeddings_path)
        else:
            embeddings = Encoder().encode(bodies, save=True, save_path=embeddings_path)

        # Run the model, then evaluate the returned sample against all embeddings
        result = clustering_sampling.forward(embeddings)
        metrics = clustering_sampling.evaluate(result["sample"], embeddings)

        # The clustering metrics must satisfy the expected thresholds
        assert metrics["silhouette"] > 0.5
        assert metrics["mse"] < 1.5
class TestSampling:
    """
    Test class to test the sampling functions of the class ClusteringSampling.
    The model is tested on its best hyperparameters configuration found in the notebook 'clustering-sampling-techniques-comparison.ipynb'.
    """

    def test_minimum_functionality(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Test method that checks if the sampling method works with a basic dataset
        """
        # Create a basic dataset from a standard normal distribution.
        # A generator seeded with RANDOM_SEED keeps the input identical across runs,
        # so the fixed thresholds below are not checked against different data each time.
        rng = np.random.default_rng(RANDOM_SEED)
        X = rng.normal(0, 1, (200, 2))
        # Train the model
        sample_dict = clustering_sampling.forward(X)
        # evaluate the model
        evaluation = clustering_sampling.evaluate(sample_dict["sample"], X)
        # Check if the sampling metrics are satisfied
        assert evaluation["distributions_mean_norm_sim"] < 0.5
        assert evaluation["distributions_std_norm_sim"] < 0.5
        # NOTE(review): kept as an equality check; the type of "kst" is not visible here
        assert evaluation["kst"] == False

    def test_dataset(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Test the sampling method on the test dataset
        """
        # Extract the bodies of the dataset
        bodies = [str(x) for x in dataset[DATASET_CONFIGs["nlbse24_test"]["body_col"]].tolist()]
        # Extract embeddings of the bodies: computed and saved on the first run,
        # loaded from the saved file afterwards
        embeddings_path = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"
        if not os.path.exists(embeddings_path):
            encoder = Encoder()
            embeddings = encoder.encode(bodies, save=True, save_path=embeddings_path)
        else:
            embeddings = np.load(embeddings_path)
        # Train the model
        sample_dict = clustering_sampling.forward(embeddings)
        # evaluate the model
        evaluation = clustering_sampling.evaluate(sample_dict["sample"], embeddings)
        # Check if the sampling metrics are satisfied
        assert evaluation["distributions_mean_norm_sim"] < 0.5
        assert evaluation["distributions_std_norm_sim"] < 0.5
        # NOTE(review): kept as an equality check; the type of "kst" is not visible here
        assert evaluation["kst"] == False