Spaces:

donato11
/

Capibara

Running

File size: 5,095 Bytes

a1f096f

import pytest
import pandas as pd
import numpy as np
import os

from syntetic_issue_report_data_generation.utils.clustering_sampling import ClusteringSampling
from syntetic_issue_report_data_generation.utils.text_encoder import Encoder
from syntetic_issue_report_data_generation.config import RANDOM_SEED, DATASET_CONFIGs, RAW_DATA_DIR, EMBEDDING_DIR
from sklearn.datasets import make_blobs

@pytest.fixture
def dataset():
    """
    Fixture returning the dataset registered under the 'nlbse24_test' key
    of DATASET_CONFIGs, read from RAW_DATA_DIR as a pandas DataFrame.
    """
    config = DATASET_CONFIGs['nlbse24_test']

    # Resolve the CSV location; the separator falls back to a comma when
    # the configuration entry does not define one.
    csv_path = RAW_DATA_DIR / f"{config['data_path']}"
    separator = config.get('sep', ',')

    return pd.read_csv(csv_path, sep=separator, encoding="utf-8")

@pytest.fixture
def clustering_sampling():
    """
    Fixture building the ClusteringSampling instance exercised by the tests
    (5 clusters, 20 samples, 'medoid' method).
    """
    settings = {"n_clusters": 5, "n_samples": 20, "method": 'medoid'}
    return ClusteringSampling(**settings)

class TestClustering:
    """
    Tests for the clustering side of ClusteringSampling.

    The model under test uses the best hyperparameter configuration found in
    the notebook 'clustering-sampling-techniques-comparison.ipynb'.
    """

    def test_minimum_functionality(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Check the clustering metrics on a small synthetic blob dataset.
        """
        # 200 two-dimensional points around 5 centers; the labels are not needed
        points, _ = make_blobs(n_samples=200, n_features=2, centers=5, random_state=RANDOM_SEED)

        # Run the model, then score the returned sample against the input
        result = clustering_sampling.forward(points)
        metrics = clustering_sampling.evaluate(result["sample"], points)

        # Thresholds the clustering metrics must meet
        assert metrics["silhouette"] > 0.5
        assert metrics["mse"] < 1.5

    def test_dataset(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Check the clustering metrics on the embeddings of the test dataset.
        """
        # Issue bodies, each coerced to a string
        body_column = DATASET_CONFIGs["nlbse24_test"]["body_col"]
        texts = list(map(str, dataset[body_column].tolist()))

        # Location of the cached embeddings of the bodies
        cache_file = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"

        if os.path.exists(cache_file):
            # Reuse the embeddings already stored on disk
            vectors = np.load(cache_file)
        else:
            # No cache yet: encode the bodies, passing the cache path as save target
            vectors = Encoder().encode(texts, save=True, save_path=cache_file)

        # Run the model, then score the returned sample against the embeddings
        result = clustering_sampling.forward(vectors)
        metrics = clustering_sampling.evaluate(result["sample"], vectors)

        # Thresholds the clustering metrics must meet
        assert metrics["silhouette"] > 0.5
        assert metrics["mse"] < 1.5

class TestSampling:
    """
    Test class to test the sampling functions of the class ClusteringSampling.

    The model is tested on its best hyperparameters configuration found in the notebook 'clustering-sampling-techniques-comparison.ipynb'.
    """

    def test_minimum_functionality(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Test method that checks if the sampling method works with a basic dataset.

        The data are 200 two-dimensional points drawn from a standard normal
        distribution using a generator seeded with RANDOM_SEED, so every run
        of the test sees the same input.
        """
        # Create a basic dataset from a normal distribution.
        # A locally seeded generator keeps the test reproducible and does not
        # depend on (or disturb) the global NumPy random state.
        rng = np.random.default_rng(RANDOM_SEED)
        X = rng.normal(0, 1, (200, 2))

        # Train the model
        sample_dict = clustering_sampling.forward(X)

        # Evaluate the model
        evaluation = clustering_sampling.evaluate(sample_dict["sample"], X)

        # Check if the sampling metrics are satisfied
        assert evaluation["distributions_mean_norm_sim"] < 0.5
        assert evaluation["distributions_std_norm_sim"] < 0.5
        # NOTE(review): equality is kept on purpose; if "kst" is a NumPy bool,
        # an identity check (`is False`) would fail -- confirm the returned type.
        assert evaluation["kst"] == False

    def test_dataset(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Test the sampling method on the test dataset.

        The embeddings of the issue bodies are loaded from disk when the
        cache file exists; otherwise they are computed with Encoder, which
        is asked to save them to that same path.
        """
        # Extract the bodies of the dataset
        bodies = [str(x) for x in dataset[DATASET_CONFIGs["nlbse24_test"]["body_col"]].tolist()]

        # Extract embeddings of the bodies
        embeddings_path = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"

        if not os.path.exists(embeddings_path):
            encoder = Encoder()
            embeddings = encoder.encode(bodies, save=True, save_path=embeddings_path)
        else:
            embeddings = np.load(embeddings_path)

        # Train the model
        sample_dict = clustering_sampling.forward(embeddings)

        # Evaluate the model
        evaluation = clustering_sampling.evaluate(sample_dict["sample"], embeddings)

        # Check if the sampling metrics are satisfied
        assert evaluation["distributions_mean_norm_sim"] < 0.5
        assert evaluation["distributions_std_norm_sim"] < 0.5
        # NOTE(review): equality is kept on purpose; if "kst" is a NumPy bool,
        # an identity check (`is False`) would fail -- confirm the returned type.
        assert evaluation["kst"] == False