File size: 5,095 Bytes
a1f096f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import pytest
import pandas as pd
import numpy as np
import os

from syntetic_issue_report_data_generation.utils.clustering_sampling import ClusteringSampling
from syntetic_issue_report_data_generation.utils.text_encoder import Encoder
from syntetic_issue_report_data_generation.config import RANDOM_SEED, DATASET_CONFIGs, RAW_DATA_DIR, EMBEDDING_DIR
from sklearn.datasets import make_blobs

@pytest.fixture
def dataset():
    """
    Fixture returning the 'nlbse24_test' dataset as a pandas DataFrame.

    The file location and the (optional) column separator are read from the
    'nlbse24_test' entry of DATASET_CONFIGs; the separator falls back to ','.
    """
    config = DATASET_CONFIGs["nlbse24_test"]

    # The data path is resolved relative to the raw data directory
    csv_path = RAW_DATA_DIR / f"{config['data_path']}"
    separator = config.get('sep', ',')

    return pd.read_csv(csv_path, sep=separator, encoding="utf-8")

@pytest.fixture
def clustering_sampling():
    """
    Fixture building the ClusteringSampling instance under test
    (5 clusters, 20 samples, 'medoid' method).
    """
    model = ClusteringSampling(
        n_clusters=5,
        n_samples=20,
        method='medoid',
    )
    return model

class TestClustering:
    """
    Tests covering the clustering behaviour of the ClusteringSampling class.

    The model configuration comes from the `clustering_sampling` fixture; it is
    the best hyperparameter configuration found in the notebook
    'clustering-sampling-techniques-comparison.ipynb'.
    """

    def test_minimum_functionality(self, dataset: pd.DataFrame, clustering_sampling: ClusteringSampling):
        """
        Check that the clustering model works on a simple synthetic dataset
        """
        # Synthetic dataset: 200 two-dimensional points generated around 5 centers
        points, _ = make_blobs(n_samples=200, n_features=2, centers=5, random_state=RANDOM_SEED)

        # Run the model, then score the drawn sample against the full data
        result = clustering_sampling.forward(points)
        metrics = clustering_sampling.evaluate(result["sample"], points)

        # The clustering metrics must satisfy both thresholds
        assert metrics["silhouette"] > 0.5
        assert metrics["mse"] < 1.5

    def test_dataset(self, dataset: pd.DataFrame, clustering_sampling: ClusteringSampling):
        """
        Check the clustering model on the embeddings of the test dataset
        """
        # Issue bodies, each coerced to a string
        body_column = DATASET_CONFIGs["nlbse24_test"]["body_col"]
        texts = list(map(str, dataset[body_column].tolist()))

        # Reuse the embeddings stored on disk when present, otherwise compute and save them
        cache_file = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"
        if os.path.exists(cache_file):
            vectors = np.load(cache_file)
        else:
            vectors = Encoder().encode(texts, save=True, save_path=cache_file)

        # Run the model, then score the drawn sample against all embeddings
        result = clustering_sampling.forward(vectors)
        metrics = clustering_sampling.evaluate(result["sample"], vectors)

        # The clustering metrics must satisfy both thresholds
        assert metrics["silhouette"] > 0.5
        assert metrics["mse"] < 1.5

class TestSampling:
    """
    Test class to test the sampling functions of the class ClusteringSampling.

    The model is tested on its best hyperparameters configuration found in the notebook 'clustering-sampling-techniques-comparison.ipynb'.
    """

    def test_minimum_functionality(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Test method that checks if the sampling method works with a basic dataset.

        The input is 200 two-dimensional points drawn from a standard normal
        distribution with a generator seeded by RANDOM_SEED, so every run
        evaluates the same data.
        """
        # Create a basic dataset from a normal distribution. A locally seeded
        # generator keeps the input reproducible without touching NumPy's
        # global random state.
        rng = np.random.default_rng(RANDOM_SEED)
        X = rng.normal(0, 1, (200, 2))

        # Train the model
        sample_dict = clustering_sampling.forward(X)

        # evaluate the model
        evaluation = clustering_sampling.evaluate(sample_dict["sample"], X)

        # Check if the sampling metrics are satisfied
        assert evaluation["distributions_mean_norm_sim"] < 0.5
        assert evaluation["distributions_std_norm_sim"] < 0.5
        # NOTE(review): equality (not identity) comparison is kept on purpose in
        # case evaluate() returns a NumPy boolean rather than a Python bool.
        assert evaluation["kst"] == False

    def test_dataset(self, dataset : pd.DataFrame, clustering_sampling : ClusteringSampling):
        """
        Test the sampling method on the test dataset.

        The embeddings of the issue bodies are loaded from disk when the
        embeddings file already exists; otherwise they are computed with
        Encoder and saved to that path.
        """
        # Extract the bodies of the dataset
        bodies = [str(x) for x in dataset[DATASET_CONFIGs["nlbse24_test"]["body_col"]].tolist()]

        # Extract embeddings of the bodies
        embeddings_path = EMBEDDING_DIR / "issue-report-classification" / "nlbse24" / "nlbse24_test_embeddings.npy"

        if not os.path.exists(embeddings_path):
            encoder = Encoder()
            embeddings = encoder.encode(bodies, save=True, save_path=embeddings_path)
        else:
            embeddings = np.load(embeddings_path)

        # Train the model
        sample_dict = clustering_sampling.forward(embeddings)

        # evaluate the model
        evaluation = clustering_sampling.evaluate(sample_dict["sample"], embeddings)

        # Check if the sampling metrics are satisfied
        assert evaluation["distributions_mean_norm_sim"] < 0.5
        assert evaluation["distributions_std_norm_sim"] < 0.5
        # NOTE(review): equality (not identity) comparison is kept on purpose in
        # case evaluate() returns a NumPy boolean rather than a Python bool.
        assert evaluation["kst"] == False