# DATA PREPROCESSING PIPELINE
- Import the dataset
- Dataset cleaning
    - Checking column types integrity
    - Handle missing values
        - Remove rows with missing values
    - Remove duplicated samples
    - Clean body of the samples (remove html chars, urls, etc.)
    - Check for unmeaningful bodies (filter out samples whose bodies are too short)


In [1]:
from syntetic_issue_report_data_generation.dataset import DataPreprocessing
import pprint

# NASA AND NLBSE DATASETS PREPROCESSING

In [None]:
# Dataset names passed to DataPreprocessing: every corpus appears once per
# split, with all the train splits listed before the test splits.
datasets = [
    f"{corpus}_{split}"
    for split in ("train", "test")
    for corpus in ("nasa_cfs", "nasa_fprime", "nasa", "nlbse23", "nlbse24")
]

for dataset in datasets:
    print(f"\n\nProcessing dataset: {dataset}")

    # Load the dataset and report its basic statistics before any cleaning.
    preprocessor = DataPreprocessing(dataset)
    preprocessor.load_dataset()
    preprocessor.basic_stats()

    # First validation pass on the data as loaded; the report is saved with
    # report_path="Raw data".
    checks = ["column_types", "missing_values", "duplicates", "unmeaningful_bodies"]
    results = preprocessor.check_dataset(
        checks,
        save_report=True,
        report_path="Raw data",
    )
    pprint.pprint(results)

    # Run the automated cleaning steps.
    preprocessor.automated_cleaning()
    print("\nCleaning completed!\n")

    # Second validation pass with the same checks, to verify that the cleaning
    # worked; the report is saved with report_path="Interim data".
    results = preprocessor.check_dataset(
        checks,
        save_report=True,
        report_path="Interim data",
    )
    pprint.pprint(results)

    # Persist the cleaned dataset.
    preprocessor.save_dataset()

    # Drop the reference so the object can be reclaimed before the next
    # dataset is loaded.
    del preprocessor

[32m2025-11-19 18:10:31.659[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nasa/cfs_train.csv[0m
[32m2025-11-19 18:10:31.679[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m




Processing dataset: nasa_cfs_train
                                                    title  \
count                                                2178   
unique                                               2021   
top     Apps should use CFE_MSG_PTR macro instead of c...   
freq                                                    9   

                                                     body    label  \
count                                                2178     2179   
unique                                               2058        2   
top     **Checklist (Please check before submitting)**...  non-bug   
freq                                                    9     1517   

                                           url  
count                                     2178  
unique                                    2178  
top     https://github.com/nasa/osal/issues/68  
freq                                         1  


[32m2025-11-19 18:10:31.697[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:10:31.699[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m324[0m - [1mNumber of samples after cleaning columns integrity: 2178[0m
[32m2025-11-19 18:10:31.701[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m333[0m - [1mCleaning missing values...[0m
[32m2025-11-19 18:10:32.090[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m340[0m - [1mMissing values cleaned![0m
[32m2025-11-19 18:10:32.091[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m341[0m - [1mNumber of samples after cleaning missing values


Cleaning completed!



Calculating Metrics: 100%|██████████| 66/66 [00:00<00:00, 533.40it/s]
[32m2025-11-19 18:10:34.044[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mcheck_dataset[0m:[36m288[0m - [1mDataset checking completed![0m
[32m2025-11-19 18:10:34.141[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nasa/fprime_train.csv[0m
[32m2025-11-19 18:10:34.156[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m
[32m2025-11-19 18:10:34.164[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:10:34.166[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_i

<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:10:34.272[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m340[0m - [1mMissing values cleaned![0m
[32m2025-11-19 18:10:34.275[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m341[0m - [1mNumber of samples after cleaning missing values: 599[0m
[32m2025-11-19 18:10:34.276[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m349[0m - [1mCleaning duplicates...[0m
[32m2025-11-19 18:10:34.287[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m354[0m - [1mDuplicates cleaned![0m
[32m2025-11-19 18:10:34.296[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m355[0m - [1mNumber of samples after cleaning duplicates: 599[0m
[32m2025-11-19 18:10:34.296[0m 


Cleaning completed!



Calculating Metrics: 100%|██████████| 66/66 [00:00<00:00, 742.10it/s]
[32m2025-11-19 18:10:35.336[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mcheck_dataset[0m:[36m288[0m - [1mDataset checking completed![0m
[32m2025-11-19 18:10:35.424[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nasa/nasa_train_sample.csv[0m
[32m2025-11-19 18:10:35.431[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m
[32m2025-11-19 18:10:35.438[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:10:35.440[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_colu

<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:10:35.513[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m180[0m - [1mColumns type integrity checks set[0m
[32m2025-11-19 18:10:35.531[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_missing_values_expectation[0m:[36m191[0m - [1mMissing values checks set[0m
[32m2025-11-19 18:10:35.541[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_duplicates_expectation[0m:[36m204[0m - [1mDuplicates checks set[0m
[32m2025-11-19 18:10:35.554[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_unmeaningful_bodies_expectation[0m:[36m222[0m - [1mUnmeaningful bodies checks set[0m
Calculating Metrics: 100%|██████████| 42/42 [00:00<00:00, 684.44it/s]
[32m2025-11-19 18:10:36.023[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset

<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:11:52.219[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nlbse23/nlbse23-issue-classification-train.csv[0m
[32m2025-11-19 18:11:59.295[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m


                  id   labels                              title     body  \
count   1.275881e+06  1275881                            1275877  1271200   
unique           NaN        4                            1183033  1203337   
top              NaN      bug  Need a service that has a counter      TBD   
freq             NaN   670951                               1433     5382   
mean    1.137417e+09      NaN                                NaN      NaN   
std     2.253828e+08      NaN                                NaN      NaN   
min     2.747400e+04      NaN                                NaN      NaN   
25%     1.097049e+09      NaN                                NaN      NaN   
50%     1.182023e+09      NaN                                NaN      NaN   
75%     1.279079e+09      NaN                                NaN      NaN   
max     1.393120e+09      NaN                                NaN      NaN   

       author_association  
count             1275881  
unique             

[32m2025-11-19 18:12:03.929[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:12:03.931[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m324[0m - [1mNumber of samples after cleaning columns integrity: 1271197[0m
[32m2025-11-19 18:12:03.932[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m333[0m - [1mCleaning missing values...[0m
[32m2025-11-19 18:14:42.064[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m340[0m - [1mMissing values cleaned![0m
[32m2025-11-19 18:14:42.066[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m341[0m - [1mNumber of samples after cleaning missing val


Cleaning completed!



[32m2025-11-19 18:21:45.505[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m174[0m - [1mMost frequent type in column id: <class 'int'>[0m
[32m2025-11-19 18:21:45.850[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m174[0m - [1mMost frequent type in column labels: <class 'str'>[0m
[32m2025-11-19 18:21:46.144[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m174[0m - [1mMost frequent type in column title: <class 'str'>[0m
[32m2025-11-19 18:21:46.607[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m174[0m - [1mMost frequent type in column body: <class 'str'>[0m
[32m2025-11-19 18:21:46.944[0m | [1mINFO    [0m | [36msyntetic

<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:22:48.492[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nlbse24/issues_train.csv[0m
[32m2025-11-19 18:22:48.536[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m
[32m2025-11-19 18:22:48.561[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:22:48.562[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m324[0m - [1mNumber of samples after cleaning columns integrity: 1500[0m
[32m2025-11-19 18:22:48.562[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m333[0m - [1

                  repo           created_at label  \
count             1500                 1500  1500   
unique               5                 1500     3   
top     facebook/react  2022-01-11 16:30:53   bug   
freq               300                    1   500   

                                                    title  \
count                                                1500   
unique                                               1481   
top     [DevTools Bug] Cannot add node "1" because a n...   
freq                                                   12   

                                                     body  
count                                                1500  
unique                                               1492  
top     Please go to Stack Overflow for help and suppo...  
freq                                                    4  


[32m2025-11-19 18:22:48.835[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m340[0m - [1mMissing values cleaned![0m
[32m2025-11-19 18:22:48.837[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m341[0m - [1mNumber of samples after cleaning missing values: 1500[0m
[32m2025-11-19 18:22:48.838[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m349[0m - [1mCleaning duplicates...[0m
[32m2025-11-19 18:22:48.858[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m354[0m - [1mDuplicates cleaned![0m
[32m2025-11-19 18:22:48.861[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m355[0m - [1mNumber of samples after cleaning duplicates: 1500[0m
[32m2025-11-19 18:22:48.863[0


Cleaning completed!



Calculating Metrics: 100%|██████████| 78/78 [00:00<00:00, 313.09it/s]
[32m2025-11-19 18:22:51.109[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mcheck_dataset[0m:[36m288[0m - [1mDataset checking completed![0m
[32m2025-11-19 18:22:51.259[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nasa/cfs_test.csv[0m
[32m2025-11-19 18:22:51.279[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m
[32m2025-11-19 18:22:51.279[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:22:51.289[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integ

<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:22:51.389[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m340[0m - [1mMissing values cleaned![0m
[32m2025-11-19 18:22:51.389[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m341[0m - [1mNumber of samples after cleaning missing values: 545[0m
[32m2025-11-19 18:22:51.389[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m349[0m - [1mCleaning duplicates...[0m
[32m2025-11-19 18:22:51.403[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m354[0m - [1mDuplicates cleaned![0m
[32m2025-11-19 18:22:51.403[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m355[0m - [1mNumber of samples after cleaning duplicates: 545[0m
[32m2025-11-19 18:22:51.403[0m 


Cleaning completed!



Calculating Metrics: 100%|██████████| 66/66 [00:00<00:00, 640.52it/s]
[32m2025-11-19 18:22:52.469[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mcheck_dataset[0m:[36m288[0m - [1mDataset checking completed![0m


<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:22:52.659[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nasa/fprime_test.csv[0m
[32m2025-11-19 18:22:52.691[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m
[32m2025-11-19 18:22:52.702[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:22:52.703[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m324[0m - [1mNumber of samples after cleaning columns integrity: 150[0m
[32m2025-11-19 18:22:52.709[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m333[0m - [1mClea

                                          title  \
count                                       151   
unique                                      151   
top     Graviation compass system ref component   
freq                                          1   

                                                     body    label  \
count                                                 150      151   
unique                                                149        2   
top     | | |\r\n|:---|:---|\r\n|**_F´ Version_**| |\r...  non-bug   
freq                                                    2       76   

                                               url  
count                                          151  
unique                                         151  
top     https://github.com/nasa/fprime/issues/2118  
freq                                             1  

Cleaning completed!



[32m2025-11-19 18:22:52.882[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m174[0m - [1mMost frequent type in column label: <class 'str'>[0m
[32m2025-11-19 18:22:52.898[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m174[0m - [1mMost frequent type in column url: <class 'str'>[0m
[32m2025-11-19 18:22:52.909[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m180[0m - [1mColumns type integrity checks set[0m
[32m2025-11-19 18:22:52.965[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_missing_values_expectation[0m:[36m191[0m - [1mMissing values checks set[0m
[32m2025-11-19 18:22:52.980[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m

<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:22:53.868[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_duplicates_expectation[0m:[36m204[0m - [1mDuplicates checks set[0m
[32m2025-11-19 18:22:53.882[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_unmeaningful_bodies_expectation[0m:[36m222[0m - [1mUnmeaningful bodies checks set[0m
Calculating Metrics: 100%|██████████| 42/42 [00:00<00:00, 658.95it/s]
[32m2025-11-19 18:22:54.450[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mcheck_dataset[0m:[36m288[0m - [1mDataset checking completed![0m


<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:23:07.779[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nlbse23/nlbse23-issue-classification-test.csv[0m
[32m2025-11-19 18:23:08.507[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m


                  id  labels                              title    body  \
count   1.423200e+05  142320                             142320  141854   
unique           NaN       4                             137990  138488   
top              NaN     bug  Need a service that has a counter     TBD   
freq             NaN   74781                                159     649   
mean    1.137254e+09     NaN                                NaN     NaN   
std     2.256949e+08     NaN                                NaN     NaN   
min     4.838050e+05     NaN                                NaN     NaN   
25%     1.096958e+09     NaN                                NaN     NaN   
50%     1.181684e+09     NaN                                NaN     NaN   
75%     1.278590e+09     NaN                                NaN     NaN   
max     1.393073e+09     NaN                                NaN     NaN   

       author_association  
count              142320  
unique                  6  
top            

[32m2025-11-19 18:23:09.224[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:23:09.228[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m324[0m - [1mNumber of samples after cleaning columns integrity: 141854[0m
[32m2025-11-19 18:23:09.229[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m333[0m - [1mCleaning missing values...[0m
[32m2025-11-19 18:23:34.000[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m340[0m - [1mMissing values cleaned![0m
[32m2025-11-19 18:23:34.002[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m341[0m - [1mNumber of samples after cleaning missing valu


Cleaning completed!



[32m2025-11-19 18:24:20.499[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_columns_type_integrity_expectation[0m:[36m180[0m - [1mColumns type integrity checks set[0m
[32m2025-11-19 18:24:20.540[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_missing_values_expectation[0m:[36m191[0m - [1mMissing values checks set[0m
[32m2025-11-19 18:24:20.552[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_duplicates_expectation[0m:[36m204[0m - [1mDuplicates checks set[0m
[32m2025-11-19 18:24:20.562[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36m__check_unmeaningful_bodies_expectation[0m:[36m222[0m - [1mUnmeaningful bodies checks set[0m
Calculating Metrics: 100%|██████████| 71/71 [00:03<00:00, 18.74it/s]
[32m2025-11-19 18:24:26.358[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset

<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we

[32m2025-11-19 18:24:26.978[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mload_dataset[0m:[36m68[0m - [1mDataset loaded: issue-report-classification/nlbse24/issues_test.csv[0m
[32m2025-11-19 18:24:26.998[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m313[0m - [1mSolving columns integrity issues...[0m
[32m2025-11-19 18:24:27.021[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m323[0m - [1mColumns integrity issues solved![0m
[32m2025-11-19 18:24:27.023[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_columns_integrity[0m:[36m324[0m - [1mNumber of samples after cleaning columns integrity: 1498[0m
[32m2025-11-19 18:24:27.024[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m333[0m - [1m

                  repo           created_at label  \
count             1500                 1500  1500   
unique               5                 1500     3   
top     facebook/react  2022-01-06 11:01:32   bug   
freq               300                    1   500   

                                                    title  \
count                                                1500   
unique                                               1490   
top     [DevTools Bug] Cannot add node "1" because a n...   
freq                                                    4   

                                                     body  
count                                                1498  
unique                                               1488  
top     Please go to Stack Overflow for help and suppo...  
freq                                                    4  


[32m2025-11-19 18:24:27.285[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m340[0m - [1mMissing values cleaned![0m
[32m2025-11-19 18:24:27.287[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_missing_values[0m:[36m341[0m - [1mNumber of samples after cleaning missing values: 1498[0m
[32m2025-11-19 18:24:27.288[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m349[0m - [1mCleaning duplicates...[0m
[32m2025-11-19 18:24:27.321[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m354[0m - [1mDuplicates cleaned![0m
[32m2025-11-19 18:24:27.322[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mclean_duplicates[0m:[36m355[0m - [1mNumber of samples after cleaning duplicates: 1498[0m
[32m2025-11-19 18:24:27.324[0


Cleaning completed!



Calculating Metrics: 100%|██████████| 78/78 [00:00<00:00, 446.44it/s]
[32m2025-11-19 18:24:29.148[0m | [1mINFO    [0m | [36msyntetic_issue_report_data_generation.dataset[0m:[36mcheck_dataset[0m:[36m288[0m - [1mDataset checking completed![0m


<!DOCTYPE html>
<html>
  <head>
    <title>Data documentation compiled by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>
    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -we