{ "cells": [ { "cell_type": "markdown", "id": "3d9f16ce", "metadata": {}, "source": [ "# DATA PREPROCESSING PIPELINE\n", "- Import the dataset\n", "- Dataset cleaning\n", " - Checking column types integrity\n", " - Handle missing values\n", " - Remove rows with missing values\n", " - Remove duplicated samples\n", " - Clean body of the samples (remove html chars, urls, etc.)\n", " - Check if there are unmeaningful bodies (filtering out samples with too short bodies)\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "da581a28", "metadata": {}, "outputs": [], "source": [ "from syntetic_issue_report_data_generation.dataset import DataPreprocessing\n", "import pprint" ] }, { "cell_type": "markdown", "id": "b407fe25", "metadata": {}, "source": [ "# NASA CFS DATASET PREPROCESSING" ] }, { "cell_type": "code", "execution_count": null, "id": "25ff6cd0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:10:31.659\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nasa/cfs_train.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:10:31.679\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Processing dataset: nasa_cfs_train\n", " title \\\n", "count 2178 \n", "unique 2021 \n", "top Apps should use CFE_MSG_PTR macro instead of c... \n", "freq 9 \n", "\n", " body label \\\n", "count 2178 2179 \n", "unique 2058 2 \n", "top **Checklist (Please check before submitting)**... 
non-bug \n", "freq 9 1517 \n", "\n", " url \n", "count 2178 \n", "unique 2178 \n", "top https://github.com/nasa/osal/issues/68 \n", "freq 1 \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:10:31.697\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:31.699\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 2178\u001b[0m\n", "\u001b[32m2025-11-19 18:10:31.701\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning missing values...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.090\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.091\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 2178\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.092\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.109\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates 
cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 2178\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.112\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.697\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.697\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 2178\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.697\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.713\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 107\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.719\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.721\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 2107\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.723\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column title: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.735\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column body: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.743\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column label: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.754\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column url: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.775\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.836\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.859\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:32.876\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Calculating Metrics: 100%|██████████| 66/66 [00:00<00:00, 533.40it/s]\n", "\u001b[32m2025-11-19 18:10:34.044\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.141\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nasa/fprime_train.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.156\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.164\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.166\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns 
integrity: 599\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.170\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning missing values...\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
10
\n", "
Successful Expectations
10
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:10:33Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T171032.918001Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " 573ec3611019dad2ea8d64f51c9bb5a0\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: title, body, label, url\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " body\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 107 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " label\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " title\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " url\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntitle\"}, \"id\": \"22929ba0-0c39-49b2-9875-55d98d5a3a88\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\"}, \"meta\": {\"tag\": \"missing_values_columntitle\"}, \"id\": \"b4ecd045-0b5d-4356-adb8-f67f0decc826\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnbody\"}, \"id\": \"3e094218-6f50-4d59-b57d-d68c619ae4e5\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\"}, \"meta\": {\"tag\": \"missing_values_columnbody\"}, \"id\": \"f74018d0-2047-43c8-95ab-57021ba173d0\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"min_value\": 107}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"456d6801-f869-4185-8ebf-b7851345169d\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": 
\"label\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabel\"}, \"id\": \"12f547ea-b486-4913-a15d-08da90602847\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\"}, \"meta\": {\"tag\": \"missing_values_columnlabel\"}, \"id\": \"27ad6d8a-cd7c-4683-bf37-d7f8cab3fcaf\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"url\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnurl\"}, \"id\": \"6cd2a03c-5050-4d64-8dca-1fb8eb1b940c\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"url\"}, \"meta\": {\"tag\": \"missing_values_columnurl\"}, \"id\": \"65309808-0e59-4a16-83b8-dd8cd797fad0\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column_list\": [\"title\", \"body\", \"label\", \"url\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"3a0d6ddb-b3bc-4954-b2f6-b0607742561e\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n", "\n", "\n", "Processing dataset: nasa_fprime_train\n", " title \\\n", "count 600 \n", "unique 600 \n", "top StubFileTest Randomly Fails \n", "freq 1 \n", "\n", " body label \\\n", "count 599 600 \n", "unique 594 2 \n", "top | | |\\r\\n|:---|:---|\\r\\n|**_F´ 
Version_**| |\\r... non-bug \n", "freq 5 304 \n", "\n", " url \n", "count 600 \n", "unique 600 \n", "top https://github.com/nasa/fprime/issues/2733 \n", "freq 1 \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:10:34.272\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.275\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 599\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.276\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.287\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.296\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 599\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.296\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.571\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 
18:10:34.574\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 599\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.576\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.582\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 104\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.587\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.589\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 581\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.595\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column title: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.603\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column body: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.615\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column label: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.628\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column url: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.641\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.690\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.709\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:34.727\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Calculating Metrics: 100%|██████████| 66/66 [00:00<00:00, 742.10it/s]\n", "\u001b[32m2025-11-19 18:10:35.336\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - 
\u001b[1mDataset checking completed!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.424\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nasa/nasa_train_sample.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.431\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.438\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.440\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 10\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.441\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning missing values...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.449\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.455\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 10\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.456\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.461\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 10\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.474\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.476\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 10\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.477\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.484\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 185\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.487\u001b[0m 
| \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.487\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 9\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.494\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column label: \u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.498\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column text: \u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
10
\n", "
Successful Expectations
10
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:10:35Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T171034.757186Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " 926f9f43aabb59a79637b7b58bf62fef\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: title, body, label, url\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " body\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 104 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " label\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " title\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " url\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntitle\"}, \"id\": \"9f86d341-1cfa-47c0-bf00-b51bb1c3d7e0\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\"}, \"meta\": {\"tag\": \"missing_values_columntitle\"}, \"id\": \"8081e179-9f11-4504-aa35-ec1cc4c5366a\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnbody\"}, \"id\": \"dc5c961c-b580-4c59-b20d-45a07f077771\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\"}, \"meta\": {\"tag\": \"missing_values_columnbody\"}, \"id\": \"a8169b18-6807-4de7-aeaa-083846da3e38\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"min_value\": 104}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"52a7ad9a-31b9-4010-b816-dbdc6e5d2ebf\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": 
\"label\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabel\"}, \"id\": \"114097a4-6768-4a80-b833-d4cb060fcdb5\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\"}, \"meta\": {\"tag\": \"missing_values_columnlabel\"}, \"id\": \"23be55d2-9ef9-4ff5-b0f2-715d9cea88ee\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"url\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnurl\"}, \"id\": \"3546a5bc-707f-4fe4-b5fc-7e13ac9178ee\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"url\"}, \"meta\": {\"tag\": \"missing_values_columnurl\"}, \"id\": \"cc2c8ee8-1b7e-4d8b-b55c-b1c3a71ce174\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column_list\": [\"title\", \"body\", \"label\", \"url\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"a6f548f1-1c5f-4aa9-b7ef-78aa996b7903\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n", "\n", "\n", "Processing dataset: nasa_train\n", " label text\n", "count 10 10\n", "unique 4 10\n", "top feature Exiting an Application Creates an Application ...\n", "freq 6 1\n", "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "\u001b[32m2025-11-19 18:10:35.513\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.531\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.541\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:10:35.554\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n", "Calculating Metrics: 100%|██████████| 42/42 [00:00<00:00, 684.44it/s]\n", "\u001b[32m2025-11-19 18:10:36.023\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
6
\n", "
Successful Expectations
6
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:10:35Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T171035.569816Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " a59cbecf6d7368f98a5ca6d1ddef0f7f\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: label, text\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " label\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " text\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 185 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabel\"}, \"id\": \"b104a4b1-8be1-4f06-b78a-9ffff2d58fd0\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\"}, \"meta\": {\"tag\": \"missing_values_columnlabel\"}, \"id\": \"8bb7d949-c66a-44d6-ae51-864d9d2e0798\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"text\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntext\"}, \"id\": \"cb47df8c-0126-4bf6-a1b8-dd03b1c83366\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"text\"}, \"meta\": {\"tag\": \"missing_values_columntext\"}, \"id\": \"d6eab28d-c371-4cf3-9c89-1e3fc80d251e\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"text\", \"min_value\": 185}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"bf7b6f50-9c67-4388-8699-7496419d8c76\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", 
\"column_list\": [\"label\", \"text\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"1ec3ebf8-91db-48ba-a3b0-535fb507d5e6\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n", "\n", "\n", "Processing dataset: nlbse23_train\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:11:52.219\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nlbse23/nlbse23-issue-classification-train.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:11:59.295\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " id labels title body \\\n", "count 1.275881e+06 1275881 1275877 1271200 \n", "unique NaN 4 1183033 1203337 \n", "top NaN bug Need a service that has a counter TBD \n", "freq NaN 670951 1433 5382 \n", "mean 1.137417e+09 NaN NaN NaN \n", "std 2.253828e+08 NaN NaN NaN \n", "min 2.747400e+04 NaN NaN NaN \n", "25% 1.097049e+09 NaN NaN NaN \n", "50% 1.182023e+09 NaN NaN NaN \n", "75% 1.279079e+09 NaN NaN NaN \n", "max 1.393120e+09 NaN NaN NaN \n", "\n", " author_association \n", "count 1275881 \n", "unique 6 \n", "top NONE \n", "freq 539679 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:12:03.929\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 
18:12:03.931\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 1271197\u001b[0m\n", "\u001b[32m2025-11-19 18:12:03.932\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning missing values...\u001b[0m\n", "\u001b[32m2025-11-19 18:14:42.064\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:14:42.066\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 1271197\u001b[0m\n", "\u001b[32m2025-11-19 18:14:42.067\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:14:51.922\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:14:51.925\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 1241467\u001b[0m\n", "\u001b[32m2025-11-19 18:14:51.926\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - 
\u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:21:42.179\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:21:42.186\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 1241467\u001b[0m\n", "\u001b[32m2025-11-19 18:21:42.190\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:21:43.893\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 22\u001b[0m\n", "\u001b[32m2025-11-19 18:21:45.144\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:21:45.149\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 1204191\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:21:45.505\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column id: \u001b[0m\n", "\u001b[32m2025-11-19 18:21:45.850\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column labels: \u001b[0m\n", "\u001b[32m2025-11-19 18:21:46.144\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column title: \u001b[0m\n", "\u001b[32m2025-11-19 18:21:46.607\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column body: \u001b[0m\n", "\u001b[32m2025-11-19 18:21:46.944\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column author_association: \u001b[0m\n", "\u001b[32m2025-11-19 18:21:46.959\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:21:47.041\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:21:47.063\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:21:47.090\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n", "Calculating Metrics: 100%|██████████| 71/71 [00:27<00:00, 2.54it/s] \n", "\u001b[32m2025-11-19 18:22:40.229\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Failed\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
12
\n", "
Successful Expectations
11
\n", "
Unsuccessful Expectations
1
\n", "
Success Percent
≈91.67%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:22:39Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T172147.140184Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " 55ffa326b8409964e06e5288ebc636b0\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: id, labels, title, body, author_association\n", " \n", " \n", " \n", "

49 unexpected values found. ≈0.004069% of 1204191 total rows.\n", "
\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Sampled Unexpected Values\n", " \n", " \n", " \n", " \n", " Count\n", " \n", " \n", " \n", " \n", " body\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
('1049631994', 'CONTRIBUTOR', 'feature', 'Validate Bounty lock reuse with Invitation')
2
The upstream issue to check whether the bounty lock can be set when the invite lock is already set: Joystream/joystream#2824\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", ", The upstream issue to check whether the bounty lock can be set when the invite lock is already set: Joystream/joystream#2824\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", "
\n", "
('1098188764', 'MEMBER', 'question', 'Olympia Balances')
2
Olympia will almost certainly be a fresh chain, and the question has arisen of what to do with balances! It could be a nice time to reset balances of everyone, input?\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", ", Olympia will almost certainly be a fresh chain, and the question has arisen of what to do with balances! It could be a nice time to reset balances of everyone, input?\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", "
\n", "
('1108035469', 'CONTRIBUTOR', 'documentation', 'Giza vs Olympia query node schema: document the changes for the benefit of Atlas and other projects')
2
Create a clear documentation of notable changes between Giza and Olympia query node schema.\n", "This will probably mostly focus on membeships / working groups modules, where the mappings have changed significantly, there may also be some changes in the content mappings after the [final merge]( is done.\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", ", Create a clear documentation of notable changes between Giza and Olympia query node schema.\n", "This will probably mostly focus on membeships / working groups modules, where the mappings have changed significantly, there may also be some changes in the content mappings after the [final merge]( is done.\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", "
\n", "
('1110695546', 'CONTRIBUTOR', 'bug', 'Incorrect popup height in DropDownList with LESS themes')
2
### Bug report\n", "The height of the DropDownList popup is incorrect when using LESS themes.\n", "\n", "Regression introduced with version 2022.1.119.\n", "\n", "### Reproduction of the problem\n", "1. Open this Dojo example - \n", "2. Open the DropDownList\n", "### Current behavior\n", "\n", "The popup height is incorrect. \n", "\n", "### Expected/desired behavior\n", "The popup height should be big enough to fit the list.\n", "\n", "### Environment\n", "\n", "* **Kendo UI version:** 2022.1.119\n", "* **Browser:** [all] \n", "\n", "\n", ", ### Bug report\n", "The height of the DropDownList popup is incorrect when using LESS themes.\n", "\n", "Regression introduced with version 2022.1.119.\n", "\n", "### Reproduction of the problem\n", "1. Open this Dojo example - \n", "2. Open the DropDownList\n", "### Current behavior\n", "\n", "The popup height is incorrect. \n", "\n", "### Expected/desired behavior\n", "The popup height should be big enough to fit the list.\n", "\n", "### Environment\n", "\n", "* **Kendo UI version:** 2022.1.119\n", "* **Browser:** [all] \n", "\n", "\n", "
\n", "
('1119590871', 'OWNER', 'bug', 'entering a repo on front page directs to empty contract page')
2
go to enter a \"open-contracts/fiat-swap\", click 'submit' -> empty page with url \"\n", "\n", "\n", "enter \" into url bar directly -> hit enter -> everything works, go to enter a \"open-contracts/fiat-swap\", click 'submit' -> empty page with url \"\n", "\n", "\n", "enter \" into url bar directly -> hit enter -> everything works
\n", "
('1127603767', 'CONTRIBUTOR', 'bug', 'Windows vscode.tests have multiple failures:')
2
![image](\n", "\n", "Captured from here:\n", ", ![image](\n", "\n", "Captured from here:\n", "
\n", "
('1132358132', 'NONE', 'bug', 'Wrong keyboard shortcut for switching to 2D view in Windows Editor')
2
**Your Godot version:** v3.4.2.stable\n", "\n", "**Issue description:** Keyboard shortcut described here:\n", "\n", "*Head back to the 2D workspace. You can either click the \"2D\" text at the top of the window or press Ctrl + F2 (Alt + 2 on macOS).*\n", "\n", "is not right on my editor under Windows. Ctrl + F1 should be the good one.\n", "\n", "**URL to the documentation page:** \n", ", **Your Godot version:** v3.4.2.stable\n", "\n", "**Issue description:** Keyboard shortcut described here:\n", "\n", "*Head back to the 2D workspace. You can either click the \"2D\" text at the top of the window or press Ctrl + F2 (Alt + 2 on macOS).*\n", "\n", "is not right on my editor under Windows. Ctrl + F1 should be the good one.\n", "\n", "**URL to the documentation page:** \n", "
\n", "
('1146423058', 'NONE', 'bug', 'v3.4.3 completely breaks GmapInfoWindow')
2
Working perfectly on v3.4.2\n", "\n", "On v3.4.3 the slot seems to be broken because my info window completely fails to appear.\n", "\n", "v3.4.2:\n", " \n", "\n", "v3.4.3:\n", " \n", "\n", "Love your package but it's a bit concerning how such regressions are slipping through the cracks... \n", ", Working perfectly on v3.4.2\n", "\n", "On v3.4.3 the slot seems to be broken because my info window completely fails to appear.\n", "\n", "v3.4.2:\n", " \n", "\n", "v3.4.3:\n", " \n", "\n", "Love your package but it's a bit concerning how such regressions are slipping through the cracks... \n", "
\n", "
('1155395539', 'NONE', 'bug', '[BUG] 在Wayland Sway Ozone 上没法打开')
2
**Describe the bug / 描述**\n", "A clear and concise description of what the bug is. / 请简要描述出现的 bug\n", "如题,在Wayland Sway 上没法打开\n", "\n", "**To Reproduce / 复现过程**\n", "\n", "1. 安装Sway\n", "2. 安装Icalingua-Plus-Plus\n", "\n", "\n", "**Expected behavior / 预期的结果**\n", "可以打开\n", "\n", "**System information / 系统信息**\n", "- OS: [e.g. Arch Linux] `Linux arch 5.16.11-zen1-1-zen #1 ZEN SMP PREEMPT Thu, 24 Feb 2022 02:18:22 +0000 x86_64 GNU/Linux`\n", "- Version [e.g. 2.1.4] 2.5.2-1\n", "\n", "**Additional context / 附加信息**\n", "Add any other context about the problem here. / 如果有什么额外信息,可以写在这里\n", "log:\n", "```\n", "libva error: vaGetDriverNameByIndex() failed with unknown libva error, driver_name = (null)\n", "Cannot find an external image viewer\n", " icalingua> (node:84170) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment variable to '0' makes TLS connections and HTTPS requests insecure by disabling certificate verification.\n", "(Use `electron --trace-warnings ...` to show where the warning was created)\n", " icalingua> Aria2 RPC connected\n", " icalingua>\n", "```\n", "\n", "```\n", "--enable-features=UseOzonePlatform\n", "--ozone-platform=wayland\n", "\n", "```, **Describe the bug / 描述**\n", "A clear and concise description of what the bug is. / 请简要描述出现的 bug\n", "如题,在Wayland Sway 上没法打开\n", "\n", "**To Reproduce / 复现过程**\n", "\n", "1. 安装Sway\n", "2. 安装Icalingua-Plus-Plus\n", "\n", "\n", "**Expected behavior / 预期的结果**\n", "可以打开\n", "\n", "**System information / 系统信息**\n", "- OS: [e.g. Arch Linux] `Linux arch 5.16.11-zen1-1-zen #1 ZEN SMP PREEMPT Thu, 24 Feb 2022 02:18:22 +0000 x86_64 GNU/Linux`\n", "- Version [e.g. 2.1.4] 2.5.2-1\n", "\n", "**Additional context / 附加信息**\n", "Add any other context about the problem here. 
/ 如果有什么额外信息,可以写在这里\n", "log:\n", "```\n", "libva error: vaGetDriverNameByIndex() failed with unknown libva error, driver_name = (null)\n", "Cannot find an external image viewer\n", " icalingua> (node:84170) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment variable to '0' makes TLS connections and HTTPS requests insecure by disabling certificate verification.\n", "(Use `electron --trace-warnings ...` to show where the warning was created)\n", " icalingua> Aria2 RPC connected\n", " icalingua>\n", "```\n", "\n", "```\n", "--enable-features=UseOzonePlatform\n", "--ozone-platform=wayland\n", "\n", "```
\n", "
('1210767230', 'CONTRIBUTOR', 'bug', 'TypedPropertyFromAssignsRector: wrongly (?) assigns `null` to properties')
2
# Bug Report\n", "\n", " \n", "\n", "| Subject | Details |\n", "| :------------- | :---------------------------------------------------------------|\n", "| Rector version | 0.12.21 |\n", "\n", " \n", "\n", "## Minimal PHP Code Causing Issue\n", "\n", "\n", "\n", "## Expected Behaviour\n", "\n", "Rector changes the code this way:\n", "\n", "```diff\n", "- @var string The number part of the VAT number */\n", "- private $number;\n", "+ private ?string $number = null;\n", "```\n", "\n", "But I expect it changes it this other way:\n", "\n", "```diff\n", "- @var string The number part of the VAT number */\n", "- private $number;\n", "+ private string $number;\n", "```\n", "\n", "Am I missing something?\n", "\n", "I know the demo is really complex as I had to put in a lot of classes and interfaces.\n", "\n", "For a cleaner view of what's happening, see the PR to update Rector (the one that starts to cause the issue) here:\n", "\n", ", # Bug Report\n", "\n", " \n", "\n", "| Subject | Details |\n", "| :------------- | :---------------------------------------------------------------|\n", "| Rector version | 0.12.21 |\n", "\n", " \n", "\n", "## Minimal PHP Code Causing Issue\n", "\n", "\n", "\n", "## Expected Behaviour\n", "\n", "Rector changes the code this way:\n", "\n", "```diff\n", "- @var string The number part of the VAT number */\n", "- private $number;\n", "+ private ?string $number = null;\n", "```\n", "\n", "But I expect it changes it this other way:\n", "\n", "```diff\n", "- @var string The number part of the VAT number */\n", "- private $number;\n", "+ private string $number;\n", "```\n", "\n", "Am I missing something?\n", "\n", "I know the demo is really complex as I had to put in a lot of classes and interfaces.\n", "\n", "For a cleaner view of what's happening, see the PR to update Rector (the one that starts to cause the issue) here:\n", "\n", "
\n", "
('1212144811', 'MEMBER', 'bug', 'Bug: Nft Buy Now send revenue into channel account, not nft owner')
2
# Problem\n", "The extrinsic `buy_nft`\n", "\n", "* sends funds to built in channel account if present, rather than nft owner account\n", "* sends funds to either member channel owner if account is not present.\n", "* blocks entire purrchase if account is not present and channel is owned by curator group\n", "\n", "see here\n", "\n", "\n", "\n", "# Fix\n", "In all cases, send revenue to controller account of owning member.\n", "\n", "# PRs\n", "We need two separate PRs for this\n", "\n", "* one fixing this for `rhodes`.\n", "* one fixing this for `ephesus` first of all, then pulled into `audit3`.\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", ", # Problem\n", "The extrinsic `buy_nft`\n", "\n", "* sends funds to built in channel account if present, rather than nft owner account\n", "* sends funds to either member channel owner if account is not present.\n", "* blocks entire purrchase if account is not present and channel is owned by curator group\n", "\n", "see here\n", "\n", "\n", "\n", "# Fix\n", "In all cases, send revenue to controller account of owning member.\n", "\n", "# PRs\n", "We need two separate PRs for this\n", "\n", "* one fixing this for `rhodes`.\n", "* one fixing this for `ephesus` first of all, then pulled into `audit3`.\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", "
\n", "
('1214771968', 'CONTRIBUTOR', 'bug', \"Init transactional status doesn't work with Open Auction Id\")
2
It checks for NFT existence for setting the initial auction nonce before nft is created -> bug\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", ", It checks for NFT existence for setting the initial auction nonce before nft is created -> bug\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", "
\n", "
('1305019140', 'NONE', 'bug', 'Filter Control: search mix filters from one table into the other')
2
### Bootstraptable version(s) affected\n", "\n", "1.20.2\n", "\n", "### Description\n", "\n", "If you use 2 tables, trying to filter one table affects the other table, with lots of glitches.\n", "\n", "### Example(s)\n", "\n", "\n", "\n", "### Possible Solutions\n", "\n", "contain the search to a single table, not all tables\n", "\n", "### Additional Context\n", "\n", "_No response_, ### Bootstraptable version(s) affected\n", "\n", "1.20.2\n", "\n", "### Description\n", "\n", "If you use 2 tables, trying to filter one table affects the other table, with lots of glitches.\n", "\n", "### Example(s)\n", "\n", "\n", "\n", "### Possible Solutions\n", "\n", "contain the search to a single table, not all tables\n", "\n", "### Additional Context\n", "\n", "_No response_
\n", "
('1356943328', 'NONE', 'bug', 'Label management . Add the Label window and remove the delete button')
2
新增标签弹窗,去掉删除按钮\n", "![image](\n", "\n", ", 新增标签弹窗,去掉删除按钮\n", "![image](\n", "\n", "
\n", "
('1384724879', 'NONE', 'bug', '[Bug]: Testing from an alt account')
2
\n", "![Author]( ![Issue Number]( ![Status]( ![Locked](\n", " \n", "### Feature Submition Checklist\n", "\n", "- [X] I have read the [Code of Conduct](\n", "- [X] I have checked other issues for similar bug reports\n", "- [X] I have checked that it is not due to a configuration setting on my computer\n", "- [X] I have a working fix for this bug\n", "\n", "### Contact Details\n", "\n", "_No response_\n", "\n", "### What happened?\n", "\n", "A bug happened! and it was weird\n", "\n", "### What browser(s) are you seeing the problem on?\n", "\n", "Chrome\n", "\n", "### What operating system(s) are you seeing the problem on?\n", "\n", "Windows\n", "\n", "### What device(s) are you seeing the problem on?\n", "\n", "Mobile, \n", "![Author]( ![Issue Number]( ![Status]( ![Locked](\n", " \n", "### Feature Submition Checklist\n", "\n", "- [X] I have read the [Code of Conduct](\n", "- [X] I have checked other issues for similar bug reports\n", "- [X] I have checked that it is not due to a configuration setting on my computer\n", "- [X] I have a working fix for this bug\n", "\n", "### Contact Details\n", "\n", "_No response_\n", "\n", "### What happened?\n", "\n", "A bug happened! and it was weird\n", "\n", "### What browser(s) are you seeing the problem on?\n", "\n", "Chrome\n", "\n", "### What operating system(s) are you seeing the problem on?\n", "\n", "Windows\n", "\n", "### What device(s) are you seeing the problem on?\n", "\n", "Mobile
\n", "
('540447531', 'NONE', 'bug', '[MaterialButton] Long button text gets cut off when using maxlines=2 and ellipsize=end')
2
**Description:** When using a MaterialButton with fixed width and fixed number of text lines, I tried to find a way to deal with dynamic button text that could be too long for the button size. When the text is too long for the button the words that don't fit will simply not be displayed by default, thus not indicating that there are words missing. When setting \"ellipsize\" to \"end\" the three dots get displayed, but the beginning of the text will be cut off in some situations.\n", "\n", "**Steps to reproduce:** Check out and run this sample project: [\n", "\n", "Alternatively:\n", "1. Create a new Android project with a single activity.\n", "2. Add material lib and set material theme.\n", "3. Use the following activity layout:\n", "``` \n", " \n", "\n", " \n", "\n", " \n", "```\n", "\n", "**Actual behavior:** \n", "\n", "The first letter of the second text line gets cut off.\n", "\n", "![image](\n", "\n", "\n", "**Expected behavior:** All words of the text are displayed fully.\n", "\n", "**Android API version:** 28\n", "\n", "**Material Library version:** 1.1.0-beta02, 1.2.0-alpha02\n", "\n", "**Device:** Pixel 3a XL\n", ", **Description:** When using a MaterialButton with fixed width and fixed number of text lines, I tried to find a way to deal with dynamic button text that could be too long for the button size. When the text is too long for the button the words that don't fit will simply not be displayed by default, thus not indicating that there are words missing. When setting \"ellipsize\" to \"end\" the three dots get displayed, but the beginning of the text will be cut off in some situations.\n", "\n", "**Steps to reproduce:** Check out and run this sample project: [\n", "\n", "Alternatively:\n", "1. Create a new Android project with a single activity.\n", "2. Add material lib and set material theme.\n", "3. 
Use the following activity layout:\n", "``` \n", " \n", "\n", " \n", "\n", " \n", "```\n", "\n", "**Actual behavior:** \n", "\n", "The first letter of the second text line gets cut off.\n", "\n", "![image](\n", "\n", "\n", "**Expected behavior:** All words of the text are displayed fully.\n", "\n", "**Android API version:** 28\n", "\n", "**Material Library version:** 1.1.0-beta02, 1.2.0-alpha02\n", "\n", "**Device:** Pixel 3a XL\n", "
\n", "
('628971737', 'CONTRIBUTOR', 'bug', 'Pasted table gets incorrect layout after formatting')
2
### Bug report\n", "\n", "Pasting HTML via the viewHtml tool and formatting a sentence with an inline tool causes broken layout. \n", "\n", "### Reproduction of the problem\n", "\n", "\n", "1. Copy the following HTML:\n", "\n", "```html\n", " \n", " \n", " \n", " Spain \n", " Bulgaria \n", " \n", " \n", " United States \n", " UK \n", " \n", " \n", " \n", "```\n", "\n", "2. Use the View HTML tool to insert it to the editor\n", "3. Select United States by using triple click\n", "4. Bold the selection (Ctrl+B)\n", "\n", "### Current behavior\n", "![image](\n", "\n", "\n", "### Expected/desired behavior\n", "The layout of the table to remain the same.\n", "\n", "### Workaround \n", "Using a custom deserialization oprion: \n", "\n", "\n", "\n", "### Environment\n", "\n", "* **Kendo UI version:** all\n", "* **Browser:** [all ] \n", "\n", "\n", ", ### Bug report\n", "\n", "Pasting HTML via the viewHtml tool and formatting a sentence with an inline tool causes broken layout. \n", "\n", "### Reproduction of the problem\n", "\n", "\n", "1. Copy the following HTML:\n", "\n", "```html\n", " \n", " \n", " \n", " Spain \n", " Bulgaria \n", " \n", " \n", " United States \n", " UK \n", " \n", " \n", " \n", "```\n", "\n", "2. Use the View HTML tool to insert it to the editor\n", "3. Select United States by using triple click\n", "4. Bold the selection (Ctrl+B)\n", "\n", "### Current behavior\n", "![image](\n", "\n", "\n", "### Expected/desired behavior\n", "The layout of the table to remain the same.\n", "\n", "### Workaround \n", "Using a custom deserialization oprion: \n", "\n", "\n", "\n", "### Environment\n", "\n", "* **Kendo UI version:** all\n", "* **Browser:** [all ] \n", "\n", "\n", "
\n", "
('697520085', 'NONE', 'bug', '[Catalog App: Bottom App Bar] BottomSheet is Too Light')
2
**Description:** \n", "The BottomSheet (#3b3b3b) is too light (the elevation of BottomSheet is 32dp totally).\n", "\n", "![Screenshot_20210908-145455](\n", "\n", "\n", "**Expected behavior:** The BottomSheet should be #2d2d2d (the elevation of BottomSheet should be 16dp).\n", "\n", "\n", "\n", "**Source code:** \n", "(original)\n", "```\n", " \n", "\n", " \n", " \n", "```\n", "(revised)\n", "```\n", " \n", "\n", " \n", " \n", "```\n", "\n", "**Android API version:** 23\n", "\n", "**Material Library version:** 1.4.0\n", "\n", "**Device:** Nexus 7 (2013), **Description:** \n", "The BottomSheet (#3b3b3b) is too light (the elevation of BottomSheet is 32dp totally).\n", "\n", "![Screenshot_20210908-145455](\n", "\n", "\n", "**Expected behavior:** The BottomSheet should be #2d2d2d (the elevation of BottomSheet should be 16dp).\n", "\n", "\n", "\n", "**Source code:** \n", "(original)\n", "```\n", " \n", "\n", " \n", " \n", "```\n", "(revised)\n", "```\n", " \n", "\n", " \n", " \n", "```\n", "\n", "**Android API version:** 23\n", "\n", "**Material Library version:** 1.4.0\n", "\n", "**Device:** Nexus 7 (2013)
\n", "
('920631408', 'CONTRIBUTOR', 'bug', 'Olympia runtime upgrade proposal - unexpected post-upgrade proposal status')
2
I'm running into some unexpected behavior when running runtime upgrade proposal tests on Olympia branch.\n", "\n", "Steps I've taken:\n", "\n", "1. I created 2 RuntimeUpgradeProposals - one that I expected to be succesfully executed (`proposal2`) and one I expected to be `CanceledByRuntime` later (`proposal1`). _Here I'm assuming that when a proposal is in `Deciding`/`Active` state during the runtime upgrade - it should become `CanceledByRuntime` once the upgrade is executed: [see related code](\n", "1. Because for my tests I'm using `constitutionality===2` for runtime upgrade proposals, I approve both of them and I elect a new council\n", "1. Now the second round of voting begins and both proposals are in `Deciding` state \n", "1. I approve `proposal2`\n", "1. `proposal2` executes succesfully and the runtime is upgraded\n", "1. I would expect `proposal1` to now be `CanceledByRuntime`, but no related event is emitted. The query node still sees it as beeing in `Deciding` state, but `api.query.proposalsEngine.proposals()` returns empty result.\n", "\n", "Query node query:\n", "\n", "```graphql\n", "{\n", " proposals(where:{ id_in: [\"1\", \"2\"] }) {\n", " id\n", " details {\n", " __typename\n", " }\n", " status {\n", " __typename\n", " ... 
on ProposalStatusExecutionFailed {\n", " errorMessage\n", " }\n", " }\n", " statusSetAtBlock\n", " proposalStatusUpdates {\n", " id\n", " newStatus {\n", " __typename\n", " }\n", " }\n", " }\n", "}\n", "```\n", "\n", "Query node result:\n", "\n", "```graphql\n", "{\n", " \"data\": {\n", " \"proposals\": [\n", " {\n", " \"id\": \"1\",\n", " \"details\": {\n", " \"__typename\": \"RuntimeUpgradeProposalDetails\"\n", " },\n", " \"status\": {\n", " \"__typename\": \"ProposalStatusDeciding\"\n", " },\n", " \"statusSetAtBlock\": 216,\n", " \"proposalStatusUpdates\": [\n", " {\n", " \"id\": \"OLYMPIA-136-1\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusDormant\"\n", " }\n", " },\n", " {\n", " \"id\": \"OLYMPIA-216-5\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusDeciding\"\n", " }\n", " }\n", " ]\n", " },\n", " {\n", " \"id\": \"2\",\n", " \"details\": {\n", " \"__typename\": \"RuntimeUpgradeProposalDetails\"\n", " },\n", " \"status\": {\n", " \"__typename\": \"ProposalStatusExecuted\"\n", " },\n", " \"statusSetAtBlock\": 239,\n", " \"proposalStatusUpdates\": [\n", " {\n", " \"id\": \"OLYMPIA-175-2\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusDormant\"\n", " }\n", " },\n", " {\n", " \"id\": \"OLYMPIA-216-4\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusDeciding\"\n", " }\n", " },\n", " {\n", " \"id\": \"OLYMPIA-219-1\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusGracing\"\n", " }\n", " }\n", " ]\n", " }\n", " ]\n", " }\n", "}\n", "```\n", "\n", "Chain state after runtime upgrade:\n", "![proposals-s1](\n", "\n", "Events emitted in block 239 (during runtime upgrade):\n", "![proposals-s2](\n", "\n", "_Notice no events related to updating the status of `proposal1`_\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", ", I'm running into some unexpected behavior when running runtime upgrade proposal tests on Olympia branch.\n", "\n", "Steps I've taken:\n", 
"\n", "1. I created 2 RuntimeUpgradeProposals - one that I expected to be succesfully executed (`proposal2`) and one I expected to be `CanceledByRuntime` later (`proposal1`). _Here I'm assuming that when a proposal is in `Deciding`/`Active` state during the runtime upgrade - it should become `CanceledByRuntime` once the upgrade is executed: [see related code](\n", "1. Because for my tests I'm using `constitutionality===2` for runtime upgrade proposals, I approve both of them and I elect a new council\n", "1. Now the second round of voting begins and both proposals are in `Deciding` state \n", "1. I approve `proposal2`\n", "1. `proposal2` executes succesfully and the runtime is upgraded\n", "1. I would expect `proposal1` to now be `CanceledByRuntime`, but no related event is emitted. The query node still sees it as beeing in `Deciding` state, but `api.query.proposalsEngine.proposals()` returns empty result.\n", "\n", "Query node query:\n", "\n", "```graphql\n", "{\n", " proposals(where:{ id_in: [\"1\", \"2\"] }) {\n", " id\n", " details {\n", " __typename\n", " }\n", " status {\n", " __typename\n", " ... 
on ProposalStatusExecutionFailed {\n", " errorMessage\n", " }\n", " }\n", " statusSetAtBlock\n", " proposalStatusUpdates {\n", " id\n", " newStatus {\n", " __typename\n", " }\n", " }\n", " }\n", "}\n", "```\n", "\n", "Query node result:\n", "\n", "```graphql\n", "{\n", " \"data\": {\n", " \"proposals\": [\n", " {\n", " \"id\": \"1\",\n", " \"details\": {\n", " \"__typename\": \"RuntimeUpgradeProposalDetails\"\n", " },\n", " \"status\": {\n", " \"__typename\": \"ProposalStatusDeciding\"\n", " },\n", " \"statusSetAtBlock\": 216,\n", " \"proposalStatusUpdates\": [\n", " {\n", " \"id\": \"OLYMPIA-136-1\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusDormant\"\n", " }\n", " },\n", " {\n", " \"id\": \"OLYMPIA-216-5\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusDeciding\"\n", " }\n", " }\n", " ]\n", " },\n", " {\n", " \"id\": \"2\",\n", " \"details\": {\n", " \"__typename\": \"RuntimeUpgradeProposalDetails\"\n", " },\n", " \"status\": {\n", " \"__typename\": \"ProposalStatusExecuted\"\n", " },\n", " \"statusSetAtBlock\": 239,\n", " \"proposalStatusUpdates\": [\n", " {\n", " \"id\": \"OLYMPIA-175-2\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusDormant\"\n", " }\n", " },\n", " {\n", " \"id\": \"OLYMPIA-216-4\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusDeciding\"\n", " }\n", " },\n", " {\n", " \"id\": \"OLYMPIA-219-1\",\n", " \"newStatus\": {\n", " \"__typename\": \"ProposalStatusGracing\"\n", " }\n", " }\n", " ]\n", " }\n", " ]\n", " }\n", "}\n", "```\n", "\n", "Chain state after runtime upgrade:\n", "![proposals-s1](\n", "\n", "Events emitted in block 239 (during runtime upgrade):\n", "![proposals-s2](\n", "\n", "_Notice no events related to updating the status of `proposal1`_\n", "\n", "\n", "\n", "┆Issue is synchronized with this [Asana task]( by [Unito](\n", "
\n", "
('968495989', 'OWNER', 'bug', 'CX Second_Order_SQL_Injection @ src/main/java/org/cysecurity/cspf/jvl/controller/LoginValidator.java [master]')
7
**Second_Order_SQL_Injection** issue exists @ **src/main/java/org/cysecurity/cspf/jvl/controller/LoginValidator.java** in branch **master**\n", "\n", "*The application's rs=stmt.executeQuery method executes an SQL query with BinaryExpr, at line 14 of src\\main\\webapp\\vulnerability\\Messages.jsp. The application constructs this SQL query by embedding an untrusted string into the query without proper sanitization. The concatenated string is submitted to the database, where it is parsed and executed accordingly.\n", "The attacker may be able to write arbitrary data to the database, which is then retrieved by the application with rs in processRequest method at line 52 of src\\main\\java\\org\\cysecurity\\cspf\\jvl\\controller\\LoginValidator.java. This data then flows through the code, until it is used directly in the SQL query without sanitization, and then submitted to the database server for execution.\n", "This may enable a Second-Order SQL Injection attack.*\n", "\n", "Severity: High\n", "\n", "CWE:89\n", "\n", "[Vulnerability details and guidance](\n", "\n", "[Checkmarx](\n", "\n", "[Training](\n", "[Recommended Fix](\n", "\n", "Lines: [52]( \n", "\n", "---\n", "[Code (Line #52):](\n", "```\n", " rs=stmt.executeQuery(\"select * from users where username='\"+user+\"' and password='\"+pass+\"'\");\n", "```\n", "---\n", ", **Second_Order_SQL_Injection** issue exists @ **src/main/java/org/cysecurity/cspf/jvl/controller/LoginValidator.java** in branch **master**\n", "\n", "*The application's rs=stmt.executeQuery method executes an SQL query with BinaryExpr, at line 14 of src\\main\\webapp\\vulnerability\\Messages.jsp. The application constructs this SQL query by embedding an untrusted string into the query without proper sanitization. 
The concatenated string is submitted to the database, where it is parsed and executed accordingly.\n", "The attacker may be able to write arbitrary data to the database, which is then retrieved by the application with rs in processRequest method at line 52 of src\\main\\java\\org\\cysecurity\\cspf\\jvl\\controller\\LoginValidator.java. This data then flows through the code, until it is used directly in the SQL query without sanitization, and then submitted to the database server for execution.\n", "This may enable a Second-Order SQL Injection attack.*\n", "\n", "Severity: High\n", "\n", "CWE:89\n", "\n", "[Vulnerability details and guidance](\n", "\n", "[Checkmarx](\n", "\n", "[Training](\n", "[Recommended Fix](\n", "\n", "Lines: [52]( \n", "\n", "---\n", "[Code (Line #52):](\n", "```\n", " rs=stmt.executeQuery(\"select * from users where username='\"+user+\"' and password='\"+pass+\"'\");\n", "```\n", "---\n", ", **Second_Order_SQL_Injection** issue exists @ **src/main/java/org/cysecurity/cspf/jvl/controller/LoginValidator.java** in branch **master**\n", "\n", "*The application's rs=stmt.executeQuery method executes an SQL query with BinaryExpr, at line 14 of src\\main\\webapp\\vulnerability\\Messages.jsp. The application constructs this SQL query by embedding an untrusted string into the query without proper sanitization. The concatenated string is submitted to the database, where it is parsed and executed accordingly.\n", "The attacker may be able to write arbitrary data to the database, which is then retrieved by the application with rs in processRequest method at line 52 of src\\main\\java\\org\\cysecurity\\cspf\\jvl\\controller\\LoginValidator.java. 
This data then flows through the code, until it is used directly in the SQL query without sanitization, and then submitted to the database server for execution.\n", "This may enable a Second-Order SQL Injection attack.*\n", "\n", "Severity: High\n", "\n", "CWE:89\n", "\n", "[Vulnerability details and guidance](\n", "\n", "[Checkmarx](\n", "\n", "[Training](\n", "[Recommended Fix](\n", "\n", "Lines: [52]( \n", "\n", "---\n", "[Code (Line #52):](\n", "```\n", " rs=stmt.executeQuery(\"select * from users where username='\"+user+\"' and password='\"+pass+\"'\");\n", "```\n", "---\n", ", **Second_Order_SQL_Injection** issue exists @ **src/main/java/org/cysecurity/cspf/jvl/controller/LoginValidator.java** in branch **master**\n", "\n", "*The application's rs=stmt.executeQuery method executes an SQL query with BinaryExpr, at line 14 of src\\main\\webapp\\vulnerability\\Messages.jsp. The application constructs this SQL query by embedding an untrusted string into the query without proper sanitization. The concatenated string is submitted to the database, where it is parsed and executed accordingly.\n", "The attacker may be able to write arbitrary data to the database, which is then retrieved by the application with rs in processRequest method at line 52 of src\\main\\java\\org\\cysecurity\\cspf\\jvl\\controller\\LoginValidator.java. 
This data then flows through the code, until it is used directly in the SQL query without sanitization, and then submitted to the database server for execution.\n", "This may enable a Second-Order SQL Injection attack.*\n", "\n", "Severity: High\n", "\n", "CWE:89\n", "\n", "[Vulnerability details and guidance](\n", "\n", "[Checkmarx](\n", "\n", "[Training](\n", "[Recommended Fix](\n", "\n", "Lines: [52]( \n", "\n", "---\n", "[Code (Line #52):](\n", "```\n", " rs=stmt.executeQuery(\"select * from users where username='\"+user+\"' and password='\"+pass+\"'\");\n", "```\n", "---\n", ", **Second_Order_SQL_Injection** issue exists @ **src/main/java/org/cysecurity/cspf/jvl/controller/LoginValidator.java** in branch **master**\n", "\n", "*The application's rs=stmt.executeQuery method executes an SQL query with BinaryExpr, at line 14 of src\\main\\webapp\\vulnerability\\Messages.jsp. The application constructs this SQL query by embedding an untrusted string into the query without proper sanitization. The concatenated string is submitted to the database, where it is parsed and executed accordingly.\n", "The attacker may be able to write arbitrary data to the database, which is then retrieved by the application with rs in processRequest method at line 52 of src\\main\\java\\org\\cysecurity\\cspf\\jvl\\controller\\LoginValidator.java. 
This data then flows through the code, until it is used directly in the SQL query without sanitization, and then submitted to the database server for execution.\n", "This may enable a Second-Order SQL Injection attack.*\n", "\n", "Severity: High\n", "\n", "CWE:89\n", "\n", "[Vulnerability details and guidance](\n", "\n", "[Checkmarx](\n", "\n", "[Training](null)\n", "[Recommended Fix](\n", "\n", "Lines: [52]( \n", "\n", "---\n", "[Code (Line #52):](\n", "```\n", " rs=stmt.executeQuery(\"select * from users where username='\"+user+\"' and password='\"+pass+\"'\");\n", "```\n", "---\n", ", **Second_Order_SQL_Injection** issue exists @ **src/main/java/org/cysecurity/cspf/jvl/controller/LoginValidator.java** in branch **master**\n", "\n", "*The application's rs=stmt.executeQuery method executes an SQL query with BinaryExpr, at line 14 of src\\main\\webapp\\vulnerability\\Messages.jsp. The application constructs this SQL query by embedding an untrusted string into the query without proper sanitization. The concatenated string is submitted to the database, where it is parsed and executed accordingly.\n", "The attacker may be able to write arbitrary data to the database, which is then retrieved by the application with rs in processRequest method at line 52 of src\\main\\java\\org\\cysecurity\\cspf\\jvl\\controller\\LoginValidator.java. 
This data then flows through the code, until it is used directly in the SQL query without sanitization, and then submitted to the database server for execution.\n", "This may enable a Second-Order SQL Injection attack.*\n", "\n", "Severity: High\n", "\n", "CWE:89\n", "\n", "[Vulnerability details and guidance](\n", "\n", "[Checkmarx](\n", "\n", "[Training](\n", "[Recommended Fix](\n", "\n", "Lines: [52]( \n", "\n", "---\n", "[Code (Line #52):](\n", "```\n", " rs=stmt.executeQuery(\"select * from users where username='\"+user+\"' and password='\"+pass+\"'\");\n", "```\n", "---\n", ", **Second_Order_SQL_Injection** issue exists @ **src/main/java/org/cysecurity/cspf/jvl/controller/LoginValidator.java** in branch **master**\n", "\n", "*The application's rs=stmt.executeQuery method executes an SQL query with BinaryExpr, at line 14 of src\\main\\webapp\\vulnerability\\Messages.jsp. The application constructs this SQL query by embedding an untrusted string into the query without proper sanitization. The concatenated string is submitted to the database, where it is parsed and executed accordingly.\n", "The attacker may be able to write arbitrary data to the database, which is then retrieved by the application with rs in processRequest method at line 52 of src\\main\\java\\org\\cysecurity\\cspf\\jvl\\controller\\LoginValidator.java. This data then flows through the code, until it is used directly in the SQL query without sanitization, and then submitted to the database server for execution.\n", "This may enable a Second-Order SQL Injection attack.*\n", "\n", "Severity: High\n", "\n", "CWE:89\n", "\n", "[Vulnerability details and guidance](\n", "\n", "[Checkmarx](\n", "\n", "[Training](null)\n", "[Recommended Fix](\n", "\n", "Lines: [52]( \n", "\n", "---\n", "[Code (Line #52):](\n", "```\n", " rs=stmt.executeQuery(\"select * from users where username='\"+user+\"' and password='\"+pass+\"'\");\n", "```\n", "---\n", "
\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " To retrieve all unexpected values...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " df.filter(items=[35403, 50709, 167687, 177591, 178138, 210435, 237639, 240531, 291007, 360972, 417510, 423674, 498265, 501880, 542566, 554930, 582698, 583966, 607406, 608890, 610750, 615321, 625668, 659016, 687533, 701127, 707444, 777008, 782498, 794490, 809859, 840428, 881341, 943987, 954620, 955829, 981850, 1008152, 1009431, 1033146, 1043532, 1070842, 1073189, 1085196, 1102128, 1114268, 1123125, 1227152, 1261500], axis=0)\n", " \n", " \n", " \n", "
\n", "
≈0.0040691% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " author_association\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " body\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 22 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " id\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type int.\n", " \n", "
int64
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " labels\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " title\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"id\", \"type_\": \"int\"}, \"meta\": {\"tag\": \"type_integrity_columnid\"}, \"id\": \"07764237-f20d-4e4c-b941-20562e3cf480\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': None,\n", " 'percent_of_failed_rows': None,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"id\"}, \"meta\": {\"tag\": \"missing_values_columnid\"}, \"id\": \"351f40f4-a083-4cb4-887f-86acb2d9cf89\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"labels\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabels\"}, \"id\": \"7c5524f4-a93e-48a6-8a04-d7870830722f\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"labels\"}, \"meta\": {\"tag\": \"missing_values_columnlabels\"}, \"id\": \"34b201c8-37cb-474e-aedb-5d80c3639d39\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntitle\"}, \"id\": \"58b811ac-d55c-4b8e-861b-05406bdaffb9\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": 
\"title\"}, \"meta\": {\"tag\": \"missing_values_columntitle\"}, \"id\": \"477559be-1e0d-4e05-8457-b0626a59b8bb\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnbody\"}, \"id\": \"3b81f0ec-1264-4b52-8754-090f15f7e566\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\"}, \"meta\": {\"tag\": \"missing_values_columnbody\"}, \"id\": \"cc4cb4e0-09b3-49b2-872f-21eac8e3b60f\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"min_value\": 22}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"2be6d6dc-181a-419d-856c-c68bae95c187\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"author_association\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnauthor_association\"}, \"id\": \"fab92ebe-fe23-4c5c-8076-508ef9374d34\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"author_association\"}, \"meta\": {\"tag\": \"missing_values_columnauthor_association\"}, \"id\": 
\"d68abee3-b764-4b2c-8fc0-2084574c0650\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column_list\": [\"id\", \"labels\", \"title\", \"body\", \"author_association\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"5812759a-eeff-4e40-8469-582e03f2299f\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 49,\n", " 'percent_of_failed_rows': 0.004069121925010235,\n", " 'success': False}]\n", "\n", "\n", "Processing dataset: nlbse24_train\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:22:48.492\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nlbse24/issues_train.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.536\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.561\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.562\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 1500\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.562\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - 
\u001b[1mCleaning missing values...\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " repo created_at label \\\n", "count 1500 1500 1500 \n", "unique 5 1500 3 \n", "top facebook/react 2022-01-11 16:30:53 bug \n", "freq 300 1 500 \n", "\n", " title \\\n", "count 1500 \n", "unique 1481 \n", "top [DevTools Bug] Cannot add node \"1\" because a n... \n", "freq 12 \n", "\n", " body \n", "count 1500 \n", "unique 1492 \n", "top Please go to Stack Overflow for help and suppo... \n", "freq 4 \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:22:48.835\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.837\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 1500\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.838\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.861\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 1500\u001b[0m\n", "\u001b[32m2025-11-19 18:22:48.863\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.961\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.961\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 1500\u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.961\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.972\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 76\u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.978\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.980\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 1455\u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.982\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent 
type in column repo: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.991\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column created_at: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:49.998\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column label: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:50.011\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column title: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:50.021\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column body: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:50.028\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:50.084\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:50.096\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 
18:22:50.110\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Calculating Metrics: 100%|██████████| 78/78 [00:00<00:00, 313.09it/s]\n", "\u001b[32m2025-11-19 18:22:51.109\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.259\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nasa/cfs_test.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.279\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.279\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.289\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 545\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.291\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning missing values...\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
12
\n", "
Successful Expectations
12
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:22:50Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T172250.138923Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " 52b37f2d298546b0d97395133f616b6b\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: repo, created_at, label, title, body\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " body\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 76 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " created_at\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " label\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " repo\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " title\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"repo\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnrepo\"}, \"id\": \"3e90183a-34c3-481e-a08d-2665189e8a02\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"repo\"}, \"meta\": {\"tag\": \"missing_values_columnrepo\"}, \"id\": \"c55512ed-bd02-49dd-90fc-76b8f8182230\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"created_at\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columncreated_at\"}, \"id\": \"62aec55b-b6f4-4b71-8f28-85c9266c9ac8\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"created_at\"}, \"meta\": {\"tag\": \"missing_values_columncreated_at\"}, \"id\": \"c137bf50-8baa-42aa-8953-942494aa0796\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabel\"}, \"id\": \"7452ed64-1092-42ef-b5ce-1baf01b85d35\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": 
\"df-df_asset\", \"column\": \"label\"}, \"meta\": {\"tag\": \"missing_values_columnlabel\"}, \"id\": \"2f7a6a0e-7eb6-4340-b55d-d44f7cd19ea8\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntitle\"}, \"id\": \"cfb3c03c-7bc7-4097-9fbe-9b95d1896bae\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\"}, \"meta\": {\"tag\": \"missing_values_columntitle\"}, \"id\": \"707a8679-accb-48b0-a43b-07beaa456a48\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnbody\"}, \"id\": \"81831b20-7cce-490c-9f95-745e0cbe0f91\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\"}, \"meta\": {\"tag\": \"missing_values_columnbody\"}, \"id\": \"4c5ab0e5-5192-4b22-902e-51abfb3bcd36\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"min_value\": 76}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"49465a43-6af9-4641-ab94-92591e894763\", 
\"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column_list\": [\"repo\", \"created_at\", \"label\", \"title\", \"body\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"2a338473-1390-4da8-977b-897a7f09b44f\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n", "\n", "\n", "Processing dataset: nasa_cfs_test\n", " title \\\n", "count 545 \n", "unique 534 \n", "top Static analysis issues relative to flight code \n", "freq 3 \n", "\n", " body label \\\n", "count 545 545 \n", "unique 532 2 \n", "top Handful of static analysis issues in the \"red\"... non-bug \n", "freq 3 380 \n", "\n", " url \n", "count 545 \n", "unique 545 \n", "top https://github.com/nasa/to_lab/issues/2 \n", "freq 1 \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:22:51.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 545\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.403\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - 
\u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.403\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 545\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.403\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.619\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.619\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 545\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.625\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.630\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 102\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.630\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.630\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 528\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.630\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column title: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.641\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column body: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.650\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column label: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.659\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column url: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.668\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.719\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.731\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:51.756\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Calculating Metrics: 100%|██████████| 66/66 [00:00<00:00, 640.52it/s]\n", "\u001b[32m2025-11-19 18:22:52.469\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
10
\n", "
Successful Expectations
10
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:22:52Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T172251.781440Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " 2f0caf8a272c5bf34f2c91cd4855e1ed\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: title, body, label, url\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " body\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 102 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " label\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " title\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " url\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntitle\"}, \"id\": \"2c99d059-d550-4e92-8157-283e726d773d\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\"}, \"meta\": {\"tag\": \"missing_values_columntitle\"}, \"id\": \"f4e7af83-8bec-4723-b844-0ad53367d844\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnbody\"}, \"id\": \"4c588005-bd28-426a-9842-7b229f515cef\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\"}, \"meta\": {\"tag\": \"missing_values_columnbody\"}, \"id\": \"23fd7b5c-7421-4b5f-a088-7a4f448b8b7c\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"min_value\": 102}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"db7ac254-eff0-40c9-80f3-f753a3326ebe\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": 
\"label\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabel\"}, \"id\": \"cca2493d-fe29-4d21-b1d0-7a512d50f717\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\"}, \"meta\": {\"tag\": \"missing_values_columnlabel\"}, \"id\": \"8c3807a6-3f1d-4361-960a-6c0b936144a2\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"url\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnurl\"}, \"id\": \"f624199e-a352-4411-a2f6-5eb059da7f5c\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"url\"}, \"meta\": {\"tag\": \"missing_values_columnurl\"}, \"id\": \"67994ebe-0f93-471a-9e63-f1b5741b058c\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column_list\": [\"title\", \"body\", \"label\", \"url\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"69150d8a-7c43-4f90-ade6-66eabf1836da\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n", "\n", "\n", "Processing dataset: nasa_fprime_test\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:22:52.659\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nasa/fprime_test.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.691\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.702\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.703\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 150\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.709\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning missing values...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.755\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.759\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 150\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.761\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - 
\u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.764\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.770\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 150\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.839\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.839\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 150\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.843\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.850\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 128\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.853\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.855\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 145\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column title: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.865\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column body: \u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " title \\\n", "count 151 \n", "unique 151 \n", "top Graviation compass system ref component \n", "freq 1 \n", "\n", " body label \\\n", "count 150 151 \n", "unique 149 2 \n", "top | | |\\r\\n|:---|:---|\\r\\n|**_F´ Version_**| |\\r... 
non-bug \n", "freq 2 76 \n", "\n", " url \n", "count 151 \n", "unique 151 \n", "top https://github.com/nasa/fprime/issues/2118 \n", "freq 1 \n", "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:22:52.882\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column label: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.898\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column url: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.909\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.965\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:52.980\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.001\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n", "Calculating Metrics: 100%|██████████| 66/66 [00:00<00:00, 738.51it/s]\n", "\u001b[32m2025-11-19 18:22:53.679\u001b[0m | 
\u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.769\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nasa/nasa_test_sample.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.779\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.779\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.789\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 10\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.789\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning missing values...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.798\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.799\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - 
\u001b[1mNumber of samples after cleaning missing values: 10\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.801\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.803\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.803\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 10\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.809\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.815\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.816\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 10\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.817\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.823\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 203\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.825\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.829\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 9\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.829\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column label: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.836\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column text: \u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.842\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.856\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great 
Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
10
\n", "
Successful Expectations
10
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:22:53Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T172253.024550Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " 0700ca9bcb8de157ccb93b1ce8dd288f\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: title, body, label, url\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " body\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 128 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " label\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " title\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " url\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntitle\"}, \"id\": \"055f75ba-b863-47ea-9f7b-97afcdfae968\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\"}, \"meta\": {\"tag\": \"missing_values_columntitle\"}, \"id\": \"a4a44137-46a7-4300-9eee-fba38c4b8960\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnbody\"}, \"id\": \"60afee05-7a59-4c6e-b877-eb9b504a4d2e\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\"}, \"meta\": {\"tag\": \"missing_values_columnbody\"}, \"id\": \"d7b1185b-4058-4433-b446-7e5e3ca8ee4d\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"min_value\": 128}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"249b2566-6bd0-4245-927f-2d3a4df5a34c\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": 
\"label\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabel\"}, \"id\": \"274dabc3-5a64-45c5-8a5c-b612e76158b1\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\"}, \"meta\": {\"tag\": \"missing_values_columnlabel\"}, \"id\": \"2eb87b17-f5fd-47e0-986f-98202803e977\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"url\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnurl\"}, \"id\": \"bb55ba50-d5b3-4995-8433-fd4d9b78508f\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"url\"}, \"meta\": {\"tag\": \"missing_values_columnurl\"}, \"id\": \"48185fc4-0113-4d8d-82bb-eefe91528458\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column_list\": [\"title\", \"body\", \"label\", \"url\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"7c6a4bdb-6a0f-48c1-b344-49aa6ab1cabc\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n", "\n", "\n", "Processing dataset: nasa_test\n", " label text\n", "count 10 10\n", "unique 4 10\n", "top feature Fix #297, CCSDS Command Secondary Header Endia...\n", "freq 5 1\n", "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "\u001b[32m2025-11-19 18:22:53.868\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:22:53.882\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n", "Calculating Metrics: 100%|██████████| 42/42 [00:00<00:00, 658.95it/s]\n", "\u001b[32m2025-11-19 18:22:54.450\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
6
\n", "
Successful Expectations
6
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:22:54Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T172253.898537Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " a1d4a49e9f85970a63f2a9e9da44a810\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: label, text\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " label\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " text\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 203 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabel\"}, \"id\": \"79aebfa9-ea57-45c1-b2ad-ecbe9126571d\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\"}, \"meta\": {\"tag\": \"missing_values_columnlabel\"}, \"id\": \"3fda0f6c-2d77-4a0c-9c1a-dbeed2b654c9\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"text\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntext\"}, \"id\": \"bad0a634-bddf-40cf-8b42-89c89e073dde\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"text\"}, \"meta\": {\"tag\": \"missing_values_columntext\"}, \"id\": \"064f04c6-9046-44ad-9db5-c0ae525f35b0\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"text\", \"min_value\": 203}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"ed7cb6f0-b354-4c8e-b0f9-6793aac0e928\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", 
\"column_list\": [\"label\", \"text\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"69b0606f-b2fa-42df-b4b3-1a7f3f7c8081\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n", "\n", "\n", "Processing dataset: nlbse23_test\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:23:07.779\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nlbse23/nlbse23-issue-classification-test.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:23:08.507\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " id labels title body \\\n", "count 1.423200e+05 142320 142320 141854 \n", "unique NaN 4 137990 138488 \n", "top NaN bug Need a service that has a counter TBD \n", "freq NaN 74781 159 649 \n", "mean 1.137254e+09 NaN NaN NaN \n", "std 2.256949e+08 NaN NaN NaN \n", "min 4.838050e+05 NaN NaN NaN \n", "25% 1.096958e+09 NaN NaN NaN \n", "50% 1.181684e+09 NaN NaN NaN \n", "75% 1.278590e+09 NaN NaN NaN \n", "max 1.393073e+09 NaN NaN NaN \n", "\n", " author_association \n", "count 142320 \n", "unique 6 \n", "top NONE \n", "freq 59792 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:23:09.224\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:23:09.228\u001b[0m | 
\u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 141854\u001b[0m\n", "\u001b[32m2025-11-19 18:23:09.229\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning missing values...\u001b[0m\n", "\u001b[32m2025-11-19 18:23:34.000\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:23:34.002\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 141854\u001b[0m\n", "\u001b[32m2025-11-19 18:23:34.003\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:23:35.094\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:23:35.096\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 141356\u001b[0m\n", "\u001b[32m2025-11-19 18:23:35.097\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning 
bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:24:19.938\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:24:19.939\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 141356\u001b[0m\n", "\u001b[32m2025-11-19 18:24:19.940\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.118\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 22\u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.295\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.297\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 137027\u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.334\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column id: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.368\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column labels: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.409\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column title: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.462\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column body: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.492\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column author_association: \u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:24:20.499\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.540\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.552\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - 
\u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:24:20.562\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n", "Calculating Metrics: 100%|██████████| 71/71 [00:03<00:00, 18.74it/s]\n", "\u001b[32m2025-11-19 18:24:26.358\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
12
\n", "
Successful Expectations
12
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:24:26Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T172420.582806Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " 33e436de3a6420bed3c3d61c69b2b205\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: id, labels, title, body, author_association\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " author_association\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " body\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 22 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " id\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type int.\n", " \n", "
int64
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " labels\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " title\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "

Stay current on everything GX with our newsletter Subscribe

\n", "
\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"id\", \"type_\": \"int\"}, \"meta\": {\"tag\": \"type_integrity_columnid\"}, \"id\": \"b621f04a-7d4b-4e35-bf95-9ffca612856b\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': None,\n", " 'percent_of_failed_rows': None,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"id\"}, \"meta\": {\"tag\": \"missing_values_columnid\"}, \"id\": \"ea6aa81f-6898-4dd2-8da5-bfdcb4349124\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"labels\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabels\"}, \"id\": \"36c18e06-a65f-4032-bc51-bb5cb7ac1cff\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"labels\"}, \"meta\": {\"tag\": \"missing_values_columnlabels\"}, \"id\": \"12133cab-40e2-4694-8dd5-c24f319547f9\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntitle\"}, \"id\": \"f1243483-8efa-40c0-bd96-6f43c2ef21a5\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": 
\"title\"}, \"meta\": {\"tag\": \"missing_values_columntitle\"}, \"id\": \"5be47b66-de4e-46cf-9c40-f991655d2940\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnbody\"}, \"id\": \"835992dc-28d1-4d73-a0de-7d56ff3d762e\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\"}, \"meta\": {\"tag\": \"missing_values_columnbody\"}, \"id\": \"1db1ef11-1928-4739-96c9-5726e247414e\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"min_value\": 22}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": \"2d31c357-49b2-47ce-9092-79eba31af638\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"author_association\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnauthor_association\"}, \"id\": \"40fefb96-cdd8-4b76-b8cb-a8895a3253d2\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"author_association\"}, \"meta\": {\"tag\": \"missing_values_columnauthor_association\"}, \"id\": 
\"e0700e63-8cab-4fb2-84f8-18fdd043620b\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column_list\": [\"id\", \"labels\", \"title\", \"body\", \"author_association\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"8ef75cb3-99e7-4c64-bd51-a586cdba87ec\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n", "\n", "\n", "Processing dataset: nlbse24_test\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:24:26.978\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mload_dataset\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mDataset loaded: issue-report-classification/nlbse24/issues_test.csv\u001b[0m\n", "\u001b[32m2025-11-19 18:24:26.998\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m313\u001b[0m - \u001b[1mSolving columns integrity issues...\u001b[0m\n", "\u001b[32m2025-11-19 18:24:27.021\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m323\u001b[0m - \u001b[1mColumns integrity issues solved!\u001b[0m\n", "\u001b[32m2025-11-19 18:24:27.023\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_columns_integrity\u001b[0m:\u001b[36m324\u001b[0m - \u001b[1mNumber of samples after cleaning columns integrity: 1498\u001b[0m\n", "\u001b[32m2025-11-19 18:24:27.024\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mCleaning 
missing values...\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " repo created_at label \\\n", "count 1500 1500 1500 \n", "unique 5 1500 3 \n", "top facebook/react 2022-01-06 11:01:32 bug \n", "freq 300 1 500 \n", "\n", " title \\\n", "count 1500 \n", "unique 1490 \n", "top [DevTools Bug] Cannot add node \"1\" because a n... \n", "freq 4 \n", "\n", " body \n", "count 1498 \n", "unique 1488 \n", "top Please go to Stack Overflow for help and suppo... \n", "freq 4 \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-19 18:24:27.285\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m340\u001b[0m - \u001b[1mMissing values cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:24:27.287\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_missing_values\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mNumber of samples after cleaning missing values: 1498\u001b[0m\n", "\u001b[32m2025-11-19 18:24:27.288\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mCleaning duplicates...\u001b[0m\n", "\u001b[32m2025-11-19 18:24:27.321\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mDuplicates cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:24:27.322\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_duplicates\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of samples after cleaning duplicates: 1498\u001b[0m\n", "\u001b[32m2025-11-19 18:24:27.324\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m383\u001b[0m - \u001b[1mCleaning bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m392\u001b[0m - \u001b[1mBodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.241\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_bodies\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mNumber of samples after cleaning bodies: 1498\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.242\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mCleaning unmeaningful bodies...\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.248\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__get_unmeaningful_body_length\u001b[0m:\u001b[36m85\u001b[0m - \u001b[1mMaximum body length to be considered unmeaningful: 83\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.254\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mUnmeaningful bodies cleaned!\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.257\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mclean_unmeaningful_bodies\u001b[0m:\u001b[36m374\u001b[0m - \u001b[1mNumber of samples after cleaning unmeaningful bodies: 1453\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.260\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent 
type in column repo: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.265\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column created_at: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.273\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column label: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.282\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column title: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.292\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1mMost frequent type in column body: \u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.300\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_columns_type_integrity_expectation\u001b[0m:\u001b[36m180\u001b[0m - \u001b[1mColumns type integrity checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.353\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_missing_values_expectation\u001b[0m:\u001b[36m191\u001b[0m - \u001b[1mMissing values checks set\u001b[0m\n", "\u001b[32m2025-11-19 18:24:28.371\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_duplicates_expectation\u001b[0m:\u001b[36m204\u001b[0m - \u001b[1mDuplicates checks set\u001b[0m\n", "\u001b[32m2025-11-19 
18:24:28.385\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36m__check_unmeaningful_bodies_expectation\u001b[0m:\u001b[36m222\u001b[0m - \u001b[1mUnmeaningful bodies checks set\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cleaning completed!\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Calculating Metrics: 100%|██████████| 78/78 [00:00<00:00, 446.44it/s]\n", "\u001b[32m2025-11-19 18:24:29.148\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msyntetic_issue_report_data_generation.dataset\u001b[0m:\u001b[36mcheck_dataset\u001b[0m:\u001b[36m288\u001b[0m - \u001b[1mDataset checking completed!\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " Data documentation compiled by Great Expectations\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", " \n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "\n", "\n", "
\n", "
\n", "
\n", "
\n", "
How to Edit This Expectation Suite
\n", " \n", "
\n", "
\n", "

Expectations are best edited interactively in Jupyter notebooks.

\n", "

To automatically generate a notebook that does this run:

\n", "
\n", " \n", " \n", " \n", "
\n", " \n", "
\n", "
\n", "

Once you have made your changes and run the entire notebook you can kill the notebook by pressing Ctr-C in your terminal.

\n", "

Because these notebooks are generated from an Expectation Suite, these notebooks are entirely disposable.

\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", " \n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "

Expectation Validation Result

\n", "

Evaluates whether a batch of data matches expectations.

\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "\n", "
\n", "
\n", " Actions\n", "
\n", "
\n", " \n", "
\n", "

\n", " Validation Filter:\n", "

\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", "
\n", " \n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " Table of Contents\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Overview\n", "
\n", " \n", "
\n", " \n", " \n", "
\n", " \n", " \n", " Expectation Suite: Dataset expectation suite
Data asset: None
Status: Succeeded\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Statistics\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Evaluated Expectations
12
\n", "
Successful Expectations
12
\n", "
Unsuccessful Expectations
0
\n", "
Success Percent
100%
\n", "\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "
\n", " \n", "

\n", " \n", " Show more info...\n", " \n", "

\n", " \n", "\n", "
\n", " \n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Info\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
Great Expectations Version
1.9.0
\n", "
Run Name
__none__
\n", "
Run Time
2025-11-19T17:24:28Z
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Markers\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " ge_load_time\n", " \n", "
\n", " \n", " 20251119T172428.405097Z\n", " \n", "
\n", "
\n", " \n", " pandas_data_fingerprint\n", " \n", "
\n", " \n", " d2d9df424ff135d4e9ae3ac57024b468\n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Parameters\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " dataframe\n", " \n", "
\n", " \n", " \n", " \n", "
\n", "\n", " \n", " \n", " \n", "
\n", " \n", " \n", "
\n", " \n", "
\n", " Batch Spec\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " batch_data\n", " \n", "
\n", " \n", " PandasDataFrame\n", " \n", "
\n", "\n", " \n", "
\n", "
\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " Table-Level Expectations\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " Values for given compound columns must be unique together: repo, created_at, label, title, body\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " body\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must always be greater than or equal to 83 characters long.\n", " \n", "
0% unexpected
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " created_at\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " label\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " repo\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", " \n", "
\n", " title\n", "
\n", " \n", "
\n", " \n", "
\n", "\n", "
\n", " \n", "\n", "
\n", "\n", " \n", "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " Status\n", " \n", " \n", " \n", " \n", " Expectation\n", " \n", " \n", " \n", " \n", " Observed Value\n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must be of type str.\n", " \n", "
0% unexpected
\n", "
\n", " \n", " \n", " \n", "
\n", " \n", " values must never be null.\n", " \n", "
100% not null
\n", "\n", "\n", "
\n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", " \n", "\n", " \n", "\n", "[{'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"repo\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnrepo\"}, \"id\": \"8ac50c7f-a7e4-4d53-915c-74ebc10b9a23\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"repo\"}, \"meta\": {\"tag\": \"missing_values_columnrepo\"}, \"id\": \"3da1ca87-f3bb-41d9-8444-ef82b3f7e4f5\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"created_at\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columncreated_at\"}, \"id\": \"e29941fe-f519-4194-93c8-8ada13f2071d\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"created_at\"}, \"meta\": {\"tag\": \"missing_values_columncreated_at\"}, \"id\": \"bde76d7a-060e-4b47-9228-edbd88285e19\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"label\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnlabel\"}, \"id\": \"ab28bebb-1f6b-4b4b-b2fb-9b0ca7163faa\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": 
{\"batch_id\": \"df-df_asset\", \"column\": \"label\"}, \"meta\": {\"tag\": \"missing_values_columnlabel\"}, \"id\": \"9a008c37-afb1-4ad0-8bd0-045df7c1ba26\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columntitle\"}, \"id\": \"95c03a7e-86f7-46e5-a36f-cc637f96e2a8\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"title\"}, \"meta\": {\"tag\": \"missing_values_columntitle\"}, \"id\": \"c23e4e86-38ce-439d-9060-615bfb3cb916\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_be_of_type\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"type_\": \"str\"}, \"meta\": {\"tag\": \"type_integrity_columnbody\"}, \"id\": \"b17cdcea-46b5-428f-a648-705c1648243e\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_values_to_not_be_null\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\"}, \"meta\": {\"tag\": \"missing_values_columnbody\"}, \"id\": \"6fa29b43-e442-4dc4-a53d-dafd9df12ac0\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_column_value_lengths_to_be_between\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column\": \"body\", \"min_value\": 83}, \"meta\": {\"tag\": \"unmeaningful_bodies\"}, \"id\": 
\"7b57916b-b27f-4a9f-9fcd-7c8d396863e8\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True},\n", " {'config': {\"type\": \"expect_compound_columns_to_be_unique\", \"kwargs\": {\"batch_id\": \"df-df_asset\", \"column_list\": [\"repo\", \"created_at\", \"label\", \"title\", \"body\"]}, \"meta\": {\"tag\": \"duplicates\"}, \"id\": \"88713cd6-94e7-4a07-888e-938130cafdc7\", \"severity\": \"critical\"},\n", " 'num_of_failed_rows': 0,\n", " 'percent_of_failed_rows': 0.0,\n", " 'success': True}]\n" ] } ], "source": [ "datasets = [\"nasa_cfs_train\", \"nasa_fprime_train\", \"nasa_train\", \"nlbse23_train\", \"nlbse24_train\", \n", " \"nasa_cfs_test\", \"nasa_fprime_test\", \"nasa_test\", \"nlbse23_test\", \"nlbse24_test\"]\n", "\n", "for dataset in datasets:\n", " print(f\"\\n\\nProcessing dataset: {dataset}\")\n", " # load the dataset and print basic stats\n", " dp = DataPreprocessing(dataset)\n", " dp.load_dataset()\n", " dp.basic_stats()\n", "\n", " # check the dataset\n", " checks = [\"column_types\", \"missing_values\", \"duplicates\", \"unmeaningful_bodies\"]\n", " results = dp.check_dataset(checks, save_report=True, report_path=\"Raw data\")\n", "\n", " pprint.pprint(results)\n", " \n", " # clean the dataset\n", " dp.automated_cleaning()\n", "\n", " print(\"\\nCleaning completed!\\n\")\n", "\n", " # check the dataset again to verify if the cleaning process worked\n", " results = dp.check_dataset(checks,save_report=True,report_path=\"Interim data\")\n", " pprint.pprint(results)\n", " \n", " # save the dataset\n", " dp.save_dataset()\n", "\n", " # clean dataset allocated memory\n", " del dp" ] }, { "cell_type": "code", "execution_count": null, "id": "1c9f916a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "syntetic-issue-report-data-generation", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", 
"version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.19" } }, "nbformat": 4, "nbformat_minor": 5 }