diff --git a/docs/0.12.0/doctrees/API.doctree b/docs/0.12.0/doctrees/API.doctree new file mode 100644 index 000000000..8812ae277 Binary files /dev/null and b/docs/0.12.0/doctrees/API.doctree differ diff --git a/docs/0.12.0/doctrees/add_new_model_to_data_labeler.doctree b/docs/0.12.0/doctrees/add_new_model_to_data_labeler.doctree new file mode 100644 index 000000000..faca090af Binary files /dev/null and b/docs/0.12.0/doctrees/add_new_model_to_data_labeler.doctree differ diff --git a/docs/0.12.0/doctrees/column_name_labeler_example.doctree b/docs/0.12.0/doctrees/column_name_labeler_example.doctree new file mode 100644 index 000000000..259c7dc3b Binary files /dev/null and b/docs/0.12.0/doctrees/column_name_labeler_example.doctree differ diff --git a/docs/0.12.0/doctrees/data_labeling.doctree b/docs/0.12.0/doctrees/data_labeling.doctree new file mode 100644 index 000000000..e8e46d8a2 Binary files /dev/null and b/docs/0.12.0/doctrees/data_labeling.doctree differ diff --git a/docs/0.12.0/doctrees/data_reader.doctree b/docs/0.12.0/doctrees/data_reader.doctree new file mode 100644 index 000000000..41110ad5e Binary files /dev/null and b/docs/0.12.0/doctrees/data_reader.doctree differ diff --git a/docs/0.12.0/doctrees/data_readers.doctree b/docs/0.12.0/doctrees/data_readers.doctree new file mode 100644 index 000000000..95a16fcae Binary files /dev/null and b/docs/0.12.0/doctrees/data_readers.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.avro_data.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.avro_data.doctree new file mode 100644 index 000000000..4ab4ddfed Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.avro_data.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.base_data.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.base_data.doctree new file mode 100644 index 000000000..cb9abd2b2 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.base_data.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.csv_data.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.csv_data.doctree new file mode 100644 index 000000000..43d8280e3 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.csv_data.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.data.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.data.doctree new file mode 100644 index 000000000..7de9a7a42 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.data.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.data_utils.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.data_utils.doctree new file mode 100644 index 000000000..1f78301b1 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.data_utils.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.doctree new file mode 100644 index 000000000..fc296b1d7 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.filepath_or_buffer.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.filepath_or_buffer.doctree new file mode 100644 index 000000000..570203bbf Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.filepath_or_buffer.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.graph_data.doctree 
b/docs/0.12.0/doctrees/dataprofiler.data_readers.graph_data.doctree new file mode 100644 index 000000000..b250638b2 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.graph_data.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.json_data.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.json_data.doctree new file mode 100644 index 000000000..5fb3d59bb Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.json_data.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.parquet_data.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.parquet_data.doctree new file mode 100644 index 000000000..c1deb80a5 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.parquet_data.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.structured_mixins.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.structured_mixins.doctree new file mode 100644 index 000000000..8f6a0088a Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.structured_mixins.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.data_readers.text_data.doctree b/docs/0.12.0/doctrees/dataprofiler.data_readers.text_data.doctree new file mode 100644 index 000000000..242f8ccfa Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.data_readers.text_data.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.doctree b/docs/0.12.0/doctrees/dataprofiler.doctree new file mode 100644 index 000000000..353d7e05c Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.dp_logging.doctree b/docs/0.12.0/doctrees/dataprofiler.dp_logging.doctree new file mode 100644 index 000000000..b279b0653 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.dp_logging.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.base_data_labeler.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.base_data_labeler.doctree new file mode 100644 index 000000000..f7f049209 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.base_data_labeler.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.base_model.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.base_model.doctree new file mode 100644 index 000000000..da093755e Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.base_model.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.char_load_tf_model.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.char_load_tf_model.doctree new file mode 100644 index 000000000..a4c8c22e2 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.char_load_tf_model.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.character_level_cnn_model.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.character_level_cnn_model.doctree new file mode 100644 index 000000000..54936d0bf Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.character_level_cnn_model.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.classification_report_utils.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.classification_report_utils.doctree new file mode 100644 index 000000000..e4db8f0aa Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.classification_report_utils.doctree differ diff --git 
a/docs/0.12.0/doctrees/dataprofiler.labelers.column_name_model.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.column_name_model.doctree new file mode 100644 index 000000000..64701f676 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.column_name_model.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.data_labelers.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.data_labelers.doctree new file mode 100644 index 000000000..31cb53d1d Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.data_labelers.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.data_processing.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.data_processing.doctree new file mode 100644 index 000000000..3e54b0181 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.data_processing.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.doctree new file mode 100644 index 000000000..c4dec47e7 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.labeler_utils.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.labeler_utils.doctree new file mode 100644 index 000000000..511ff2ece Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.labeler_utils.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.regex_model.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.regex_model.doctree new file mode 100644 index 000000000..6dd0d8da8 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.regex_model.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.labelers.utils.doctree b/docs/0.12.0/doctrees/dataprofiler.labelers.utils.doctree new file mode 100644 index 000000000..9d53ed20c Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.labelers.utils.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.plugins.decorators.doctree b/docs/0.12.0/doctrees/dataprofiler.plugins.decorators.doctree new file mode 100644 index 000000000..1fcccd19b Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.plugins.decorators.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.plugins.doctree b/docs/0.12.0/doctrees/dataprofiler.plugins.doctree new file mode 100644 index 000000000..2619ed8e5 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.plugins.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.base_column_profilers.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.base_column_profilers.doctree new file mode 100644 index 000000000..48202f332 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.base_column_profilers.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.categorical_column_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.categorical_column_profile.doctree new file mode 100644 index 000000000..71ac5a7b9 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.categorical_column_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.column_profile_compilers.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.column_profile_compilers.doctree new file mode 100644 index 000000000..69bb3fc61 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.column_profile_compilers.doctree differ diff --git 
a/docs/0.12.0/doctrees/dataprofiler.profilers.data_labeler_column_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.data_labeler_column_profile.doctree new file mode 100644 index 000000000..89bc39bff Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.data_labeler_column_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.datetime_column_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.datetime_column_profile.doctree new file mode 100644 index 000000000..d41aa2287 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.datetime_column_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.doctree new file mode 100644 index 000000000..74a7ec9e1 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.float_column_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.float_column_profile.doctree new file mode 100644 index 000000000..cb278dff2 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.float_column_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.graph_profiler.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.graph_profiler.doctree new file mode 100644 index 000000000..2f1d2b7c4 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.graph_profiler.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.helpers.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.helpers.doctree new file mode 100644 index 000000000..5db43b768 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.helpers.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.helpers.report_helpers.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.helpers.report_helpers.doctree new file mode 100644 index 000000000..df9c9f39b Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.helpers.report_helpers.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.histogram_utils.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.histogram_utils.doctree new file mode 100644 index 000000000..22c93c64d Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.histogram_utils.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.int_column_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.int_column_profile.doctree new file mode 100644 index 000000000..e1182d890 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.int_column_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.json_decoder.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.json_decoder.doctree new file mode 100644 index 000000000..cd1ece8be Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.json_decoder.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.json_encoder.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.json_encoder.doctree new file mode 100644 index 000000000..04b63e82a Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.json_encoder.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.numerical_column_stats.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.numerical_column_stats.doctree new file mode 100644 index 000000000..c5cd464d2 Binary 
files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.numerical_column_stats.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.order_column_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.order_column_profile.doctree new file mode 100644 index 000000000..6d3962381 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.order_column_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.profile_builder.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.profile_builder.doctree new file mode 100644 index 000000000..b82950e95 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.profile_builder.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.profiler_options.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.profiler_options.doctree new file mode 100644 index 000000000..f6c62dc52 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.profiler_options.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.profiler_utils.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.profiler_utils.doctree new file mode 100644 index 000000000..89bcec8c3 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.profiler_utils.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.text_column_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.text_column_profile.doctree new file mode 100644 index 000000000..a77fabce5 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.text_column_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.unstructured_labeler_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.unstructured_labeler_profile.doctree new file mode 100644 index 000000000..290e8d705 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.unstructured_labeler_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.unstructured_text_profile.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.unstructured_text_profile.doctree new file mode 100644 index 000000000..1af5b48e6 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.unstructured_text_profile.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.profilers.utils.doctree b/docs/0.12.0/doctrees/dataprofiler.profilers.utils.doctree new file mode 100644 index 000000000..b3b1462ed Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.profilers.utils.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.reports.doctree b/docs/0.12.0/doctrees/dataprofiler.reports.doctree new file mode 100644 index 000000000..2b37a3eac Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.reports.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.reports.graphs.doctree b/docs/0.12.0/doctrees/dataprofiler.reports.graphs.doctree new file mode 100644 index 000000000..3893cb78f Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.reports.graphs.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.reports.utils.doctree b/docs/0.12.0/doctrees/dataprofiler.reports.utils.doctree new file mode 100644 index 000000000..236a7914c Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.reports.utils.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.rng_utils.doctree b/docs/0.12.0/doctrees/dataprofiler.rng_utils.doctree new file mode 100644 index 000000000..fd9dc36c4 Binary 
files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.rng_utils.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.settings.doctree b/docs/0.12.0/doctrees/dataprofiler.settings.doctree new file mode 100644 index 000000000..4f4192118 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.settings.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.validators.base_validators.doctree b/docs/0.12.0/doctrees/dataprofiler.validators.base_validators.doctree new file mode 100644 index 000000000..435a419f4 Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.validators.base_validators.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.validators.doctree b/docs/0.12.0/doctrees/dataprofiler.validators.doctree new file mode 100644 index 000000000..68dd10ecb Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.validators.doctree differ diff --git a/docs/0.12.0/doctrees/dataprofiler.version.doctree b/docs/0.12.0/doctrees/dataprofiler.version.doctree new file mode 100644 index 000000000..0469faa4a Binary files /dev/null and b/docs/0.12.0/doctrees/dataprofiler.version.doctree differ diff --git a/docs/0.12.0/doctrees/environment.pickle b/docs/0.12.0/doctrees/environment.pickle new file mode 100644 index 000000000..8b516adc9 Binary files /dev/null and b/docs/0.12.0/doctrees/environment.pickle differ diff --git a/docs/0.12.0/doctrees/examples.doctree b/docs/0.12.0/doctrees/examples.doctree new file mode 100644 index 000000000..5fd4198ee Binary files /dev/null and b/docs/0.12.0/doctrees/examples.doctree differ diff --git a/docs/0.12.0/doctrees/graph_data_demo.doctree b/docs/0.12.0/doctrees/graph_data_demo.doctree new file mode 100644 index 000000000..0cc7f1075 Binary files /dev/null and b/docs/0.12.0/doctrees/graph_data_demo.doctree differ diff --git a/docs/0.12.0/doctrees/graphs.doctree b/docs/0.12.0/doctrees/graphs.doctree new file mode 100644 index 000000000..b48445276 Binary files /dev/null and b/docs/0.12.0/doctrees/graphs.doctree differ diff --git a/docs/0.12.0/doctrees/index.doctree b/docs/0.12.0/doctrees/index.doctree new file mode 100644 index 000000000..eba04db66 Binary files /dev/null and b/docs/0.12.0/doctrees/index.doctree differ diff --git a/docs/0.12.0/doctrees/install.doctree b/docs/0.12.0/doctrees/install.doctree new file mode 100644 index 000000000..0c222db50 Binary files /dev/null and b/docs/0.12.0/doctrees/install.doctree differ diff --git a/docs/0.12.0/doctrees/labeler.doctree b/docs/0.12.0/doctrees/labeler.doctree new file mode 100644 index 000000000..eaf1a8d26 Binary files /dev/null and b/docs/0.12.0/doctrees/labeler.doctree differ diff --git a/docs/0.12.0/doctrees/merge_profile_list.doctree b/docs/0.12.0/doctrees/merge_profile_list.doctree new file mode 100644 index 000000000..91bd3060a Binary files /dev/null and b/docs/0.12.0/doctrees/merge_profile_list.doctree differ diff --git a/docs/0.12.0/doctrees/modules.doctree b/docs/0.12.0/doctrees/modules.doctree new file mode 100644 index 000000000..c37fa820e Binary files /dev/null and b/docs/0.12.0/doctrees/modules.doctree differ diff --git a/docs/0.12.0/doctrees/nbsphinx/add_new_model_to_data_labeler.ipynb b/docs/0.12.0/doctrees/nbsphinx/add_new_model_to_data_labeler.ipynb new file mode 100644 index 000000000..1495e6a85 --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/add_new_model_to_data_labeler.ipynb @@ -0,0 +1,488 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "228bb2a6", + "metadata": {}, + "source": [ + "# Adding new model to the existing 
DataLabeler pipeline" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cab7a569", + "metadata": {}, + "source": [ + "Consider the case when we would like to explore different character-level neural network models and evaluate their performance on different datasets. The existing DataLabeler in the DataProfiler library already contains a preprocessor, a postprocessor, and a character-level CNN (Convolutional Neural Network) model that are combined to work on such data. All we need is to build additional model classes that inherit the main functionalities from the CNN model and also adapt the model construction to the desired architectures. In this example, we define such a new model to be used with the Data Labeler component of the Data Profiler. In particular, a character-level LSTM (Long Short-Term Memory) model is implemented, then integrated into the DataLabeler pipeline to be trained with a tabular dataset. The process includes the following steps:\n", + "\n", + " - Build a new character-level LSTM model that inherits the CNN model\n", + " - Load the DataLabeler from the DataProfiler\n", + " - Swap the existing CNN model with the new LSTM model\n", + " - Train the data labeler pipeline on a given dataset\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16624c48", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e90728ab", + "metadata": {}, + "source": [ + "## Dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3d61981c", + "metadata": {}, + "source": [ + "In this example, we use a structured dataset, the aws honeypot dataset, given in the test folder of the library. This dataset is first read by the Data Reader class of the Data Profiler, then split into training and test data to be used in the next sections." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f031fe06", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data\n", + "data = dp.Data(\"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\")\n", + "df_data = data.data\n", + "\n", + "# split data to training and test set\n", + "split_ratio = 0.2\n", + "df_data = df_data.sample(frac=1).reset_index(drop=True)\n", + "data_train = df_data[:int((1 - split_ratio) * len(df_data))]\n", + "data_test = df_data[int((1 - split_ratio) * len(df_data)):]\n", + "\n", + "df_data.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "745ed0d4", + "metadata": {}, + "source": [ + "## Implement a new character-level LSTM model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7375b0c0", + "metadata": {}, + "source": [ + "This new model is inherited from `CharacterLevelCnnModel` class, with some modifications on the following functions\n", + "\n", + "`__init__`: to add new parameters for the LSTM model. 
The new parameters, `size_lstm`, `rec_dropout`, `activation`, `recurrent_activation`, specify number of LSTM layers, activation function, and recurrent dropout ratio.\n", + "\n", + "`_validate_parameters`: to add additional checks on the new parameters for the LSTM model\n", + "\n", + "`_construct_model`: to construct the new LSTM model with the desired architecture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8568fb49", + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "from dataprofiler.labelers.character_level_cnn_model import (\n", + " CharacterLevelCnnModel,\n", + " create_glove_char,\n", + " build_embd_dictionary,\n", + ")\n", + "from dataprofiler.labelers.base_model import BaseModel\n", + "from dataprofiler.labelers.labeler_utils import F1Score\n", + "\n", + "\n", + "# CharacterLevelLstmModel derives from CharacterLevelCnnModel\n", + "#########################################################\n", + "#########################################################\n", + "class CharacterLevelLstmModel(CharacterLevelCnnModel):\n", + " # boolean if the label mapping requires the mapping for index 0 reserved\n", + " requires_zero_mapping = True\n", + "\n", + " def __init__(self, label_mapping=None, parameters=None):\n", + " \"\"\"\n", + " LSTM Model Initializer\n", + " \"\"\"\n", + "\n", + " # parameter initialization\n", + " if not parameters:\n", + " parameters = {}\n", + " parameters.setdefault(\"max_length\", 3400)\n", + " parameters.setdefault(\"max_char_encoding_id\", 127)\n", + " parameters.setdefault(\"dim_embed\", 64)\n", + " parameters.setdefault(\"size_fc\", [32, 32])\n", + " parameters.setdefault(\"dropout\", 0.1)\n", + " # new parameters for LSTM model\n", + " #########################################################\n", + " #########################################################\n", + " parameters.setdefault(\"size_lstm\", [64])\n", + " parameters.setdefault(\"rec_dropout\", 0.1)\n", + " parameters.setdefault(\"activation\", \"tanh\")\n", + " parameters.setdefault(\"recurrent_activation\", \"sigmoid\")\n", + " #########################################################\n", + " #########################################################\n", + " parameters.setdefault(\"default_label\", \"UNKNOWN\")\n", + " parameters[\"pad_label\"] = \"PAD\"\n", + " self._epoch_id = 0\n", + "\n", + " # reconstruct flags for model\n", + " self._model_num_labels = 0\n", + " self._model_default_ind = -1\n", + "\n", + " BaseModel.__init__(self, label_mapping, parameters)\n", + "\n", + " def _validate_parameters(self, parameters):\n", + " \"\"\"\n", + " Validate the parameters sent in. 
Raise error if invalid parameters are\n", + " present.\n", + " \"\"\"\n", + " errors = []\n", + " list_of_necessary_params = [\n", + " \"max_length\",\n", + " \"max_char_encoding_id\",\n", + " \"dim_embed\",\n", + " \"size_fc\",\n", + " \"dropout\",\n", + " \"size_lstm\",\n", + " \"rec_dropout\",\n", + " \"activation\",\n", + " \"recurrent_activation\",\n", + " \"default_label\",\n", + " \"pad_label\",\n", + " ]\n", + " # Make sure the necessary parameters are present and valid.\n", + " for param in parameters:\n", + " if param in [\n", + " \"max_length\",\n", + " \"max_char_encoding_id\",\n", + " \"dim_embed\",\n", + " \"size_conv\",\n", + " ]:\n", + " if (\n", + " not isinstance(parameters[param], (int, float))\n", + " or parameters[param] < 0\n", + " ):\n", + " errors.append(\n", + " param + \" must be a valid integer or float \" \"greater than 0.\"\n", + " )\n", + " elif param in [\n", + " \"dropout\",\n", + " \"rec_dropout\",\n", + " ]: # additional check for rec_dropout\n", + " if (\n", + " not isinstance(parameters[param], (int, float))\n", + " or parameters[param] < 0\n", + " or parameters[param] > 1\n", + " ):\n", + " errors.append(\n", + " param + \" must be a valid integer or float \" \"from 0 to 1.\"\n", + " )\n", + " elif (\n", + " param == \"size_fc\" or param == \"size_lstm\"\n", + " ): # additional check for size_lstm\n", + " if (\n", + " not isinstance(parameters[param], list)\n", + " or len(parameters[param]) == 0\n", + " ):\n", + " errors.append(param + \" must be a non-empty list of \" \"integers.\")\n", + " else:\n", + " for item in parameters[param]:\n", + " if not isinstance(item, int):\n", + " errors.append(\n", + " param + \" must be a non-empty \" \"list of integers.\"\n", + " )\n", + " break\n", + " elif param in [\n", + " \"default_label\",\n", + " \"activation\",\n", + " \"recurrent_activation\",\n", + " ]: # additional check for activation and recurrent_activation\n", + " if not isinstance(parameters[param], str):\n", + " error = str(param) + \" must be a string.\"\n", + " errors.append(error)\n", + "\n", + " # Error if there are extra parameters thrown in\n", + " for param in parameters:\n", + " if param not in list_of_necessary_params:\n", + " errors.append(param + \" is not an accepted parameter.\")\n", + " if errors:\n", + " raise ValueError(\"\\n\".join(errors))\n", + "\n", + " def _construct_model(self):\n", + " \"\"\"\n", + " Model constructor for the data labeler. 
This also serves as a weight\n", + " reset.\n", + "\n", + " :return: None\n", + " \"\"\"\n", + " num_labels = self.num_labels\n", + " default_ind = self.label_mapping[self._parameters[\"default_label\"]]\n", + "\n", + " # Reset model\n", + " tf.keras.backend.clear_session()\n", + "\n", + " # generate glove embedding\n", + " create_glove_char(self._parameters[\"dim_embed\"])\n", + "\n", + " # generate model\n", + " self._model = tf.keras.models.Sequential()\n", + "\n", + " # default parameters\n", + " max_length = self._parameters[\"max_length\"]\n", + " max_char_encoding_id = self._parameters[\"max_char_encoding_id\"]\n", + "\n", + " # Encoding layer\n", + " def encoding_function(input_str):\n", + " char_in_vector = CharacterLevelLstmModel._char_encoding_layer(\n", + " input_str, max_char_encoding_id, max_length\n", + " )\n", + " return char_in_vector\n", + "\n", + " self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string))\n", + "\n", + " self._model.add(\n", + " tf.keras.layers.Lambda(encoding_function, output_shape=tuple([max_length]))\n", + " )\n", + "\n", + " # Create a pre-trained weight matrix\n", + " # character encoding indices range from 0 to max_char_encoding_id,\n", + " # we add one extra index for out-of-vocabulary character\n", + " embed_file = os.path.join(\n", + " \"../dataprofiler/labelers\",\n", + " \"embeddings/glove-reduced-{}D.txt\".format(self._parameters[\"dim_embed\"]),\n", + " )\n", + " embedding_matrix = np.zeros(\n", + " (max_char_encoding_id + 2, self._parameters[\"dim_embed\"])\n", + " )\n", + " embedding_dict = build_embd_dictionary(embed_file)\n", + "\n", + " input_shape = tuple([max_length])\n", + " # Fill in the weight matrix: let pad and space be 0s\n", + " for ascii_num in range(max_char_encoding_id):\n", + " if chr(ascii_num) in embedding_dict:\n", + " embedding_matrix[ascii_num + 1] = embedding_dict[chr(ascii_num)]\n", + "\n", + " self._model.add(\n", + " tf.keras.layers.Embedding(\n", + " max_char_encoding_id + 2,\n", + " self._parameters[\"dim_embed\"],\n", + " weights=[embedding_matrix],\n", + " input_length=input_shape[0],\n", + " trainable=True,\n", + " )\n", + " )\n", + "\n", + " # Add the lstm layers\n", + " #########################################################\n", + " #########################################################\n", + " for size in self._parameters[\"size_lstm\"]:\n", + " self._model.add(\n", + " tf.keras.layers.LSTM(\n", + " units=size,\n", + " recurrent_dropout=self._parameters[\"rec_dropout\"],\n", + " activation=self._parameters[\"activation\"],\n", + " recurrent_activation=self._parameters[\"recurrent_activation\"],\n", + " return_sequences=True,\n", + " )\n", + " )\n", + " if self._parameters[\"dropout\"]:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n", + " #########################################################\n", + " #########################################################\n", + "\n", + " # Add the fully connected layers\n", + " for size in self._parameters[\"size_fc\"]:\n", + " self._model.add(tf.keras.layers.Dense(units=size, activation=\"relu\"))\n", + " if self._parameters[\"dropout\"]:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n", + "\n", + " # Add the final Softmax layer\n", + " self._model.add(tf.keras.layers.Dense(num_labels, activation=\"softmax\"))\n", + "\n", + " # Output the model into a .pb file for TensorFlow\n", + " argmax_layer = tf.keras.backend.argmax(self._model.output)\n", + "\n", + " # Create confidence 
layers\n", + " final_predicted_layer = CharacterLevelLstmModel._argmax_threshold_layer(\n", + " num_labels, threshold=0.0, default_ind=default_ind\n", + " )\n", + "\n", + " argmax_outputs = self._model.outputs + [\n", + " argmax_layer,\n", + " final_predicted_layer(argmax_layer, self._model.output),\n", + " ]\n", + " self._model = tf.keras.Model(self._model.inputs, argmax_outputs)\n", + "\n", + " # Compile the model\n", + " softmax_output_layer_name = self._model.outputs[0].name.split(\"/\")[0]\n", + " losses = {softmax_output_layer_name: \"categorical_crossentropy\"}\n", + "\n", + " # use f1 score metric\n", + " f1_score_training = F1Score(num_classes=num_labels, average=\"micro\")\n", + " metrics = {softmax_output_layer_name: [\"acc\", f1_score_training]}\n", + "\n", + " self._model.compile(loss=losses, optimizer=\"adam\", metrics=metrics)\n", + "\n", + " self._epoch_id = 0\n", + " self._model_num_labels = num_labels\n", + " self._model_default_ind = default_ind" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d66bd25c", + "metadata": {}, + "source": [ + "## Integrate the new LSTM model to the DataLabeler" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "479f407a", + "metadata": {}, + "source": [ + "Once the LSTM model is built, it replaces the existing model in the DataLabeler pipeline, which is then trained on the given dataset. Note that, as the DataLabeler is trained on the above tabular dataset, its label mapping is updated by the list of column names in that dataset while training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb482ffe", + "metadata": {}, + "outputs": [], + "source": [ + "# get labels from the given dataset\n", + "value_label_df = data_train.reset_index(drop=True).melt()\n", + "value_label_df.columns = [1, 0] # labels=1, values=0 in that order\n", + "value_label_df = value_label_df.astype(str)\n", + "labels = value_label_df[1].unique().tolist()\n", + "\n", + "# create a new LSTM model\n", + "# set default label (one of the column names) to the model\n", + "model = CharacterLevelLstmModel(label_mapping=labels, parameters={'default_label': 'comment'})\n", + "\n", + "# add the new LSTM model to the data labeler\n", + "data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True)\n", + "data_labeler.set_model(model)\n", + "\n", + "# set default label (one of the column names) to the preprocessor and postprocessor\n", + "processor_params = {'default_label': 'comment'}\n", + "data_labeler._preprocessor.set_params(**processor_params)\n", + "data_labeler._postprocessor.set_params(**processor_params)\n", + "\n", + "# train the data labeler\n", + "save_dirpath=\"data_labeler_saved\"\n", + "if not os.path.exists(save_dirpath):\n", + " os.makedirs(save_dirpath)\n", + "\n", + "epochs=2\n", + "data_labeler.fit(\n", + " x=value_label_df[0], y=value_label_df[1], labels=labels, epochs=epochs)\n", + "if save_dirpath:\n", + " data_labeler.save_to_disk(save_dirpath)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "14b78c69", + "metadata": {}, + "source": [ + "The trained Data Labeler is then used by the Data Profiler to provide the prediction on the new dataset. In this example, all options except data labeler are disabled for the sake of presenting data labeler functionality. The results are given in the columnar format where true column types are given in the first column, and the predicted column labels are given in the second column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdfcf1d2", + "metadata": {}, + "outputs": [], + "source": [ + "# predict with the data labeler object\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"structured_options.text.is_enabled\": False, \n", + " \"int.is_enabled\": False, \n", + " \"float.is_enabled\": False, \n", + " \"order.is_enabled\": False, \n", + " \"category.is_enabled\": False, \n", + " \"datetime.is_enabled\": False,})\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': data_labeler})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "def get_structured_results(results):\n", + " columns = []\n", + " predictions = []\n", + " for col_report in results['data_stats']:\n", + " columns.append(col_report['column_name'])\n", + " predictions.append(col_report['data_label'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions})\n", + " return df_results\n", + "\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cc60ff8a", + "metadata": {}, + "source": [ + "In summary, users can define their own model, plug it in the DataLabeler pipeline, and train the labeler with the new dataset. Above, we show one example of adding the LSTM model to the pipeline. Interested users can implement other neural network models as desired with the same process." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/column_name_labeler_example.ipynb b/docs/0.12.0/doctrees/nbsphinx/column_name_labeler_example.ipynb new file mode 100644 index 000000000..6d3369698 --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/column_name_labeler_example.ipynb @@ -0,0 +1,364 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e04c382a-7c49-452b-b9bf-e448951c64fe", + "metadata": {}, + "source": [ + "# ColumnName Labeler Tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "6fb3ecb9-bc51-4c18-93d5-7991bbee5165", + "metadata": {}, + "source": [ + "This notebook teaches how to use the existing `ColumnNameModel`:\n", + "\n", + "1. Loading and utilizing the pre-existing `ColumnNameModel`\n", + "2. Run the labeler\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67c197b-d3ee-4896-a96f-cc3d043601d3", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "from pprint import pprint\n", + "\n", + "import pandas as pd\n", + "\n", + "try:\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " sys.path.insert(0, '../..')\n", + " import dataprofiler as dp" + ] + }, + { + "cell_type": "markdown", + "id": "35841215", + "metadata": {}, + "source": [ + "## Loading and predicting using a pre-existing model using `load_from_library`\n", + "\n", + "The easiest option for users is to `load_from_library` by specifying the name for the labeler in the `resources/` folder. 
This allows you to quickly import and start predicting with any model in the Data Profiler's library of available models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46e36dd6", + "metadata": {}, + "outputs": [], + "source": [ + "labeler_from_library = dp.DataLabeler.load_from_library('column_name_labeler')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfa94868", + "metadata": {}, + "outputs": [], + "source": [ + "labeler_from_library.predict(data=[\"ssn\"])" + ] + }, + { + "cell_type": "markdown", + "id": "c71356f4-9020-4862-a1e1-816effbb5443", + "metadata": {}, + "source": [ + "## Loading and using the pre-existing column name labeler using `load_with_components`\n", + "\n", + "For example purposes here, we will import the existing `ColumnName` labeler via the `load_with_components` command from the `dp.DataLabeler`. This shows a bit more of the details of the data labeler's flow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "818c5b88", + "metadata": {}, + "outputs": [], + "source": [ + "parameters = {\n", + " \"true_positive_dict\": [\n", + " {\"attribute\": \"ssn\", \"label\": \"ssn\"},\n", + " {\"attribute\": \"suffix\", \"label\": \"name\"},\n", + " {\"attribute\": \"my_home_address\", \"label\": \"address\"},\n", + " ],\n", + " \"false_positive_dict\": [\n", + " {\n", + " \"attribute\": \"contract_number\",\n", + " \"label\": \"ssn\",\n", + " },\n", + " {\n", + " \"attribute\": \"role\",\n", + " \"label\": \"name\",\n", + " },\n", + " {\n", + " \"attribute\": \"send_address\",\n", + " \"label\": \"address\",\n", + " },\n", + " ],\n", + " \"negative_threshold_config\": 50,\n", + " \"positive_threshold_config\": 85,\n", + " \"include_label\": True,\n", + " }\n", + "\n", + "label_mapping = {\"ssn\": 1, \"name\": 2, \"address\": 3}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9098329e", + "metadata": {}, + "outputs": [], + "source": [ + "# pre processor \n", + "preprocessor = dp.labelers.data_processing.DirectPassPreprocessor()\n", + "\n", + "# model\n", + "from dataprofiler.labelers.column_name_model import ColumnNameModel\n", + "model = ColumnNameModel(\n", + " parameters=parameters,\n", + " label_mapping=label_mapping,\n", + ")\n", + "\n", + "\n", + "# post processor\n", + "postprocessor = dp.labelers.data_processing.ColumnNameModelPostprocessor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "113d6655-4bca-4d8e-9e6f-b972e29d5684", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler = dp.DataLabeler.load_with_components(\n", + " preprocessor=preprocessor,\n", + " model=model,\n", + " postprocessor=postprocessor,\n", + ")\n", + "data_labeler.model.help()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b405887-2b92-44ca-b8d7-29c384f6dd9c", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(data_labeler.label_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11916a48-098c-4056-ac6c-b9542d85fa86", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(data_labeler.model._parameters)" + ] + }, + { + "cell_type": "markdown", + "id": "da0e97ee-8d6d-4631-9b55-78ed904d5f41", + "metadata": {}, + "source": [ + "### Predicting with the ColumnName labeler\n", + "\n", + "In the prediction below, the data will be passed into two stages in the background:\n", + "- 1) `compare_negative`: The idea behind the `compare_negative` is to first filter out any possibility of flagging a false positive in the model
prediction. In this step, the confidence value is checked and if the similarity is too close to being a false positive, that particular string in the `data` is removed and not returned to the `compare_positive`.\n", + "- 2) `compare_positive`: Finally, the `data` is passed to the `compare_positive` step and checked for similarity with the `true_positive_dict` values. Again, during this stage the `positive_threshold_config` is used to filter the results to only those `data` values that are greater than or equal to the `positive_threshold_config` provided by the user." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe519e65-36a7-4f42-8314-5369de8635c7", + "metadata": {}, + "outputs": [], + "source": [ + "# evaluate a prediction using the default parameters\n", + "data_labeler.predict(data=[\"ssn\", \"name\", \"address\"])" + ] + }, + { + "cell_type": "markdown", + "id": "b41d834d-e47b-45a6-8970-d2d2033e2ade", + "metadata": {}, + "source": [ + "## Replacing the parameters in the existing labeler\n", + "\n", + "We can achieve this by:\n", + "1. Setting the label mapping to the new labels\n", + "2. Setting the model parameters which include: `true_positive_dict`, `false_positive_dict`, `negative_threshold_config`, `positive_threshold_config`, and `include_label`\n", + "\n", + "where `true_positive_dict` and `false_positive_dict` are `lists` of `dicts`, `negative_threshold_config` and `positive_threshold_config` are integer values between `0` and `100`, and `include_label` is a `boolean` value that determines if the output should include the prediction labels or only the confidence values." + ] + }, + { + "cell_type": "markdown", + "id": "c6bb010a-406f-4fd8-abd0-3355a5ad0ded", + "metadata": {}, + "source": [ + "Below, we create 4 labels where `other` is the `default_label`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f86584cf-a7af-4bae-bf44-d87caa68833a", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.set_labels({'other': 0, \"funky_one\": 1, \"funky_two\": 2, \"funky_three\": 3})\n", + "data_labeler.model.set_params(\n", + " true_positive_dict= [\n", + " {\"attribute\": \"ssn\", \"label\": \"funky_one\"},\n", + " {\"attribute\": \"suffix\", \"label\": \"funky_two\"},\n", + " {\"attribute\": \"my_home_address\", \"label\": \"funky_three\"},\n", + " ],\n", + " false_positive_dict=[\n", + " {\n", + " \"attribute\": \"contract_number\",\n", + " \"label\": \"ssn\",\n", + " },\n", + " {\n", + " \"attribute\": \"role\",\n", + " \"label\": \"name\",\n", + " },\n", + " {\n", + " \"attribute\": \"not_my_address\",\n", + " \"label\": \"address\",\n", + " },\n", + " ],\n", + " negative_threshold_config=50,\n", + " positive_threshold_config=85,\n", + " include_label=True,\n", + ")\n", + "data_labeler.label_mapping" + ] + }, + { + "cell_type": "markdown", + "id": "1ece1c8c-18a5-46fc-b563-6458e6e71e53", + "metadata": {}, + "source": [ + "### Predicting with the new labels\n", + "\n", + "Here we are testing the `predict()` method with brand new labels for label_mapping. As we can see, the new labels flow through to the output of the data labeler."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92842e14-2ea6-4879-b58c-c52b607dc94c", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.predict(data=[\"ssn\", \"suffix\"], predict_options=dict(show_confidences=True))" + ] + }, + { + "cell_type": "markdown", + "id": "261b903f-8f4c-403f-839b-ab8813f850e9", + "metadata": {}, + "source": [ + "## Saving the Data Labeler for future use" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6ffbaf2-9400-486a-ba83-5fc9ba9334d7", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir('new_column_name_labeler'):\n", + " os.mkdir('new_column_name_labeler')\n", + "data_labeler.save_to_disk('new_column_name_labeler')" + ] + }, + { + "cell_type": "markdown", + "id": "09e40cb6-9d89-41c4-ae28-3dca498f8c68", + "metadata": {}, + "source": [ + "## Loading the saved Data Labeler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52615b25-70a6-4ebb-8a32-14aaf1e747d9", + "metadata": {}, + "outputs": [], + "source": [ + "saved_labeler = dp.DataLabeler.load_from_disk('new_column_name_labeler')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1ccc0b3-1dc2-4847-95c2-d6b8769b1590", + "metadata": {}, + "outputs": [], + "source": [ + "# ensuring the parametesr are what we saved.\n", + "print(\"label_mapping:\")\n", + "pprint(saved_labeler.label_mapping)\n", + "print(\"\\nmodel parameters:\")\n", + "pprint(saved_labeler.model._parameters)\n", + "print()\n", + "print(\"postprocessor: \" + saved_labeler.postprocessor.__class__.__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c827f2ae-4af6-4f3f-9651-9ee9ebea9fa0", + "metadata": {}, + "outputs": [], + "source": [ + "# predicting with the loaded labeler.\n", + "saved_labeler.predict([\"ssn\", \"name\", \"address\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/data_reader.ipynb b/docs/0.12.0/doctrees/nbsphinx/data_reader.ipynb new file mode 100644 index 000000000..d2ce887e6 --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/data_reader.ipynb @@ -0,0 +1,689 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d4d79832-59ab-410a-ad6d-fbba01a3f0d3", + "metadata": {}, + "source": [ + "# Intro to Data Readers\n", + "Within the Data Profiler, there are 5 data reader classes:\n", + "\n", + " * CSVData (delimited data: CSV, TSV, etc.)\n", + " * JSONData\n", + " * ParquetData\n", + " * AVROData\n", + " * GraphData\n", + " * TextData\n", + " \n", + "Each of these classes can be used to read data individually, however the Data Profiler provides the unique capability of auto detecting what data you have and reading it automatically by using the `Data` class.\n", + "```python\n", + "import dataprofiler as dp\n", + "data = dp.Data('/path/to/mydata.abc') # auto detects and reads your data\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "f2315666-20be-4937-9f9a-26d42dc135e2", + "metadata": { + "tags": [] + }, + "source": [ + "## Automatically reading and detecting data\n", + "\n", + "Below is a demonstration of utilizing the `Data` class which 
automatically detects the type of data for a given file and reads it automatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99e61c6c-43b8-4700-b627-759b5ef8bdda", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8821ad8d-b2c0-489c-ae6a-54c11b7f0a08", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data with different file types\n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "csv_files = [\n", + " \"csv/aws_honeypot_marx_geo.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "json_files = [\n", + " \"json/complex_nested.json\",\n", + " \"json/honeypot_intentially_mislabeled_file.csv\", # json file with the .csv extension\n", + "]\n", + "parquet_files = [\n", + " \"parquet/nation.dict.parquet\",\n", + " \"parquet/nation.plain.intentionally_mislabled_file.csv\", # parquet file with the .csv extension\n", + "]\n", + "avro_files = [\n", + " \"avro/userdata1.avro\",\n", + " \"avro/userdata1_intentionally_mislabled_file.json\", # avro file with the .json extension\n", + "]\n", + "graph_files = [\n", + " \"csv/graph_data_csv_identify.csv\", # csv file with graph column names\n", + "]\n", + "text_files = [\n", + " \"txt/discussion_reddit.txt\",\n", + "]\n", + "all_files = csv_files + json_files + parquet_files + avro_files + graph_files + text_files\n", + "print('filepath' + ' ' * 58 + 'data type')\n", + "print('='*80)\n", + "for file in all_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " data = dp.Data(filepath)\n", + " print(\"{:<65} {:<15}\".format(file, data.data_type))\n", + "print(\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49dfc981-59fd-48a5-ad7b-e01f0a52d0b2", + "metadata": {}, + "outputs": [], + "source": [ + "# importing from a url\n", + "data = dp.Data('https://raw.githubusercontent.com/capitalone/DataProfiler/main/dataprofiler/tests/data/csv/diamonds.csv')\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "77f8ef2d-5aaf-44d6-b6d1-bf14f7eb7aa6", + "metadata": {}, + "source": [ + "## Specifying detection options of `Data` and loading `pandas.DataFrame`\n", + "\n", + "The `Data` class also gives the ability to set options or if the user wants to load their data with specific requirements.\n", + "Options for each data reader are specified in the docs: https://capitalone.github.io/DataProfiler/docs/0.4.4/html/dataprofiler.data_readers.html\n", + "\n", + "```python\n", + "import dataprofiler as dp\n", + "\n", + "options = {...} # allowed options are specified for each data reader.\n", + "data = dp.Data(data, options=options)\n", + "```\n", + "Later in this tutorial, the options for the CSVData class will be discussed.\n", + "\n", + "Additionally, a user can directly load a `pandas.DataFrame` as any data reader they choose." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b925d4e-ca94-4913-9acf-26a883585e85", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "\n", + "df = pd.DataFrame(['my', 'random', 'data'])\n", + "\n", + "# specify via the `Data` class\n", + "data = dp.Data(data=df, data_type='csv')\n", + "print('Data Type: ', data.data_type)\n", + "\n", + "# specifically use the CSVData class\n", + "data = CSVData(data=df)\n", + "print('Data Type: ', data.data_type)" + ] + }, + { + "cell_type": "markdown", + "id": "52c3c3ac-c241-4d91-8ac7-b3d28ffd19c3", + "metadata": {}, + "source": [ + "## Accessing data and attributes\n", + "\n", + "Once loaded, the data can be accessed via the `data` property of the object. Additional information about the data loaded may differ between data readers.\n", + "\n", + "For this example we will focus on `CSVData`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09fa5929-e710-4107-9313-1370ab639c9c", + "metadata": {}, + "outputs": [], + "source": [ + "filepath = \"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\"\n", + "data = dp.Data(filepath)\n", + "print('Data Type: ', data.data_type)\n", + "print('Data Filepath: ', data.input_file_path)\n", + "print('File Encoding: ', data.file_encoding)\n", + "print('Data Length (two techniques): ', len(data), data.length)\n", + "print(\"Data Access:\")\n", + "data.data" + ] + }, + { + "cell_type": "markdown", + "id": "b98be971-4768-479d-9e54-00f05a6fb790", + "metadata": {}, + "source": [ + "## Checking data file types with `is_match`\n", + "\n", + "Each data reader has a class method `is_match` which determines whether or not a dataset is of a given data type.\n", + "```python\n", + "CSVData.is_match\n", + "JSONData.is_match\n", + "ParquetData.is_match\n", + "AVROData.is_match\n", + "GraphData.is_match\n", + "TextData.is_match\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "104a32c1-1d50-4aa5-94ce-b2e72de38476", + "metadata": {}, + "outputs": [], + "source": [ + "# supplemental function\n", + "def add_true_false_color(value):\n", + " \"\"\"Converts True to green and False to red in printed text.\"\"\"\n", + " if value:\n", + " return \"\\x1b[92m \" + str(is_match) + \"\\x1b[0m\"\n", + " return \"\\x1b[31m \" + str(is_match) + \"\\x1b[0m\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06868d90-2726-4096-a6da-3866174e6671", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "\n", + "non_csv_files = [\n", + " 'json/iris-utf-8.json',\n", + " 'json/honeypot_intentially_mislabeled_file.csv',\n", + " 'parquet/titanic.parq',\n", + " 'parquet/nation.plain.intentionally_mislabled_file.csv',\n", + " 'txt/code.txt',\n", + " 'txt/sentence.txt',\n", + " 'avro/users.avro',\n", + " 'avro/snappy_compressed_intentionally_mislabeled_file.csv',\n", + "]\n", + "\n", + "print(\"Is the file a CSV?\")\n", + "print('=' * 80)\n", + "for file in csv_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " is_match = CSVData.is_match(filepath)\n", + " print(add_true_false_color(is_match), ':', file)\n", + " print('=' * 80)\n", + " \n", + "for file in non_csv_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " is_match = CSVData.is_match(filepath)\n", + " print(add_true_false_color(is_match), ':', file)\n", + " print('=' * 80)" + ] + }, + { + 
"cell_type": "markdown", + "id": "38889990-8e19-4114-a4f3-dc2af938e29d", + "metadata": {}, + "source": [ + "## Reloading data after altering options with `reload`\n", + "\n", + "There are two cases for using the reload function, both of which require the data type to have been interpreted correctly:\n", + "\n", + " 1. The options were not correctly determined\n", + " 2. The options were loaded correctly but a change is desired.\n", + " \n", + "In the example below, the `data_format` for reading the data is changed and the data is then reloaded." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01870e8d-45ee-4f33-a088-4453c7ffc7c2", + "metadata": {}, + "outputs": [], + "source": [ + "filepath = \"../dataprofiler/tests/data/csv/diamonds.csv\"\n", + "\n", + "data = dp.Data(filepath)\n", + "print('original data:')\n", + "print('=' * 80)\n", + "print(data.data[:5])\n", + "\n", + "print()\n", + "data.reload(options={'data_format': 'records', 'record_samples_per_line': 1})\n", + "print('reloaded data:')\n", + "print('=' * 80)\n", + "data.data[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "e2285f19-9b34-4484-beaa-79df890b2825", + "metadata": {}, + "source": [ + "## A deeper dive into `CSVData`\n", + "\n", + "This next section will focus on how to use the data reader class: `CSVData`. The `CSVData` class is used for reading delimited data. Delimited data are datasets which have their columns specified by a specific character, commonly the `,`. E.g. from the `diamonds.csv` dataset:\n", + "```\n", + "carat,cut,color,clarity,depth,table,price,x,y,z\n", + "0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43\n", + "0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31\n", + "0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31\n", + "0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63\n", + "0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75\n", + "```\n", + "\n", + "However, the delimiter can be any character. Additionally, a `quotechar`, commonly `\"`, can be specified which allows a delimiter to be contained within a column value.\n", + "E.g. from the `blogposts.csv` dataset:\n", + "```\n", + "Blog Post,Date,Subject,Field\n", + "\"Monty Hall, meet Game Theory\",4/13/2014,Statistics,Mathematics\n", + "Gaussian Quadrature,4/13/2014,Algorithms,Mathematics\n", + "```\n", + "Notice how `\"Monty Hall, meet Game Theory\"` is contained by the quotechar because it contains the delimiter value `,`.\n", + "\n", + "These delimiter dataset parameters (and more) can be automatically determined by the `CSVData` data reader, however they can also be set via the options as demonstrated later in this tutorial." 
+ ] + }, + { + "cell_type": "markdown", + "id": "cccb6bf9-7fb8-46b8-992e-9caacb7ab3a8", + "metadata": {}, + "source": [ + "## Intro to the `CSVData` data reader\n", + "\n", + "Previously, it was shown that `CSVData` may automatically be detected using `Data` or can be manually specified by the user:\n", + "\n", + "```python\n", + "import dataprofiler as dp\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "data = dp.Data(filepath)\n", + "data = CSVData(filepath)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e25f5130-4f19-40c5-9d13-549a04f1aef5", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read delimited data \n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "csv_files = [\n", + " \"csv/diamonds.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "\n", + "for file in csv_files:\n", + " data = CSVData(os.path.join(data_folder, file))\n", + " print(data.data.head())\n", + " print('=' * 80)" + ] + }, + { + "cell_type": "markdown", + "id": "8940de56-1417-4bf6-af87-9d4d00b9a631", + "metadata": {}, + "source": [ + "## CSVData Options\n", + "\n", + "As mentioned previously, `CSVData` has options that can be set to fine-tune its detection or to ensure the data is being read in a specific manner.\n", + "The options for `CSVData` are detailed below:\n", + "\n", + " * delimiter - delimiter used to decipher the csv input file\n", + " * quotechar - quote character used in the delimited file\n", + " * header - location of the header in the file\n", + " * data_format - user-selected format in which to return the data; can only be one of the specified types\n", + " * selected_columns - columns to select from the entire dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d74f2e8-0ec3-4e93-8778-0a5f013e0cdb", + "metadata": {}, + "outputs": [], + "source": [ + "# options are set via a dictionary object in which the parameters are specified.\n", + "# these are the default values for each option\n", + "options = {\n", + " \"delimiter\": \",\",\n", + " \"quotechar\": '\"',\n", + " \"header\": 'auto',\n", + " \"data_format\": \"dataframe\", # type: str, choices: \"dataframe\", \"records\"\n", + " \"selected_columns\": list(),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9af108a1-ffe6-4c3a-82cc-833b1a3b57a1", + "metadata": {}, + "source": [ + "## Options: delimiter and quotechar\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `delimiter` and `quotechar`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "570e20c3-198e-4356-98d3-92eb9655ef4e", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/daily-activity-sheet-@-singlequote.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98385148-861e-4eb1-ba8d-e93120515401", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(filepath) # or use CSVData\n", + "print('Auto detected')\n", + "print('=' * 80)\n", + "print('delimiter: ', data.delimiter)\n", + "print('quotechar: ', data.quotechar)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f5d9306-d90a-4fc6-85a7-a0d535fe2d80", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'delimiter': '@', 'quotechar': \"'\"}\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('manually set')\n", + "print('=' * 80)\n", + "print('delimiter: ', data.delimiter)\n", + "print('quotechar: ', data.quotechar)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7bfa60f-b5b9-48a5-adc5-3937aed145da", + "metadata": {}, + "outputs": [], + "source": [ + "# intentional failure with incorrect options\n", + "options = {'delimiter': ',', 'quotechar': '\"'}\n", + "\n", + "# will be interpreted as TextData because the delimiter and quotechar were incorrect\n", + "data = dp.Data(filepath, options=options)\n", + "print('intentional failure set')\n", + "print('=' * 80)\n", + "try:\n", + " print('delimiter: ', data.delimiter) # attribute error raised here, because TextData, not CSVData\n", + " print('quotechar: ', data.quotechar)\n", + " \n", + " # should not reach this or something went wrong\n", + " raise Exception('Should have failed because this is detected as TextData.')\n", + "except AttributeError:\n", + " print('When data_type is not set or the CSVData is not set, it will fail over to the\\n'\n", + " 'next best reader. In this case it is \"TextData\"\\n')\n", + "data.data" + ] + }, + { + "cell_type": "markdown", + "id": "eeb41c7c-8319-40a3-9d87-88edbb3c5290", + "metadata": {}, + "source": [ + "## Options: header\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `header`.\n", + "\n", + "Notice how in the manually set mechanism, we are intentionally setting the header incorrectly to illustrate what happens."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16a927ef-1ba8-4bf2-ae40-2a9909030609", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/sparse-first-and-last-column-header-and-author-description.txt\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0701d7bf-2de0-4dce-8f09-7f0cddd1132c", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'header': 'auto'} # auto detected (default value)\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('Data Header:', data.header)\n", + "print('=' * 80)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8642a0a-367a-44c6-b611-b89d97b29f85", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'header': 2} # intentionally set incorrectly at value 2\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('Data Header:', data.header)\n", + "print('=' * 80)\n", + "data.data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d6e3f640-c809-4eb6-9571-30065821615e", + "metadata": {}, + "source": [ + "## Options: data_format\n", + "\n", + "For CSVData, the `data_format` option can have the following values:\n", + "\n", + " * dataframe - (default) loads the dataset as a pandas.DataFrame\n", + " * records - loads the data as rows of text values, the extra parameter `record_samples_per_line` how many rows are combined into a single line\n", + " \n", + "`dataframe` is used for conducting **structured profiling** of the dataset while `records` is for **unstructured profiling**.\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `data_format`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "146109ea-a554-4766-bb19-78c116d2a8dd", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/diamonds.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dceac967-d326-4064-ba1c-87a1146c9d72", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'data_format': 'dataframe'} # default\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "data.data[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c25524f-ef23-4e06-9023-842c64c2640e", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'data_format': 'records', 'record_samples_per_line': 1}\n", + "data = dp.Data(filepath, options=options)\n", + "data.data[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "d45f3ed6-ddcd-4bf3-95bc-09f23eb94c97", + "metadata": {}, + "source": [ + "## Options: selected columns\n", + "\n", + "By default, all columns of a dataset will be read and loaded into the data reader. However, `selected_columns` can be set to only load columns which the user requests." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9b45e18-93c6-42e6-b978-af51574307eb", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "018f3f4d-32ac-411a-9918-bae78aff0b0e", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'selected_columns': ['datetime', 'host', 'src', 'proto']}\n", + "data = dp.Data(filepath, options=options)\n", + "data.data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b50679ea", + "metadata": {}, + "source": [ + "## Intro to `GraphData` data reader\n", + "\n", + "This tutorial will focus on how to use the data reader class: `GraphData`. The `GraphData` class is used for reading the delimited data from a CSV file into a `NetworkX` Graph object. This is all in an effort to prepare the data automatically for the `GraphProfiler` class to then profile graph data. \n", + "\n", + "The DataProfiler keys off of common graph naming conventions in the column header row, e.g., from `dataprofiler/tests/csv/graph_data_csv_identify.csv`:\n", + "```\n", + "node_id_dst, node_id_src, continuous_weight, categorical_status\n", + "108,289,7.4448069,9\n", + "81,180,3.65064207,0\n", + "458,83,5.9959787,10\n", + "55,116,4.63359209,79\n", + "454,177,5.76715529,11\n", + "429,225,4.79556889,3\n", + "```\n", + "\n", + "Options for `GraphData` are exactly the same as for `CSVData`.\n", + "\n", + "\n", + "Example implementation of `GraphData`:\n", + "```python\n", + "import dataprofiler as dp\n", + "from dataprofiler.data_readers.graph_data import GraphData\n", + "\n", + "data = dp.Data(graph_file)\n", + "data = GraphData(graph_file)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "838db976", + "metadata": {}, + "outputs": [], + "source": [ + "from dataprofiler.data_readers.graph_data import GraphData\n", + "\n", + "# use data reader to read delimited data \n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "graph_file = \"csv/graph_data_csv_identify.csv\"\n", + "\n", + "data = GraphData(os.path.join(data_folder, graph_file))\n", + "print(data.data.edges)\n", + "print('=' * 80)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/graph_data_demo.ipynb b/docs/0.12.0/doctrees/nbsphinx/graph_data_demo.ipynb new file mode 100644 index 000000000..088612872 --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/graph_data_demo.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Graph Pipeline Demo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataProfiler can also load and profile graph datasets. 
Similarly to the rest of DataProfiler profilers, this is split into two components:\n", + "- GraphData\n", + "- GraphProfiler\n", + "\n", + "We will demo the use of this graph pipeline.\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import pprint\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now input our dataset into the generic DataProfiler pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/graph_data_csv_identify.csv\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "report = profile.report()\n", + "\n", + "pp = pprint.PrettyPrinter(sort_dicts=False, compact=True)\n", + "pp.pprint(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We notice that the `Data` class automatically detected the input file as graph data. The `GraphData` class is able to differentiate between tabular and graph csv data. After `Data` matches the input file as graph data, `GraphData` does the necessary work to load the csv data into a NetworkX Graph. \n", + "\n", + "`Profiler` runs `GraphProfiler` when graph data is input (or when `data_type=\"graph\"` is specified). The `report()` function outputs the profile for the user." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Profile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The profile skeleton looks like this:\n", + "```\n", + "profile = {\n", + " \"num_nodes\": ...,\n", + " \"num_edges\": ...,\n", + " \"categorical_attributes\": ...,\n", + " \"continuous_attributes\": ...,\n", + " \"avg_node_degree\": ...,\n", + " \"global_max_component_size\": ...,\n", + " \"continuous_distribution\": ...,\n", + " \"categorical_distribution\": ...,\n", + " \"times\": ...,\n", + "}\n", + "```\n", + "\n", + "Description of properties in profile:\n", + "- `num_nodes`: number of nodes in the graph\n", + "- `num_edges`: number of edges in the graph\n", + "- `categorical_attributes`: list of categorical edge attributes\n", + "- `continuous_attributes`: list of continuous edge attributes\n", + "- `avg_node_degree`: average degree of nodes in the graph\n", + "- `global_max_component_size`: size of largest global max component in the graph\n", + "- `continuous_distribution`: dictionary of statistical properties for each continuous attribute\n", + "- `categorical_distribution`: dictionary of statistical properties for each categorical attribute\n", + "\n", + "The `continuous_distribution` and `categorical_distribution` dictionaries list statistical properties for each edge attribute in the graph:\n", + "```\n", + "continuous_distribution = {\n", + " \"name\": ...,\n", + " \"scale\": ...,\n", + " \"properties\": ...,\n", + "}\n", + "```\n", + "```\n", + "categorical_distribution = {\n", + " \"bin_counts\": ...,\n", + " \"bin_edges\": ...,\n", + "}\n", + "```\n", + "Description of each attribute:\n", + "- Continuous distribution:\n", + " - `name`: name of the distribution\n", + " - `scale`: negative log likelihood used to scale distributions and compare them in 
`GraphProfiler`\n", + " - `properties`: list of distribution props\n", + "- Categorical distribution:\n", + " - `bin_counts`: histogram bin counts\n", + " - `bin_edges`: histogram bin edges\n", + "\n", + "`properties` lists the following distribution properties: [optional: shape, loc, scale, mean, variance, skew, kurtosis]. The list can be either 6 length or 7 length depending on the distribution (extra shape parameter):\n", + "- 6 length: norm, uniform, expon, logistic\n", + "- 7 length: gamma, lognorm\n", + " - gamma: shape=`a` (float)\n", + " - lognorm: shape=`s` (float)\n", + " \n", + "For more information on shape parameters `a` and `s`: https://docs.scipy.org/doc/scipy/tutorial/stats.html#shape-parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile\n", + "Below you will see an example of how a Graph Profile can be saved and loaded again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The default save filepath is profile-.pkl\n", + "profile.save(filepath=\"profile.pkl\")\n", + "\n", + "new_profile = dp.GraphProfiler.load(\"profile.pkl\")\n", + "new_report = new_profile.report()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pp.pprint(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Difference in Data\n", + "If we wanted to ensure that this new profile was the same as the previous profile that we loaded, we could compare them using the diff functionality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "diff = profile.diff(new_profile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pp.pprint(diff)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another use for diff might be to provide differences between training and testing profiles as shown in the cell below.\n", + "We will use the profile above as the training profile and create a new profile to represent the testing profile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_profile = profile\n", + "\n", + "testing_data = dp.Data(os.path.join(data_path, \"csv/graph-differentiator-input-positive.csv\"))\n", + "testing_profile = dp.Profiler(testing_data)\n", + "\n", + "test_train_diff = training_profile.diff(testing_profile)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below you can observe the difference between the two profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pp.pprint(test_train_diff)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have shown the graph pipeline in the DataProfiler. It works similarly to the current DataProfiler implementation." 
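+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a closing sketch, individual fields can be pulled straight out of the report dictionary produced by `profile.report()` above. The key names follow the profile skeleton described earlier; the exact nesting of the per-attribute distribution entries is an assumption here and may vary, so treat this as illustrative rather than definitive:\n", + "```python\n", + "# `report` comes from the earlier profile.report() call\n", + "print(report[\"num_nodes\"], report[\"num_edges\"])\n", + "print(report[\"avg_node_degree\"])\n", + "\n", + "# assumed layout: one entry per attribute, each with name/scale/properties\n", + "for attribute, dist in report[\"continuous_distribution\"].items():\n", + "    if dist:  # skip attributes without a fitted continuous distribution\n", + "        print(attribute, dist[\"name\"], dist[\"scale\"])\n", + "```"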
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/labeler.ipynb b/docs/0.12.0/doctrees/nbsphinx/labeler.ipynb new file mode 100644 index 000000000..af31b68c5 --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/labeler.ipynb @@ -0,0 +1,650 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "spoken-reunion", + "metadata": {}, + "source": [ + "# Sensitive Data Detection with the Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "interesting-bidder", + "metadata": {}, + "source": [ + "In this example, we utilize the Labeler component of the Data Profiler to detect the sensitive information for both structured and unstructured data. In addition, we show how to train the Labeler on some specific dataset with different list of entities.\n", + "\n", + "First, let's dive into what the Labeler is." + ] + }, + { + "cell_type": "markdown", + "id": "1965b83b", + "metadata": {}, + "source": [ + "## What is the Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "388c643f", + "metadata": {}, + "source": [ + "The Labeler is a pipeline designed to make building, training, and predictions with ML models quick and easy. There are 3 major components to the Labeler: the preprocessor, the model, and the postprocessor." + ] + }, + { + "cell_type": "markdown", + "id": "e5d0aeb4", + "metadata": {}, + "source": [ + "![alt text](DL-Flowchart.png \"Title\")" + ] + }, + { + "cell_type": "markdown", + "id": "550323c7", + "metadata": {}, + "source": [ + "Each component can be switched out individually to suit your needs. As you might expect, the preprocessor takes in raw data and prepares it for the model, the model performs the prediction or training, and the postprocessor takes prediction results and turns them into human-readable results. \n", + "\n", + "Now let's run some examples. Start by importing all the requirements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "scientific-stevens", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "\n", + "# remove extra tf loggin\n", + "import tensorflow as tf\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "5125b215", + "metadata": {}, + "source": [ + "## Structured Data Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "wicked-devon", + "metadata": {}, + "source": [ + "We'll use the aws honeypot dataset in the test folder for this example. First, look at the data using the Data Reader class of the Data Profiler. 
This dataset is from the U.S. Department of Education, [found here!](https://data.ed.gov/dataset/college-scorecard-all-data-files-through-6-2020/resources?resource=823ac095-bdfc-41b0-b508-4e8fc3110082)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adjusted-native", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df_data = data.data\n", + "df_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ab6ccf8a", + "metadata": {}, + "source": [ + "We can directly predict the labels of a structured dataset on the cell level." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19529af4", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='structured')\n", + "\n", + "# print out the labels and label mapping\n", + "print(\"Labels: {}\".format(labeler.labels)) \n", + "print(\"\\n\")\n", + "print(\"Label Mapping: {}\".format(labeler.label_mapping))\n", + "print(\"\\n\")\n", + "\n", + "# make predictions and get labels for each cell going row by row\n", + "# predict options are model dependent and the default model can show prediction confidences\n", + "predictions = labeler.predict(data, predict_options={\"show_confidences\": True})\n", + "\n", + "# display prediction results\n", + "print(\"Predictions: {}\".format(predictions['pred']))\n", + "print(\"\\n\")\n", + "\n", + "# display confidence results\n", + "print(\"Confidences: {}\".format(predictions['conf']))" + ] + }, + { + "cell_type": "markdown", + "id": "2af72e2c", + "metadata": {}, + "source": [ + "The profiler uses the Labeler to perform column-by-column predictions. The data contains 11 columns, each of which has a data label. Next, we will use the Labeler of the Data Profiler to predict the label for each column in this tabular dataset. Since we are only going to demo the labeling functionality, other options of the Data Profiler are disabled to keep this quick."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6cb9d7e-149a-4cfe-86f8-76c47c57aeea", + "metadata": {}, + "outputs": [], + "source": [ + "# helper functions for printing results\n", + "\n", + "def get_structured_results(results):\n", + " \"\"\"Helper function to get data labels for each column.\"\"\"\n", + " columns = []\n", + " predictions = []\n", + " samples = []\n", + " for col in results['data_stats']:\n", + " columns.append(col['column_name'])\n", + " predictions.append(col['data_label'])\n", + " samples.append(col['samples'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions, 'Sample': samples})\n", + " return df_results\n", + "\n", + "def get_unstructured_results(data, results):\n", + " \"\"\"Helper function to get data labels for each labeled piece of text.\"\"\"\n", + " labeled_data = []\n", + " for pred in results['pred'][0]:\n", + " labeled_data.append([data[0][pred[0]:pred[1]], pred[2]])\n", + " label_df = pd.DataFrame(labeled_data, columns=['Text', 'Labels'])\n", + " return label_df\n", + " \n", + "\n", + "pd.set_option('display.width', 100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "secret-million", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# set options to only run the labeler\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"structured_options.text.is_enabled\": False, \n", + " \"int.is_enabled\": False, \n", + " \"float.is_enabled\": False, \n", + " \"order.is_enabled\": False, \n", + " \"category.is_enabled\": False, \n", + " \"chi2_homogeneity.is_enabled\": False,\n", + " \"datetime.is_enabled\": False,})\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "\n", + "results = profile.report() \n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "fatty-louisville", + "metadata": {}, + "source": [ + "In this example, the results show that the Data Profiler is able to detect integers, URLs, address, and floats appropriately. Unknown is typically strings of text, which is appropriate for those columns." + ] + }, + { + "cell_type": "markdown", + "id": "unavailable-diploma", + "metadata": {}, + "source": [ + "## Unstructured Data Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "metallic-coaching", + "metadata": {}, + "source": [ + "Besides structured data, the Labeler detects the sensitive information on the unstructured text. We use a sample of spam email in Enron email dataset for this demo. As above, we start investigating the content of the given email sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unauthorized-lounge", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# load data\n", + "data = \"Message-ID: <11111111.1111111111111.JavaMail.evans@thyme>\\n\" + \\\n", + " \"Date: Fri, 10 Aug 2005 11:31:37 -0700 (PDT)\\n\" + \\\n", + " \"From: w..smith@company.com\\n\" + \\\n", + " \"To: john.smith@company.com\\n\" + \\\n", + " \"Subject: RE: ABC\\n\" + \\\n", + " \"Mime-Version: 1.0\\n\" + \\\n", + " \"Content-Type: text/plain; charset=us-ascii\\n\" + \\\n", + " \"Content-Transfer-Encoding: 7bit\\n\" + \\\n", + " \"X-From: Smith, Mary W. 
\\n\" + \\\n", + " \"X-To: Smith, John \\n\" + \\\n", + " \"X-cc: \\n\" + \\\n", + " \"X-bcc: \\n\" + \\\n", + " \"X-Folder: \\SSMITH (Non-Privileged)\\Sent Items\\n\" + \\\n", + " \"X-Origin: Smith-S\\n\" + \\\n", + " \"X-FileName: SSMITH (Non-Privileged).pst\\n\\n\" + \\\n", + " \"All I ever saw was the e-mail from the office.\\n\\n\" + \\\n", + " \"Mary\\n\\n\" + \\\n", + " \"-----Original Message-----\\n\" + \\\n", + " \"From: Smith, John \\n\" + \\\n", + " \"Sent: Friday, August 10, 2005 13:07 PM\\n\" + \\\n", + " \"To: Smith, Mary W.\\n\" + \\\n", + " \"Subject: ABC\\n\\n\" + \\\n", + " \"Have you heard any more regarding the ABC sale? I guess that means that \" + \\\n", + " \"it's no big deal here, but you think they would have send something.\\n\\n\\n\" + \\\n", + " \"John Smith\\n\" + \\\n", + " \"123-456-7890\\n\"\n", + "\n", + "# convert string data to list to feed into the labeler\n", + "data = [data]" + ] + }, + { + "cell_type": "markdown", + "id": "concerned-segment", + "metadata": {}, + "source": [ + "By default, the Labeler predicts the results at the character level for unstructured text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "junior-acrobat", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='unstructured')\n", + "\n", + "# make predictions and get labels per character\n", + "predictions = labeler.predict(data)\n", + "\n", + "# display results\n", + "print(predictions['pred'])" + ] + }, + { + "cell_type": "markdown", + "id": "individual-diabetes", + "metadata": {}, + "source": [ + "In addition to the character-level result, the Labeler provides the results at the word level following the standard NER (Named Entity Recognition), e.g., utilized by spaCy. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "optical-universe", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# convert prediction to word format and ner format\n", + "# Set the output to the NER format (start position, end position, label)\n", + "labeler.set_params(\n", + " { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } } \n", + ")\n", + "\n", + "# make predictions and get labels per character\n", + "predictions = labeler.predict(data)\n", + "\n", + "# display results\n", + "print('\\n')\n", + "print('=======================Prediction======================\\n')\n", + "for pred in predictions['pred'][0]:\n", + " print('{}: {}'.format(data[0][pred[0]: pred[1]], pred[2]))\n", + " print('--------------------------------------------------------')" + ] + }, + { + "cell_type": "markdown", + "id": "behavioral-tourism", + "metadata": {}, + "source": [ + "Here, the Labeler is able to identify sensitive information such as datetime, email address, person names, and phone number in an email sample. " + ] + }, + { + "cell_type": "markdown", + "id": "nasty-disney", + "metadata": {}, + "source": [ + "## Train the Labeler from Scratch" + ] + }, + { + "cell_type": "markdown", + "id": "destroyed-twist", + "metadata": {}, + "source": [ + "The Labeler can be trained from scratch with a new list of labels. Below, we show an example of training the Labeler on a dataset with labels given as the columns of that dataset. For brevity's sake, let's only train a few epochs with a subset of a dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "utility-evaluation", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df = data.data[[\"OPEID6\", \"INSTURL\", \"SEARCH_STRING\"]]\n", + "df.head()\n", + "\n", + "# split data to training and test set\n", + "split_ratio = 0.2\n", + "df = df.sample(frac=1).reset_index(drop=True)\n", + "data_train = df[:int((1 - split_ratio) * len(df))]\n", + "data_test = df[int((1 - split_ratio) * len(df)):]\n", + "\n", + "# train a new labeler with column names as labels\n", + "if not os.path.exists('data_labeler_saved'):\n", + " os.makedirs('data_labeler_saved')\n", + "\n", + "labeler = dp.train_structured_labeler(\n", + " data=data_train,\n", + " save_dirpath=\"data_labeler_saved\",\n", + " epochs=10,\n", + " default_label=\"OPEID6\"\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "utility-torture", + "metadata": {}, + "source": [ + "The trained Labeler is then used by the Data Profiler to provide the prediction on the new dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "answering-panel", + "metadata": {}, + "outputs": [], + "source": [ + "# predict with the labeler object\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': labeler})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "polish-stand", + "metadata": {}, + "source": [ + "Another way to use the trained Labeler is through the directory path of the saved labeler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "industrial-characterization", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# predict with the labeler loaded from path\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_dirpath': 'data_labeler_saved'})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "2acedba0", + "metadata": {}, + "source": [ + "## Transfer Learning a Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "2f15fb1f", + "metadata": {}, + "source": [ + "Instead of training a model from scratch, we can also transfer learn to improve the model and/or extend the labels. Again for brevity's sake, let's only train a few epochs with a small dataset at the cost of accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0104c374", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df_data = data.data[[\"OPEID6\", \"INSTURL\", \"SEARCH_STRING\"]]\n", + "\n", + "\n", + "# prep data\n", + "df_data = df_data.reset_index(drop=True).melt()\n", + "df_data.columns = [1, 0] # labels=1, values=0 in that order\n", + "df_data = df_data.astype(str)\n", + "new_labels = df_data[1].unique().tolist()\n", + "\n", + "# load structured Labeler w/ trainable set to True\n", + "labeler = dp.DataLabeler(labeler_type='structured', trainable=True)\n", + "\n", + "# Reconstruct the model to add each new label\n", + "for label in new_labels:\n", + " labeler.add_label(label)\n", + "\n", + "# this will use transfer learning to retrain the labeler on your new\n", + "# dataset and labels.\n", + "# Setting labels with a list of labels or label mapping will overwrite the existing labels with new ones\n", + "# Setting the reset_weights parameter to false allows transfer learning to occur\n", + "model_results = labeler.fit(x=df_data[0], y=df_data[1], validation_split=0.2, \n", + " epochs=10, labels=None, reset_weights=False)" + ] + }, + { + "cell_type": "markdown", + "id": "ae78745f", + "metadata": {}, + "source": [ + "Let's display the training results of the last epoch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b764aa8c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"{:16s} Precision Recall F1-score Support\".format(\"\"))\n", + "for item in model_results[-1][2]:\n", + " print(\"{:16s} {:4.3f} {:4.3f} {:4.3f} {:7.0f}\".format(item,\n", + " model_results[-1][2][item][\"precision\"],\n", + " model_results[-1][2][item][\"recall\"],\n", + " model_results[-1][2][item][\"f1-score\"],\n", + " model_results[-1][2][item][\"support\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "44009522", + "metadata": {}, + "source": [ + "It is now trained to detect additional labels! The model results here show all the labels training accuracy. Since only new labels existed in the dataset, only the new labels are given accuracy scores. Keep in mind this is a small dataset for brevity's sake and that real training would involve more samples and better results." + ] + }, + { + "cell_type": "markdown", + "id": "e110ee1c", + "metadata": {}, + "source": [ + "## Saving and Loading a Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "c484d193", + "metadata": {}, + "source": [ + "The Labeler can easily be saved or loaded with one simple line." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d8684fa", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure save directory exists\n", + "if not os.path.exists('my_labeler'):\n", + " os.makedirs('my_labeler')\n", + "\n", + "# Saving the labeler\n", + "labeler.save_to_disk(\"my_labeler\")\n", + "\n", + "# Loading the labeler\n", + "labeler = dp.DataLabeler(labeler_type='structured', dirpath=\"my_labeler\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d36dec8", + "metadata": {}, + "source": [ + "## Building a Labeler from the Ground Up" + ] + }, + { + "cell_type": "markdown", + "id": "59346d2b", + "metadata": {}, + "source": [ + "As mentioned earlier, the labeler is comprised of three components, and each of the compenents can be created and interchanged in the the labeler pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6506ef97", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "from dataprofiler.labelers.character_level_cnn_model import \\\n", + " CharacterLevelCnnModel\n", + "from dataprofiler.labelers.data_processing import \\\n", + " StructCharPreprocessor, StructCharPostprocessor\n", + "\n", + "model = CharacterLevelCnnModel({\"PAD\":0, \"UNKNOWN\":1, \"Test_Label\":2})\n", + "preprocessor = StructCharPreprocessor()\n", + "postprocessor = StructCharPostprocessor()\n", + "\n", + "labeler = dp.DataLabeler(labeler_type='structured')\n", + "labeler.set_preprocessor(preprocessor)\n", + "labeler.set_model(model)\n", + "labeler.set_postprocessor(postprocessor)\n", + "\n", + "# check for basic compatibility between the processors and the model\n", + "labeler.check_pipeline()\n", + "\n", + "# Optionally set the parameters\n", + "parameters={\n", + " 'preprocessor':{\n", + " 'max_length': 100,\n", + " },\n", + " 'model':{\n", + " 'max_length': 100,\n", + " },\n", + " 'postprocessor':{\n", + " 'random_state': random.Random(1)\n", + " }\n", + "} \n", + "labeler.set_params(parameters)\n", + "\n", + "labeler.help()" + ] + }, + { + "cell_type": "markdown", + "id": "5f020d7f", + "metadata": {}, + "source": [ + "The components can each be created if you inherit the BaseModel and BaseProcessor for the model and processors, respectively. More info can be found about coding your own components in the Labeler section of the [documentation]( https://capitalone.github.io/dataprofiler). In summary, the Data Profiler open source library can be used to scan sensitive information in both structured and unstructured data with different file types. It supports multiple input formats and output formats at word and character levels. Users can also train the labeler on their own datasets." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/merge_profile_list.ipynb b/docs/0.12.0/doctrees/nbsphinx/merge_profile_list.ipynb new file mode 100644 index 000000000..7a6d8005a --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/merge_profile_list.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "60af5256", + "metadata": {}, + "source": [ + "# Merge List of Profiles\n", + "\n", + "This is an example of a new utils in the dataprofiler for distributed merging of profile objects. This assumes the user is providing a list of profile objects to the utils function for merging all the profiles together." + ] + }, + { + "cell_type": "markdown", + "id": "7eee37ff", + "metadata": {}, + "source": [ + "## Imports\n", + "\n", + "Let's start by importing the necessary packages..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0d27009", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + " from dataprofiler.profilers.profiler_utils import merge_profile_list\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + " from dataprofiler.profilers.profiler_utils import merge_profile_list\n", + "\n", + "# remove extra tf loggin\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "b4369e64", + "metadata": {}, + "source": [ + "## Setup the Data and Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "410c3c4d", + "metadata": {}, + "source": [ + "This section shows the basic example of the Data Profiler. \n", + "\n", + "1. Instantiate a Pandas dataframe with dummy data\n", + "2. Pass the dataframe to the `Profiler` and instantiate two separate profilers in a list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3567c82", + "metadata": {}, + "outputs": [], + "source": [ + "d = {'col1': [1, 2], 'col2': [3, 4]}\n", + "df = pd.DataFrame(data=d)\n", + "\n", + "list_of_profiles = [dp.Profiler(df), dp.Profiler(df)]" + ] + }, + { + "cell_type": "markdown", + "id": "350502eb", + "metadata": {}, + "source": [ + "Take a look at the list of profiles... " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b649db32", + "metadata": {}, + "outputs": [], + "source": [ + "list_of_profiles" + ] + }, + { + "cell_type": "markdown", + "id": "4ed4fc12", + "metadata": {}, + "source": [ + "## Run Merge on List of Profiles\n", + "\n", + "Now let's merge the list of profiles into a `single_profile`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a636047", + "metadata": {}, + "outputs": [], + "source": [ + "single_profile = merge_profile_list(list_of_profiles=list_of_profiles)" + ] + }, + { + "cell_type": "markdown", + "id": "0aa88720", + "metadata": {}, + "source": [ + "And check out the `.report` on the single profile:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34059c21", + "metadata": {}, + "outputs": [], + "source": [ + "single_profile.report()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dataprofiler", + "language": "python", + "name": "dataprofiler" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/overview.ipynb b/docs/0.12.0/doctrees/nbsphinx/overview.ipynb new file mode 100644 index 000000000..d5e77abe4 --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/overview.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc2826d9", + "metadata": {}, + "source": [ + "# Data Profiler - What's in your data?" + ] + }, + { + "cell_type": "markdown", + "id": "b997522b", + "metadata": {}, + "source": [ + "This introductory jupyter notebook demonstrates the basic usages of the Data Profiler. The library is designed to easily detect sensitive data and gather statistics on your datasets with just several lines of code. 
The Data Profiler can handle several different data types including: CSV (or any delimited file), JSON, Parquet, AVRO, and text. Additionally, there are a plethora of options to customize your profile. This library also has the ability to update profiles from multiple batches of large datasets, or merge multiple profiles. In particular, this example covers the followings:\n", + "\n", + "- Basic usage of the Data Profiler\n", + "- The data reader class\n", + "- Profiler options\n", + "- Updating profiles and merging profiles\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef404c84", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "markdown", + "id": "f51971e3", + "metadata": {}, + "source": [ + "## Basic Usage of the Data Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "639e66d3", + "metadata": {}, + "source": [ + "This section shows the basic example of the Data Profiler. A CSV dataset is read using the data reader, then the Data object is given to the Data Profiler to detect sensitive data and obtain the statistics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5379c45c", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# use data reader to read input data\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "print(data.data.head())\n", + "\n", + "# run data profiler and get the report\n", + "profile = dp.Profiler(data)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "57fe2827", + "metadata": {}, + "source": [ + "The report includes `global_stats` and `data_stats` for the given dataset. The former contains overall properties of the data such as number of rows/columns, null ratio, duplicate ratio, while the latter contains specific properties and statistics for each column such as detected data label, min, max, mean, variance, etc. In this example, the `compact` format of the report is used to shorten the full list of the results. To get more results related to detailed predictions at the entity level from the Data Labeler component or histogram results, the format `pretty` should be used." + ] + }, + { + "cell_type": "markdown", + "id": "74027cfd", + "metadata": {}, + "source": [ + "## Data reader class" + ] + }, + { + "cell_type": "markdown", + "id": "41364888", + "metadata": {}, + "source": [ + "DataProfiler can detect multiple file types including CSV (or any delimited file), JSON, Parquet, AVRO, and text. The example below shows that it successfully detects data types from multiple categories regardless of the file extensions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "823829f4", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data with different file types\n", + "csv_files = [\n", + " \"csv/aws_honeypot_marx_geo.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "json_files = [\n", + " \"json/complex_nested.json\",\n", + " \"json/honeypot_intentially_mislabeled_file.csv\", # json file with the .csv extension\n", + "]\n", + "parquet_files = [\n", + " \"parquet/nation.dict.parquet\",\n", + " \"parquet/nation.plain.intentionally_mislabled_file.csv\", # parquet file with the .csv extension\n", + "]\n", + "avro_files = [\n", + " \"avro/userdata1.avro\",\n", + " \"avro/userdata1_intentionally_mislabled_file.json\", # avro file with the .json extension\n", + "]\n", + "text_files = [\n", + " \"txt/discussion_reddit.txt\",\n", + "]\n", + "\n", + "all_files = {\n", + " \"csv\": csv_files,\n", + " \"json\": json_files,\n", + " \"parquet\": parquet_files,\n", + " \"avro\": avro_files,\n", + " \"text\": text_files\n", + "}\n", + "\n", + "for file_type in all_files:\n", + " print(file_type)\n", + " for file in all_files[file_type]:\n", + " data = dp.Data(os.path.join(data_path, file))\n", + " print(\"{:<85} {:<15}\".format(file, data.data_type))\n", + " print(\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f9d7e02", + "metadata": {}, + "source": [ + "The `Data` class detects the file type and uses one of the following classes: `CSVData`, `JSONData`, `ParquetData`, `AVROData`, `TextData`. Users can call these specific classes directly if desired. For example, below we provide a collection of data with different types, each of them is processed by the corresponding data class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "831e68a3", + "metadata": {}, + "outputs": [], + "source": [ + "# use individual data reader classes\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "from dataprofiler.data_readers.json_data import JSONData\n", + "from dataprofiler.data_readers.parquet_data import ParquetData\n", + "from dataprofiler.data_readers.avro_data import AVROData\n", + "from dataprofiler.data_readers.text_data import TextData\n", + "\n", + "csv_files = \"csv/aws_honeypot_marx_geo.csv\"\n", + "json_files = \"json/complex_nested.json\"\n", + "parquet_files = \"parquet/nation.dict.parquet\"\n", + "avro_files = \"avro/userdata1.avro\"\n", + "text_files = \"txt/discussion_reddit.txt\"\n", + "\n", + "all_files = {\n", + " \"csv\": [csv_files, CSVData],\n", + " \"json\": [json_files, JSONData],\n", + " \"parquet\": [parquet_files, ParquetData],\n", + " \"avro\": [avro_files, AVROData],\n", + " \"text\": [text_files, TextData],\n", + "}\n", + "\n", + "for file_type in all_files:\n", + " file, data_reader = all_files[file_type]\n", + " data = data_reader(os.path.join(data_path, file))\n", + " print(\"File name {}\\n\".format(file))\n", + " if file_type == \"text\":\n", + " print(data.data[0][:1000]) # print the first 1000 characters\n", + " else:\n", + " print(data.data)\n", + " print('===============================================================================')" + ] + }, + { + "cell_type": "markdown", + "id": "572df0a8", + "metadata": {}, + "source": [ + "In addition to reading the input data from multiple file types, the Data Profiler allows the input data as a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df87ab83", + "metadata": {}, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=[\"col_int\", \"col_float\"])\n", + "profile = dp.Profiler(my_dataframe)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "84a06312", + "metadata": {}, + "source": [ + "## Structured Profiler vs. Unstructured Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "4c0ea925", + "metadata": {}, + "source": [ + "The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify profile type as well. Here is an example of the the profiler explicitly calling the structured profile and the unstructured profile." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f4565d8", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Using the structured profiler\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data, profiler_type='structured')\n", + "\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))\n", + "\n", + "# Using the unstructured profiler\n", + "my_dataframe = pd.DataFrame([[\"Sample1\"],[\"Sample2\"],[\"Sample3\"]], columns=[\"Text_Samples\"])\n", + "profile = dp.Profiler(my_dataframe, profiler_type='unstructured')\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b16648ba", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "8b0cc8ad", + "metadata": {}, + "source": [ + "The Data Profiler can enable/disable statistics and modify features through profiler options. For example, if the users only want the statistics information, they may turn off the Data Labeler functionality. Below, let's remove the histogram and data labeler component while running Data Profiler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbac3a2c", + "metadata": {}, + "outputs": [], + "source": [ + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"histogram_and_quantiles.is_enabled\": False,\n", + " \"median_abs_deviation.is_enabled\": False,\n", + " \"median.is_enabled\": False,\n", + " \"mode.is_enabled\": False,\n", + " \"data_labeler.is_enabled\": False,})\n", + "\n", + "profile = dp.Profiler(my_dataframe, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "590ca50b", + "metadata": {}, + "source": [ + "Besides toggling on and off features, other options like the data labeler sample size or histogram bin method can be directly set and validated as shown here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ed21bc1", + "metadata": {}, + "outputs": [], + "source": [ + "profile_options = dp.ProfilerOptions()\n", + "profile_options.structured_options.data_labeler.sample_size = 1\n", + "profile_options.structured_options.int.histogram_and_quantiles.bin_count_or_method = \"rice\"\n", + "# An error will raise if the options are set incorrectly.\n", + "profile_options.validate()\n", + "\n", + "profile = dp.Profiler(my_dataframe, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "9f690616", + "metadata": {}, + "source": [ + "## Update profiles" + ] + }, + { + "cell_type": "markdown", + "id": "965f8c85", + "metadata": {}, + "source": [ + "One of the interesting features of the Data Profiler is the ability to update profiles from batches of data, which allows for data streaming usage. In this section, the original dataset is separated into two batches with equal size. Each batch is then updated with Data Profiler sequentially. 
\n", + "\n", + "After the update, we expect the resulted profiles give the same statistics as the profiles updated from the full dataset. We will verify that through some properties in `global_stats` of the profiles including `column_count`, `row_count`, `row_is_null_ratio`, `duplicate_row_count`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34ac4346", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# read the input data and devide it into two equal halves\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "df = data.data\n", + "df1 = df.iloc[:int(len(df)/2)]\n", + "df2 = df.iloc[int(len(df)/2):]\n", + "\n", + "# Update the profile with the first half\n", + "profile = dp.Profiler(df1)\n", + "\n", + "# Update the profile with the second half\n", + "profile.update_profile(df2)\n", + "\n", + "# Update profile with the full dataset\n", + "profile_full = dp.Profiler(df)\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "report_full = profile_full.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report, indent=4))\n", + "print(json.dumps(report_full, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b41ee2bf", + "metadata": {}, + "source": [ + "You can see that the profiles are exactly the same whether they are broken into several updates or not." + ] + }, + { + "cell_type": "markdown", + "id": "c547f051", + "metadata": {}, + "source": [ + "## Merge profiles" + ] + }, + { + "cell_type": "markdown", + "id": "a5292962", + "metadata": {}, + "source": [ + "In addition to the profile update, Data Profiler provides the merging functionality which allows users to combine the profiles updated from multiple locations. This enables Data Profiler to be used in a distributed computing environment. Below, we assume that the two aforementioned halves of the original dataset come from two different machines. Each of them is then updated with the Data Profiler on the same machine, then the resulted profiles are merged.\n", + "\n", + "As with the profile update, we expect the merged profiles give the same statistics as the profiles updated from the full dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a565b8d1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Update the profile with the first half\n", + "profile1 = dp.Profiler(df1)\n", + "\n", + "# Update the profile with the second half\n", + "profile2 = dp.Profiler(df2)\n", + "\n", + "# merge profiles\n", + "profile_merge = profile1 + profile2\n", + "\n", + "# check results of the merged profile\n", + "report_merge = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report_merge, indent=4))\n", + "print(json.dumps(report_full, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b77fac3f", + "metadata": {}, + "source": [ + "You can see that the profiles are exactly the same!" + ] + }, + { + "cell_type": "markdown", + "id": "c644ee42", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "We have walked through some basic examples of Data Profiler usage, with different input data types and profiling options. We also work with update and merging functionality of the Data Profiler, which make it applicable for data streaming and distributed environment. 
Interested users can experiment with different datasets and functionalities as desired." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/popmon_dp_loader_example.ipynb b/docs/0.12.0/doctrees/nbsphinx/popmon_dp_loader_example.ipynb new file mode 100644 index 000000000..3ddb267da --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/popmon_dp_loader_example.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7f0cceea", + "metadata": {}, + "source": [ + "# Dataloader with Popmon Reports" + ] + }, + { + "cell_type": "markdown", + "id": "9e79d9c5", + "metadata": {}, + "source": [ + "This demo covers the usage of popmon with the dataloader from the dataprofiler.\n", + "\n", + "This demo covers the following:\n", + "\n", + " - How to install popmon\n", + " - Comparison of the dynamic dataloader from dataprofiler to the \n", + " standard dataloader used in pandas\n", + " - Popmon's usage example using both dataloaders\n", + " - Dataprofiler's examples using both dataloaders\n", + " - Usage of the pm_stability_report function (popmon reports)\n" + ] + }, + { + "cell_type": "markdown", + "id": "aec2198a", + "metadata": {}, + "source": [ + "## How to Install Popmon\n", + "To install popmon, you can use the command below:" + ] + }, + { + "cell_type": "markdown", + "id": "4383ed2a", + "metadata": {}, + "source": [ + "`pip3 install popmon`\n" + ] + }, + { + "cell_type": "markdown", + "id": "91dedc34", + "metadata": {}, + "source": [ + "From here, we can import the libraries needed for this demo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2adec556", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "import pandas as pd\n", + "import popmon # noqa" + ] + }, + { + "cell_type": "markdown", + "id": "2ed532ec", + "metadata": {}, + "source": [ + "## Comparison of Dataloaders" + ] + }, + { + "cell_type": "markdown", + "id": "cccbf4cd", + "metadata": {}, + "source": [ + "First, we have the original pandas data loading, which works for specific file types. \n", + "This is useful if the data format is known ahead of time, but is less helpful for more dynamic cases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96e9ff89", + "metadata": {}, + "outputs": [], + "source": [ + "def popmon_dataloader(path, time_index):\n", + " # Load pm dataframe (can only read CSVs unless the reader option is changed)\n", + " if time_index is not None:\n", + " pm_data = pd.read_csv(path, parse_dates=[time_index])\n", + " else:\n", + " time_index = True\n", + " pm_data = pd.read_csv(path)\n", + " return pm_data" + ] + }, + { + "cell_type": "markdown", + "id": "16dfbe10", + "metadata": {}, + "source": [ + "Next, we have the dataprofiler's dataloader. This allows for the dynamic loading of different data formats, which is very useful when the data format is not known ahead of time.\n", + "This is intended to be an improvement on the standard pandas dataloader."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07481259", + "metadata": {}, + "outputs": [], + "source": [ + "def dp_dataloader(path):\n", + " # Datalaoder from dataprofiler used\n", + " dp_data = dp.Data(path)\n", + " \n", + " # Profiler used to ensure proper label for datetime even \n", + " # when null values exist\n", + " profiler_options = dp.ProfilerOptions()\n", + " profiler_options.set({'*.is_enabled': False, # Runs first disabling all options in profiler\n", + " '*.datetime.is_enabled': True})\n", + " profile = dp.Profiler(dp_data, options=profiler_options)\n", + "\n", + " # convert any time/datetime types from strings to actual datatime type\n", + " for ind, col in enumerate(dp_data.data.columns):\n", + " if profile.profile[ind].profile.get('data_type') == 'datetime':\n", + " dp_data.data[col] = pd.to_datetime(dp_data.data[col])\n", + "\n", + " return dp_data.data" + ] + }, + { + "cell_type": "markdown", + "id": "69a8ea9b", + "metadata": {}, + "source": [ + "## Popmon's usage example using both dataloaders" + ] + }, + { + "cell_type": "markdown", + "id": "ff914ca7", + "metadata": {}, + "source": [ + "Next, we'll download a dataset from the resources component" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bff33da8", + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "import shutil\n", + "popmon_tutorial_data = popmon.resources.data(\"flight_delays.csv.gz\")\n", + "with gzip.open(popmon_tutorial_data, 'rb') as f_in:\n", + " with open('./flight_delays.csv', 'wb') as f_out:\n", + " shutil.copyfileobj(f_in, f_out)" + ] + }, + { + "cell_type": "markdown", + "id": "19222c4a", + "metadata": {}, + "source": [ + "Finally we read in the data with popmon and print the report to a file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0090a2f3", + "metadata": {}, + "outputs": [], + "source": [ + "# Default csv from popmon example\n", + "path = \"./flight_delays.csv\"\n", + "time_index = \"DATE\"\n", + "report_output_dir = \"./popmon_output/flight_delays_full\"\n", + "if not os.path.exists(report_output_dir):\n", + " os.makedirs(report_output_dir)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0abcd9b", + "metadata": {}, + "outputs": [], + "source": [ + "pm_data = popmon_dataloader(path, time_index)\n", + "\n", + "report_pm_loader = pm_data.pm_stability_report(\n", + " time_axis=time_index,\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " pull_rules={\"*_pull\": [10, 7, -7, -10]},\n", + ")\n", + "\n", + "# Save popmon reports\n", + "report_pm_loader.to_file(os.path.join(report_output_dir, \"popmon_loader_report.html\"))\n", + "print(\"Report printed at:\", os.path.join(report_output_dir, \"popmon_loader_report.html\"))" + ] + }, + { + "cell_type": "markdown", + "id": "2303b5cf", + "metadata": {}, + "source": [ + "We then do the same for the dataprofiler loader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2854383", + "metadata": {}, + "outputs": [], + "source": [ + "dp_dataframe = dp_dataloader(path)\n", + "# Generate pm report using dp dataloader\n", + "report_dp_loader = dp_dataframe.pm_stability_report(\n", + " time_axis=time_index,\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " pull_rules={\"*_pull\": [10, 7, -7, -10]},\n", + ")\n", + "\n", + "# Save popmon reports\n", + "report_dp_loader.to_file(os.path.join(report_output_dir, 
\"dataprofiler_loader_report.html\"))\n", + "print(\"Report printed at:\", os.path.join(report_output_dir, \"dataprofiler_loader_report.html\"))" + ] + }, + { + "cell_type": "markdown", + "id": "8cc4e5f3", + "metadata": {}, + "source": [ + "## Examples of data\n", + "Next, We'll use some data from the test files of the data profiler to compare the dynamic loading of the dataprofiler's data loader to that of the standard pandas approach. \n" + ] + }, + { + "cell_type": "markdown", + "id": "352eaeea", + "metadata": {}, + "source": [ + "## Dataprofiler's examples using both dataloaders" + ] + }, + { + "cell_type": "markdown", + "id": "e99af913", + "metadata": {}, + "source": [ + "To execute this properly, simply choose one of the 3 examples below and then run the report generation below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80eb601d", + "metadata": {}, + "outputs": [], + "source": [ + "# Default csv from popmon example (mini version)\n", + "path = \"../dataprofiler/tests/data/csv/flight_delays.csv\"\n", + "time_index = \"DATE\"\n", + "report_output_dir = \"./popmon_output/flight_delays_mini\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c127288", + "metadata": {}, + "outputs": [], + "source": [ + "# Random csv from dataprofiler tests\n", + "path = \"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\"\n", + "time_index = \"datetime\"\n", + "report_output_dir = \"./popmon_output/aws_honeypot_marx_geo\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cd5c385", + "metadata": {}, + "outputs": [], + "source": [ + "# Random json file from dataprofiler tests\n", + "path = \"../dataprofiler/tests/data/json/math.json\"\n", + "\n", + "time_index = \"data.9\"\n", + "report_output_dir = \"./popmon_output/math\"" + ] + }, + { + "cell_type": "markdown", + "id": "ec860cb7", + "metadata": {}, + "source": [ + "Run the block below to create an output directory for your popmon reports." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf21835c", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(report_output_dir):\n", + " os.makedirs(report_output_dir)\n", + "dp_dataframe = dp_dataloader(path)" + ] + }, + { + "cell_type": "markdown", + "id": "479975a5", + "metadata": {}, + "source": [ + "## Report comparison" + ] + }, + { + "cell_type": "markdown", + "id": "02a355e7", + "metadata": {}, + "source": [ + "We generate reports using different sets of data from the dataprofiler and pandas below using dataprofiler's dataloader and popmons report generator\n" + ] + }, + { + "cell_type": "markdown", + "id": "6ce69145", + "metadata": {}, + "source": [ + "The dataprofiler's dataloader can seemlessly switch between data formats and generate reports with the exact same code in place." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0dcb405", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Generate pm report using dp dataloader\n", + "report_dp_loader = dp_dataframe.pm_stability_report(\n", + " time_axis=time_index,\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " pull_rules={\"*_pull\": [10, 7, -7, -10]},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9eb0035c", + "metadata": {}, + "source": [ + "If the dataloaders are valid, you can see the reports and compare them at the output directory specified in the printout below each report generation block (the two code blocks below)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efe7d8d6", + "metadata": {}, + "outputs": [], + "source": [ + "# Save dp reports\n", + "report_dp_loader.to_file(os.path.join(report_output_dir, \"dataprofiler_loader_report.html\"))\n", + "print(\"Report printed at:\", os.path.join(report_output_dir, \"dataprofiler_loader_report.html\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/profiler_example.ipynb b/docs/0.12.0/doctrees/nbsphinx/profiler_example.ipynb new file mode 100644 index 000000000..b6a4409c9 --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/profiler_example.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37ca393", + "metadata": {}, + "source": [ + "# Structured Profilers" + ] + }, + { + "cell_type": "markdown", + "id": "ff9bd095", + "metadata": {}, + "source": [ + "**Data profiling** - *is the process of examining a dataset and collecting statistical or informational summaries about said dataset.*\n", + "\n", + "The Profiler class inside the DataProfiler is designed to generate *data profiles* via the Profiler class, which ingests either a Data class or a Pandas DataFrame. \n", + "\n", + "Currently, the Data class supports loading the following file formats:\n", + "\n", + "* Any delimited (CSV, TSV, etc.)\n", + "* JSON object\n", + "* Avro\n", + "* Parquet\n", + "* Text files\n", + "* Pandas Series/Dataframe\n", + "\n", + "Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value (JSON) store as well as dataset wide information, such as the number of nulls, duplicates, etc.\n", + "\n", + "This example will look at specifically the structured data types for structured profiling. " + ] + }, + { + "cell_type": "markdown", + "id": "de58b9c4", + "metadata": {}, + "source": [ + "## Reporting" + ] + }, + { + "cell_type": "markdown", + "id": "8001185a", + "metadata": {}, + "source": [ + "One of the primary purposes of the Profiler are to quickly identify what is in the dataset. 
This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.\n", + "\n", + "In terms of reporting, there are multiple reporting options:\n", + "\n", + "* **Pretty**: Floats are rounded to four decimal places, and lists are shortened.\n", + "* **Compact**: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.\n", + "* **Serializable**: Output is json serializable and not prettified\n", + "* **Flat**: Nested Output is returned as a flattened dictionary\n", + "\n", + "The **Pretty** and **Compact** reports are the two most commonly used reports and includes `global_stats` and `data_stats` for the given dataset. `global_stats` contains overall properties of the data such as number of rows/columns, null ratio, duplicate ratio. `data_stats` contains specific properties and statistics for each column file such as min, max, mean, variance, etc.\n", + "\n", + "For structured profiles, the report looks like this:\n", + "\n", + "```\n", + "\"global_stats\": {\n", + " \"samples_used\": int,\n", + " \"column_count\": int,\n", + " \"row_count\": int,\n", + " \"row_has_null_ratio\": float,\n", + " \"row_is_null_ratio\": float, \n", + " \"unique_row_ratio\": float,\n", + " \"duplicate_row_count\": int,\n", + " \"file_type\": string,\n", + " \"encoding\": string,\n", + "},\n", + "\"data_stats\": [\n", + " {\n", + " \"column_name\": string,\n", + " \"data_type\": string,\n", + " \"data_label\": string,\n", + " \"categorical\": bool,\n", + " \"order\": string,\n", + " \"samples\": list(str),\n", + " \"statistics\": {\n", + " \"sample_size\": int,\n", + " \"null_count\": int,\n", + " \"null_types\": list(string),\n", + " \"null_types_index\": {\n", + " string: list(int)\n", + " },\n", + " \"data_type_representation\": [string, list(string)],\n", + " \"min\": [null, float],\n", + " \"max\": [null, float],\n", + " \"mean\": float,\n", + " \"variance\": float,\n", + " \"stddev\": float,\n", + " \"histogram\": { \n", + " \"bin_counts\": list(int),\n", + " \"bin_edges\": list(float),\n", + " },\n", + " \"quantiles\": {\n", + " int: float\n", + " }\n", + " \"vocab\": list(char),\n", + " \"avg_predictions\": dict(float), \n", + " \"data_label_representation\": dict(float),\n", + " \"categories\": list(str),\n", + " \"unique_count\": int,\n", + " \"unique_ratio\": float,\n", + " \"precision\": {\n", + " 'min': int,\n", + " 'max': int,\n", + " 'mean': float,\n", + " 'var': float,\n", + " 'std': float,\n", + " 'sample_size': int,\n", + " 'margin_of_error': float,\n", + " 'confidence_level': float\t\t\n", + " },\n", + " \"times\": dict(float),\n", + " \"format\": string\n", + " }\n", + " }\n", + "]\n", + "```\n", + "\n", + "In the example, the `compact` format of the report is used to shorten the full list of the results. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcb5447", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"\n", + "\n", + "# remove extra tf loggin\n", + "import tensorflow as tf\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7fc2df6", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Compact - A high level view, good for quick reviews\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec39d2", + "metadata": {}, + "source": [ + "It should be noted, in addition to reading the input data from multiple file types, DataProfiler allows the input data as a dataframe. To get more results related to detailed predictions at the entity level from the DataLabeler component or histogram results, the format `pretty` should be used. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29737f25", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "import pandas as pd\n", + "my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=[\"col_int\", \"col_float\"])\n", + "profile = dp.Profiler(my_dataframe)\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "241f6e3e", + "metadata": {}, + "source": [ + "## Profiler Type" + ] + }, + { + "cell_type": "markdown", + "id": "5b20879b", + "metadata": {}, + "source": [ + "The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify profile type as well. Here is an example of the the profiler explicitly calling the structured profile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc44eb47", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data, profiler_type='structured')\n", + "\n", + "# print the report using json to prettify.\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "fe02ad64", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "40804cc9", + "metadata": {}, + "source": [ + "The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the `ProfilerOptions` class.\n", + "\n", + "For example, if a user doesn't require histogram information they may desire to turn off the histogram functionality. 
Similarly, if a user is looking for more accurate labeling, they can increase the number of samples used to label.\n", + "\n", + "Below, let's remove the histogram and increase the number of samples sent to the labeler component (1,000 samples). \n", + "\n", + "A full list of options is available in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler/profile_options.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d25d899", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/diamonds.csv\"))\n", + "\n", + "profile_options = dp.ProfilerOptions()\n", + "\n", + "# Setting multiple options via set\n", + "profile_options.set({ \"histogram.is_enabled\": False, \"int.is_enabled\": False})\n", + "\n", + "# Set options by directly assigning them\n", + "profile_options.structured_options.data_labeler.max_sample_size = 1000\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "2052415a", + "metadata": {}, + "source": [ + "## Updating Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "7e02f746", + "metadata": {}, + "source": [ + "Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. For the update to work properly, the schema (columns / keys) must match." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab8022f", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Load and profile a CSV file\n", + "data = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-header-and-author.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Update the profile with new data:\n", + "new_data = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-skip-header.txt\"))\n", + "# new_data = dp.Data(os.path.join(data_path, \"iris-utf-16.csv\")) # will error due to schema mismatch\n", + "profile.update_profile(new_data)\n", + "\n", + "# Take a peek at the data\n", + "print(data.data)\n", + "print(new_data.data)\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "66ec6dc5", + "metadata": {}, + "source": [ + "## Merging Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "e2265fe9", + "metadata": {}, + "source": [ + "Merging profiles is an alternative method for updating profiles. 
Particularly, multiple profiles can be generated separately, then added together with a simple `+` command: `profile3 = profile1 + profile2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc68ca07", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Load a CSV file with a schema\n", + "data1 = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-header-and-author.txt\"))\n", + "profile1 = dp.Profiler(data1)\n", + "\n", + "# Load another CSV file with the same schema\n", + "data2 = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-skip-header.txt\"))\n", + "profile2 = dp.Profiler(data2)\n", + "\n", + "# Merge the profiles\n", + "profile3 = profile1 + profile2\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile3.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "7ea07dc6", + "metadata": {}, + "source": [ + "As you can see, the `update_profile` function and the `+` operator function similarly. The reason the `+` operator is important is that it's possible to *save and load profiles*, which we cover next." + ] + }, + { + "cell_type": "markdown", + "id": "375ff25c-b189-436a-b07d-5e7f13cc6e03", + "metadata": {}, + "source": [ + "## Differences in Data\n", + "This can be applied to both structured and unstructured datasets. \n", + "\n", + "Such reports can provide details on the differences between training and validation data, as in this pseudo example:\n", + "```python\n", + "profiler_training = dp.Profiler(training_data)\n", + "profiler_testing = dp.Profiler(testing_data)\n", + "\n", + "validation_report = profiler_training.diff(profiler_testing)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65360a03-e3ff-4f3c-9963-412298fdb284", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "# structured differences example\n", + "data_split_differences = profile1.diff(profile2)\n", + "pprint(data_split_differences)" + ] + }, + { + "cell_type": "markdown", + "id": "2ae471ff-852f-400a-9bee-5c9fef96f10a", + "metadata": {}, + "source": [ + "## Graphing a Profile\n", + "\n", + "We've also added the ability to generate visual reports from a profile.\n", + "\n", + "The following plots are currently available to work directly with your profilers:\n", + "\n", + " * missing values matrix\n", + " * histogram (numeric columns only)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "734b588d-ac9a-409c-8eb5-b1a0aede8c63", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "# get the data\n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "data = dp.Data(os.path.join(data_folder, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "\n", + "# profile the data\n", + "profile = dp.Profiler(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4e70204-fa30-43c2-9556-e84c19f82d32", + "metadata": {}, + "outputs": [], + "source": [ + "# generate a missing values matrix\n", + "fig = plt.figure(figsize=(8, 6), dpi=100)\n", + "fig = dp.graphs.plot_missing_values_matrix(profile, ax=fig.gca(), title=\"Missing Values Matrix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d734d355-e542-4245-a1e9-66521e333c2d", + "metadata": {}, + "outputs": [], + "source": [ + "# generate
histogram of all int/float columns\n", + "fig = dp.graphs.plot_histograms(profile)\n", + "fig.set_size_inches(8, 6)\n", + "fig.set_dpi(100)" + ] + }, + { + "cell_type": "markdown", + "id": "30868000", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "f2858072", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load then manipulate profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad9ca57", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_path, \"csv/names-col.txt\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "8f9859c2", + "metadata": {}, + "source": [ + "With the ability to save and load profiles, profiles can be generated via multiple machines then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3571f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a multiple files via the Data class\n", + "filenames = [\"csv/sparse-first-and-last-column-header-and-author.txt\",\n", + " \"csv/sparse-first-and-last-column-skip-header.txt\"]\n", + "data_objects = []\n", + "for filename in filenames:\n", + " data_objects.append(dp.Data(os.path.join(data_path, filename)))\n", + "\n", + "\n", + "# Generate and save profiles\n", + "for i in range(len(data_objects)):\n", + " profile = dp.Profiler(data_objects[i])\n", + " profile.save(filepath=\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Load profiles and add them together\n", + "profile = None\n", + "for i in range(len(data_objects)):\n", + " if profile is None:\n", + " profile = dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + " else:\n", + " profile += dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4690068a-8fc3-4bd5-8649-63d0f34fa91d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/regex_labeler_from_scratch.ipynb b/docs/0.12.0/doctrees/nbsphinx/regex_labeler_from_scratch.ipynb new file mode 100644 index 000000000..96aee213a --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/regex_labeler_from_scratch.ipynb @@ -0,0 +1,444 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "id": "e04c382a-7c49-452b-b9bf-e448951c64fe", + "metadata": {}, + "source": [ + "# Building a Regex Data Labeler w/ your own Regex" + ] + }, + { + "cell_type": "markdown", + "id": "6fb3ecb9-bc51-4c18-93d5-7991bbee5165", + "metadata": {}, + "source": [ + "This notebook teaches how to use the existing / create your own regex labeler as well as utilize it for structured data profiling.\n", + "\n", + "1. Loading and utilizing the pre-existing regex data labeler\n", + "1. Replacing the existing regex rules with your own.\n", + "1. Utilizng a regex data labeler inside of the structured profiler\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67c197b-d3ee-4896-a96f-cc3d043601d3", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "from pprint import pprint\n", + "\n", + "import pandas as pd\n", + "\n", + "try:\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " sys.path.insert(0, '../..')\n", + " import dataprofiler as dp" + ] + }, + { + "cell_type": "markdown", + "id": "c71356f4-9020-4862-a1e1-816effbb5443", + "metadata": {}, + "source": [ + "## Loading and using the pre-existing regex data labeler\n", + "We can easily import the exsting regex labeler via the `load_from_library` command from the `dp.DataLabeler`. This allows us to import models other than the default structured / unstructured labelers which exist in the library." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "113d6655-4bca-4d8e-9e6f-b972e29d5684", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler = dp.DataLabeler.load_from_library('regex_model')\n", + "data_labeler.model.help()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b405887-2b92-44ca-b8d7-29c384f6dd9c", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(data_labeler.label_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11916a48-098c-4056-ac6c-b9542d85fa86", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(data_labeler.model._parameters['regex_patterns'])" + ] + }, + { + "cell_type": "markdown", + "id": "da0e97ee-8d6d-4631-9b55-78ed904d5f41", + "metadata": {}, + "source": [ + "### Predicting with the regex labeler\n", + "In the prediction below, the default settings will `split` the predictions by default as it's aggregation function. In other words, if a string '123 Fake St.' The first character would receive a vote for integer and for address giving both a 50% probability. This is because these regex functions are defined individually and a post prediction aggregation function must be used to get the results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe519e65-36a7-4f42-8314-5369de8635c7", + "metadata": {}, + "outputs": [], + "source": [ + "# evaluate a prediction using the default parameters\n", + "data_labeler.predict(['123 Fake St.'])" + ] + }, + { + "cell_type": "markdown", + "id": "b41d834d-e47b-45a6-8970-d2d2033e2ade", + "metadata": {}, + "source": [ + "## Replacing the regex rules in the existing labeler\n", + "\n", + "We can achieve this by:\n", + "1. Setting the label mapping to the new labels\n", + "2. 
Setting the model parameters which include: `regex_patterns`, `default_label`, `ignore_case`, and `encapsulators`\n", + "\n", + "where `regex_patterns` is a `dict` of lists of regex patterns for each label, `default_label` is the expected default label for the regex, `ignore_case` tells the model to ignore case during its detection, and `encapsulators` are generic regex statements placed before (start) and after (end) each regex. Currently, this is used by the default model to capture labels that are within a cell rather than matching the entire cell. (e.g., ' 123 ' will still capture 123 as digits)." + ] + }, + { + "cell_type": "markdown", + "id": "c6bb010a-406f-4fd8-abd0-3355a5ad0ded", + "metadata": {}, + "source": [ + "Below, we create 4 labels where `other` is the `default_label`. Additionally, we enable case sensitivity so that upper and lower case letters are detected separately." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f86584cf-a7af-4bae-bf44-d87caa68833a", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.set_labels({'other': 0, 'digits':1, 'lowercase_char': 2, 'uppercase_chars': 3})\n", + "data_labeler.model.set_params(\n", + " regex_patterns={\n", + " 'digits': [r'[+-]?[0-9]+'],\n", + " 'lowercase_char': [r'[a-z]+'],\n", + " 'uppercase_chars': [r'[A-Z]+'],\n", + " },\n", + " default_label='other',\n", + " ignore_case=False,\n", + ")\n", + "data_labeler.label_mapping" + ] + }, + { + "cell_type": "markdown", + "id": "1ece1c8c-18a5-46fc-b563-6458e6e71e53", + "metadata": {}, + "source": [ + "### Predicting with the new regex labels\n", + "\n", + "Here we notice the output of the predictions gives us a prediction per character for each regex. Note how by default it is matching subtext due to the encapsulators. Here, `123` was found to be digits, `FAKE` was found to be upper case, and the whitespaces and `St.` were labeled other due to no single regex being correct." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92842e14-2ea6-4879-b58c-c52b607dc94c", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.predict(['123 FAKE St.'])" + ] + }, + { + "cell_type": "markdown", + "id": "2ce14e54-094f-41ff-9ce0-69acace6abc2", + "metadata": {}, + "source": [ + "Below we turn off case sensitivity and see how the aggregation function splits the votes for characters between the `lowercase` and `uppercase` chars." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7b8ed9d-c912-4dc7-82c5-ba78a3affc1e", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.model.set_params(ignore_case=True)\n", + "data_labeler.predict(['123 FAKE St.'])" + ] + }, + { + "cell_type": "markdown", + "id": "dc66515f-24e4-40f0-8592-b1ee4fba7077", + "metadata": {}, + "source": [ + "For the rest of this notebook, we will just use a single regex search which will capture both upper and lower case chars."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e0c1b11-d111-4080-873f-40aff7cf7930", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.set_labels({'other': 0, 'digits':1, 'chars': 2})\n", + "data_labeler.model.set_params(\n", + " regex_patterns={\n", + " 'digits': [r'[=-]?[0-9]+'],\n", + " 'chars': [r'[a-zA-Z]+'],\n", + " },\n", + " default_label='other',\n", + " ignore_case=False,\n", + ")\n", + "data_labeler.label_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28e7b2ee-c661-4b31-b727-078f1393b5c4", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.predict(['123 FAKE St.'])" + ] + }, + { + "cell_type": "markdown", + "id": "f60c8fd1-76e1-469f-9e5a-62d7529301b3", + "metadata": {}, + "source": [ + "### Adjusting postprocessor properties\n", + "\n", + "Below we can look at the possible postprocessor parameters to adjust the aggregation function to the desired output. The previous outputs by default used the `split` aggregation function, however, below we will show the `random` aggregation function which will randomly choose a label if multiple labels have a vote for a given character." + ] + }, + { + "cell_type": "markdown", + "id": "36afa82b-1ca5-49ad-9aa9-84c6de621f59", + "metadata": {}, + "source": [ + "data_labeler.postprocessor.help()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66840940-47bf-433a-8ee8-977f26926e0b", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.postprocessor.set_params(aggregation_func='random')\n", + "data_labeler.predict(['123 FAKE St.'], predict_options=dict(show_confidences=True))" + ] + }, + { + "cell_type": "markdown", + "id": "c32b74fc-5051-4d53-b02a-4d1e4a35958f", + "metadata": {}, + "source": [ + "## Integrating the new Regex labeler into Structured Profiling\n", + "\n", + "While the labeler can be used alone, it is also possible to integrate the labeler into the StructuredProfiler with a slight change to its postprocessor. The StructuredProfiler requires a labeler which outputs othe confidence of each label for a given cell being processed. To convert the output of the `RegexPostProcessor` into said format, we will use the `StructRegexPostProcessor`. We can create the postprocessor and set the `data_labeler`'s postprocessor to this value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2663f2d-29a2-41ed-88dd-8a213d303365", + "metadata": {}, + "outputs": [], + "source": [ + "from dataprofiler.labelers.data_processing import StructRegexPostProcessor\n", + "\n", + "postprocesor = StructRegexPostProcessor()\n", + "data_labeler.set_postprocessor(postprocesor)" + ] + }, + { + "cell_type": "markdown", + "id": "f7352769-d636-42c6-9706-7d9cff520a72", + "metadata": {}, + "source": [ + "Below we will see the output is now one vote per sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18814634-0fd0-4ce8-b0c3-9b9454701a43", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.predict(['123 FAKE St.', '123', 'FAKE'], predict_options=dict(show_confidences=True))" + ] + }, + { + "cell_type": "markdown", + "id": "b4aa4e36-7362-4966-b827-3f5a6f2dfa7c", + "metadata": {}, + "source": [ + "### Setting the Structuredprofiler's DataLabeler\n", + "\n", + "We can create a `ProfilerOption` and set the structured options to have the new data_labeler as its value. We then run the StructuredProfiler with the specified options." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f18cf7f-283e-4e54-b3f9-1312828c3029", + "metadata": {}, + "outputs": [], + "source": [ + "# create and set the option for the regex data labeler to be used at profile time\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': data_labeler})\n", + "\n", + "# profile the dataset using the suggested regex data labeler\n", + "data = pd.DataFrame(\n", + " [['123 FAKE St.', 123, 'this'], \n", + " [123 , -9, 'IS'], \n", + " ['...' , +80, 'A'], \n", + " ['123' , 202, 'raNDom'], \n", + " ['test' , -1, 'TEST']], \n", + " dtype=object)\n", + "profiler = dp.Profiler(data, options=profile_options)" + ] + }, + { + "cell_type": "markdown", + "id": "663e49f7-358b-4b0f-99a4-1823908ef990", + "metadata": {}, + "source": [ + "Below we see the first column is given 3 labels as it received multiple votes for said column. However, it was confident on the second and third column which is why it only specified `digits` and `chars` respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f796d7f5-7e8a-447b-9cbb-d5b8180660a3", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(profiler.report(\n", + " dict(output_format='compact', \n", + " omit_keys=['data_stats.*.statistics', \n", + " 'data_stats.*.categorical', \n", + " 'data_stats.*.order', \n", + " 'global_stats'])))" + ] + }, + { + "cell_type": "markdown", + "id": "261b903f-8f4c-403f-839b-ab8813f850e9", + "metadata": {}, + "source": [ + "## Saving the Data Labeler for future use" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6ffbaf2-9400-486a-ba83-5fc9ba9334d7", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir('my_new_regex_labeler'):\n", + " os.mkdir('my_new_regex_labeler')\n", + "data_labeler.save_to_disk('my_new_regex_labeler')" + ] + }, + { + "cell_type": "markdown", + "id": "09e40cb6-9d89-41c4-ae28-3dca498f8c68", + "metadata": {}, + "source": [ + "## Loading the saved Data Labeler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52615b25-70a6-4ebb-8a32-14aaf1e747d9", + "metadata": {}, + "outputs": [], + "source": [ + "saved_labeler = dp.DataLabeler.load_from_disk('my_new_regex_labeler')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1ccc0b3-1dc2-4847-95c2-d6b8769b1590", + "metadata": {}, + "outputs": [], + "source": [ + "# ensuring the parametesr are what we saved.\n", + "print(\"label_mapping:\")\n", + "pprint(saved_labeler.label_mapping)\n", + "print(\"\\nmodel parameters:\")\n", + "pprint(saved_labeler.model._parameters)\n", + "print()\n", + "print(\"postprocessor: \" + saved_labeler.postprocessor.__class__.__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c827f2ae-4af6-4f3f-9651-9ee9ebea9fa0", + "metadata": {}, + "outputs": [], + "source": [ + "# predicting with the loaded labeler.\n", + "saved_labeler.predict(['test', '123'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "606f9bbf-5955-4b7b-b0d1-390de5600f73", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/nbsphinx/unstructured_profiler_example.ipynb b/docs/0.12.0/doctrees/nbsphinx/unstructured_profiler_example.ipynb new file mode 100644 index 000000000..9ab754cc7 --- /dev/null +++ b/docs/0.12.0/doctrees/nbsphinx/unstructured_profiler_example.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37ca393", + "metadata": {}, + "source": [ + "# Unstructured Profilers" + ] + }, + { + "cell_type": "markdown", + "id": "ff9bd095", + "metadata": {}, + "source": [ + "**Data profiling** - *is the process of examining a dataset and collecting statistical or informational summaries about said dataset.*\n", + "\n", + "The Profiler class inside the DataProfiler is designed to generate *data profiles* via the Profiler class, which ingests either a Data class or a Pandas DataFrame. \n", + "\n", + "Currently, the Data class supports loading the following file formats:\n", + "\n", + "* Any delimited (CSV, TSV, etc.)\n", + "* JSON object\n", + "* Avro\n", + "* Parquet\n", + "* Text files\n", + "* Pandas Series/Dataframe\n", + "\n", + "Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value (JSON) store as well as dataset wide information, such as the number of nulls, duplicates, etc.\n", + "\n", + "This example will look at specifically the unstructured data types for unstructured profiling. This means that only text files, lists of strings, single column pandas dataframes/series, or DataProfile Data objects in string format will work with the unstructured profiler. " + ] + }, + { + "cell_type": "markdown", + "id": "de58b9c4", + "metadata": {}, + "source": [ + "## Reporting" + ] + }, + { + "cell_type": "markdown", + "id": "8001185a", + "metadata": {}, + "source": [ + "One of the primary purposes of the Profiler are to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.\n", + "\n", + "In terms of reporting, there are multiple reporting options:\n", + "\n", + "* **Pretty**: Floats are rounded to four decimal places, and lists are shortened.\n", + "* **Compact**: Similar to pretty, but removes detailed statistics\n", + "* **Serializable**: Output is json serializable and not prettified\n", + "* **Flat**: Nested Output is returned as a flattened dictionary\n", + "\n", + "The **Pretty** and **Compact** reports are the two most commonly used reports and includes `global_stats` and `data_stats` for the given dataset. `global_stats` contains overall properties of the data such as samples used and file encoding. 
`data_stats` contains specific properties and statistics for each text sample.\n", + "\n", + "For unstructured profiles, the report looks like this:\n", + "\n", + "```\n", + "\"global_stats\": {\n", + " \"samples_used\": int,\n", + " \"empty_line_count\": int,\n", + " \"file_type\": string,\n", + " \"encoding\": string\n", + "},\n", + "\"data_stats\": {\n", + " \"data_label\": {\n", + " \"entity_counts\": {\n", + " \"word_level\": dict(int),\n", + " \"true_char_level\": dict(int),\n", + " \"postprocess_char_level\": dict(int)\n", + " },\n", + " \"times\": dict(float)\n", + " },\n", + " \"statistics\": {\n", + " \"vocab\": list(char),\n", + " \"words\": list(string),\n", + " \"word_count\": dict(int),\n", + " \"times\": dict(float)\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcb5447", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"\n", + "\n", + "# remove extra tf loggin\n", + "import tensorflow as tf\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7fc2df6", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"txt/discussion_reddit.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "4d183992", + "metadata": {}, + "source": [ + "## Profiler Type" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec39d2", + "metadata": {}, + "source": [ + "It should be noted, in addition to reading the input data from text files, DataProfiler allows the input data as a pandas dataframe, a pandas series, a list, and Data objects (when an unstructured format is selected) if the Profiler is explicitly chosen as unstructured." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29737f25", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "import pandas as pd\n", + "data = dp.Data(os.path.join(data_path, \"csv/SchoolDataSmall.csv\"), options={\"data_format\": \"records\"})\n", + "profile = dp.Profiler(data, profiler_type='unstructured')\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "fe02ad64", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "40804cc9", + "metadata": {}, + "source": [ + "The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the `ProfilerOptions` class.\n", + "\n", + "For example, if a user doesn't require vocab count information they may desire to turn off the word count functionality.\n", + "\n", + "Below, let's remove the vocab count and set the stop words. \n", + "\n", + "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler/profile_options.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d25d899", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"txt/discussion_reddit.txt\"))\n", + "\n", + "profile_options = dp.ProfilerOptions()\n", + "\n", + "# Setting multiple options via set\n", + "profile_options.set({ \"*.vocab.is_enabled\": False, \"*.is_case_sensitive\": True })\n", + "\n", + "# Set options via directly setting them\n", + "profile_options.unstructured_options.text.stop_words = [\"These\", \"are\", \"stop\", \"words\"]\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "2052415a", + "metadata": {}, + "source": [ + "## Updating Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "7e02f746", + "metadata": {}, + "source": [ + "Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. To update appropriately, the schema (columns / keys) must match appropriately." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab8022f", + "metadata": {}, + "outputs": [], + "source": [ + "# Load and profile a CSV file\n", + "data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Update the profile with new data:\n", + "new_data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile.update_profile(new_data)\n", + "\n", + "# Take a peek at the data\n", + "print(data.data)\n", + "print(new_data.data)\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "66ec6dc5", + "metadata": {}, + "source": [ + "## Merging Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "e2265fe9", + "metadata": {}, + "source": [ + "Merging profiles are an alternative method for updating profiles. Particularly, multiple profiles can be generated seperately, then added together with a simple `+` command: `profile3 = profile1 + profile2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc68ca07", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a CSV file with a schema\n", + "data1 = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile1 = dp.Profiler(data1)\n", + "\n", + "# Load another CSV file with the same schema\n", + "data2 = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile2 = dp.Profiler(data2)\n", + "\n", + "# Merge the profiles\n", + "profile3 = profile1 + profile2\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile3.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "7ea07dc6", + "metadata": {}, + "source": [ + "As you can see, the `update_profile` function and the `+` operator function similarly. The reason the `+` operator is important is that it's possible to *save and load profiles*, which we cover next." + ] + }, + { + "cell_type": "markdown", + "id": "4704961a", + "metadata": {}, + "source": [ + "## Differences in Data\n", + "Can be applied to both structured and unstructured datasets. 
\n", + "\n", + "Such reports can provide details on the differences between training and validation data like in this pseudo example:\n", + "```python\n", + "profiler_training = dp.Profiler(training_data)\n", + "profiler_testing = dp.Profiler(testing_data)\n", + "\n", + "validation_report = profiler_training.diff(profiler_testing)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58f92c1b", + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "# unstructured differences example\n", + "data_split_differences = profile1.diff(profile2)\n", + "pprint(data_split_differences)" + ] + }, + { + "cell_type": "markdown", + "id": "30868000", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "f2858072", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load then manipulate profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad9ca57", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "8f9859c2", + "metadata": {}, + "source": [ + "With the ability to save and load profiles, profiles can be generated via multiple machines then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3571f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a multiple files via the Data class\n", + "filenames = [\"txt/sentence-3x.txt\",\n", + " \"txt/sentence.txt\"]\n", + "data_objects = []\n", + "for filename in filenames:\n", + " data_objects.append(dp.Data(os.path.join(data_path, filename)))\n", + "\n", + "print(data_objects)\n", + "# Generate and save profiles\n", + "for i in range(len(data_objects)):\n", + " profile = dp.Profiler(data_objects[i])\n", + " report = profile.report(report_options={\"output_format\":\"compact\"})\n", + " print(json.dumps(report, indent=4))\n", + " profile.save(filepath=\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Load profiles and add them together\n", + "profile = None\n", + "for i in range(len(data_objects)):\n", + " if profile is None:\n", + " profile = dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + " else:\n", + " profile += dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/doctrees/overview.doctree b/docs/0.12.0/doctrees/overview.doctree new file mode 100644 index 000000000..030cab2da Binary files /dev/null and b/docs/0.12.0/doctrees/overview.doctree differ diff --git a/docs/0.12.0/doctrees/popmon_dp_loader_example.doctree b/docs/0.12.0/doctrees/popmon_dp_loader_example.doctree new file mode 100644 index 000000000..ef090f4b0 Binary files /dev/null and b/docs/0.12.0/doctrees/popmon_dp_loader_example.doctree differ diff --git a/docs/0.12.0/doctrees/profiler.doctree b/docs/0.12.0/doctrees/profiler.doctree new file mode 100644 index 000000000..7e4f52008 Binary files /dev/null and b/docs/0.12.0/doctrees/profiler.doctree differ diff --git a/docs/0.12.0/doctrees/profiler_example.doctree b/docs/0.12.0/doctrees/profiler_example.doctree new file mode 100644 index 000000000..4b753990f Binary files /dev/null and b/docs/0.12.0/doctrees/profiler_example.doctree differ diff --git a/docs/0.12.0/doctrees/regex_labeler_from_scratch.doctree b/docs/0.12.0/doctrees/regex_labeler_from_scratch.doctree new file mode 100644 index 000000000..f50c8f6dd Binary files /dev/null and b/docs/0.12.0/doctrees/regex_labeler_from_scratch.doctree differ diff --git a/docs/0.12.0/doctrees/roadmap.doctree b/docs/0.12.0/doctrees/roadmap.doctree new file mode 100644 index 000000000..b2797a148 Binary files /dev/null and b/docs/0.12.0/doctrees/roadmap.doctree differ diff --git a/docs/0.12.0/doctrees/unstructured_profiler_example.doctree b/docs/0.12.0/doctrees/unstructured_profiler_example.doctree new file mode 100644 index 000000000..494a768d6 Binary files /dev/null and b/docs/0.12.0/doctrees/unstructured_profiler_example.doctree differ diff --git a/docs/0.12.0/html/.buildinfo b/docs/0.12.0/html/.buildinfo new file mode 100644 index 000000000..47e47ccf8 --- /dev/null +++ b/docs/0.12.0/html/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the 
configuration used when building these files. When it is not found, a full rebuild will be done. +config: e4666d8c8256d407b13bfa1e2cf2ab0e +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/0.12.0/html/API.html b/docs/0.12.0/html/API.html new file mode 100644 index 000000000..4ba99dfef --- /dev/null +++ b/docs/0.12.0/html/API.html @@ -0,0 +1,295 @@ + + + + + + + + + API - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/_images/DL-Flowchart.png b/docs/0.12.0/html/_images/DL-Flowchart.png new file mode 100644 index 000000000..696eeb5dc Binary files /dev/null and b/docs/0.12.0/html/_images/DL-Flowchart.png differ diff --git a/docs/0.12.0/html/_images/histogram_example_0.png b/docs/0.12.0/html/_images/histogram_example_0.png new file mode 100644 index 000000000..9b8301363 Binary files /dev/null and b/docs/0.12.0/html/_images/histogram_example_0.png differ diff --git a/docs/0.12.0/html/_images/histogram_example_1.png b/docs/0.12.0/html/_images/histogram_example_1.png new file mode 100644 index 000000000..062dfdbb9 Binary files /dev/null and b/docs/0.12.0/html/_images/histogram_example_1.png differ diff --git a/docs/0.12.0/html/_images/histogram_example_2.png b/docs/0.12.0/html/_images/histogram_example_2.png new file mode 100644 index 000000000..1aedf7549 Binary files /dev/null and b/docs/0.12.0/html/_images/histogram_example_2.png differ diff --git a/docs/0.12.0/html/_images/missing_value_barchart_example_0.png b/docs/0.12.0/html/_images/missing_value_barchart_example_0.png new file mode 100644 index 000000000..33cb7afd2 Binary files /dev/null and b/docs/0.12.0/html/_images/missing_value_barchart_example_0.png differ diff --git a/docs/0.12.0/html/_images/missing_value_matrix_example_0.png b/docs/0.12.0/html/_images/missing_value_matrix_example_0.png new file mode 100644 index 000000000..21799cddf Binary files /dev/null and b/docs/0.12.0/html/_images/missing_value_matrix_example_0.png differ diff --git a/docs/0.12.0/html/_sources/API.rst.txt b/docs/0.12.0/html/_sources/API.rst.txt new file mode 100644 index 000000000..fdbf2242b --- /dev/null +++ b/docs/0.12.0/html/_sources/API.rst.txt @@ -0,0 +1,16 @@ +.. _API: + +API +*** + +The API is split into 4 main components: Profilers, Labelers, Data Readers, and +Validators. + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + dataprofiler.data_readers + dataprofiler.profilers + dataprofiler.labelers + dataprofiler.validators \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/add_new_model_to_data_labeler.nblink.txt b/docs/0.12.0/html/_sources/add_new_model_to_data_labeler.nblink.txt new file mode 100644 index 000000000..130e413fc --- /dev/null +++ b/docs/0.12.0/html/_sources/add_new_model_to_data_labeler.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/add_new_model_to_data_labeler.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/column_name_labeler_example.nblink.txt b/docs/0.12.0/html/_sources/column_name_labeler_example.nblink.txt new file mode 100644 index 000000000..6f95cf7cb --- /dev/null +++ b/docs/0.12.0/html/_sources/column_name_labeler_example.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/column_name_labeler.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/data_labeling.rst.txt b/docs/0.12.0/html/_sources/data_labeling.rst.txt new file mode 100644 index 000000000..db76fe791 --- /dev/null +++ b/docs/0.12.0/html/_sources/data_labeling.rst.txt @@ -0,0 +1,365 @@ +.. _data_labeling: + +Labeler (Sensitive Data) +************************ + +In this library, the term *data labeling* refers to entity recognition. + +Builtin to the data profiler is a classifier which evaluates the complex data types of the dataset. +For structured data, it determines the complex data type of each column. 
When +running the data profiler, it uses the default data labeling model built into the +library. However, the data labeler also allows users to train their own data labeler. + +*Data Labels* are determined per cell for structured data (column/row when +the *profiler* is used) or at the character level for unstructured data. This +is the list of default labels: + +* UNKNOWN +* ADDRESS +* BAN (bank account number, 10-18 digits) +* CREDIT_CARD +* EMAIL_ADDRESS +* UUID +* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.) +* IPV4 +* IPV6 +* MAC_ADDRESS +* PERSON +* PHONE_NUMBER +* SSN +* URL +* US_STATE +* DRIVERS_LICENSE +* DATE +* TIME +* DATETIME +* INTEGER +* FLOAT +* QUANTITY +* ORDINAL + + +Identify Entities in Structured Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Make predictions and identify labels: + +.. code-block:: python + + import dataprofiler as dp + + # load data and data labeler + data = dp.Data("your_data.csv") + data_labeler = dp.DataLabeler(labeler_type='structured') + + # make predictions and get labels per cell + predictions = data_labeler.predict(data) + +Identify Entities in Unstructured Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Predict which class characters belong to in unstructured text: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler(labeler_type='unstructured') + + # Example sample string, must be in an array (multiple arrays can be passed) + sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234." + "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"] + + # Predict which class each character belongs to + model_predictions = data_labeler.predict( + sample, predict_options=dict(show_confidences=True)) + + # Predictions / confidences are at the character level + final_results = model_predictions["pred"] + final_confidences = model_predictions["conf"] + +It's also possible to change the output format, for example to one similar to the **SpaCy** format: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True) + + # Example sample string, must be in an array (multiple arrays can be passed) + sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234." + "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"] + + # Set the output to the NER format (start position, end position, label) + data_labeler.set_params( + { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } } + ) + + results = data_labeler.predict(sample) + + print(results) + +Train a New Data Labeler +~~~~~~~~~~~~~~~~~~~~~~~~ + +Mechanism for training your own data labeler on your own set of structured +(tabular) data: + +.. code-block:: python + + import dataprofiler as dp + + # Will need one column with a default label of UNKNOWN + data = dp.Data("your_file.csv") + + data_labeler = dp.train_structured_labeler( + data=data, + save_dirpath="/path/to/save/labeler", + epochs=2 + ) + + data_labeler.save_to_disk("my/save/path") # Saves the data labeler for reuse + +Load an Existing Data Labeler +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Mechanism for loading an existing data_labeler: + +..
code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler( + labeler_type='structured', dirpath="/path/to/my/labeler") + + # get information about the parameters/inputs/output formats for the DataLabeler + data_labeler.help() + +Extending a Data Labeler with Transfer Learning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Extending or changing labels of a data labeler w/ transfer learning: +Note: By default, **a labeler loaded will not be trainable**. In order to load a +trainable DataLabeler, the user must set `trainable=True` or load a labeler +using the `TrainableDataLabeler` class. + +The following illustrates how to change the labels: + +.. code-block:: python + + import dataprofiler as dp + + labels = ['label1', 'label2', ...] # new label set can also be an encoding dict + data = dp.Data("your_file.csv") # contains data with new labels + + # load default structured Data Labeler w/ trainable set to True + data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True) + + # this will use transfer learning to retrain the data labeler on your new + # dataset and labels. + # NOTE: data must be in an acceptable format for the preprocessor to interpret. + # please refer to the preprocessor/model for the expected data format. + # Currently, the DataLabeler cannot take in Tabular data, but requires + # data to be ingested with two columns [X, y] where X is the samples and + # y is the labels. + model_results = data_labeler.fit(x=data['samples'], y=data['labels'], + validation_split=0.2, epochs=2, labels=labels) + + # final_results, final_confidences are a list of results for each epoch + epoch_id = 0 + final_results = model_results[epoch_id]["pred"] + final_confidences = model_results[epoch_id]["conf"] + +The following illustrates how to extend the labels: + +.. code-block:: python + + import dataprofiler as dp + + new_labels = ['label1', 'label2', ...] + data = dp.Data("your_file.csv") # contains data with new labels + + # load default structured Data Labeler w/ trainable set to True + data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True) + + # this will maintain current labels and model weights, but extend the model's + # labels + for label in new_labels: + data_labeler.add_label(label) + + # NOTE: a user can also add a label which maps to the same index as an existing + # label + # data_labeler.add_label(label, same_as='') + + # For a trainable model, the user must then train the model to be able to + # continue using the labeler since the model's graph has likely changed + # NOTE: data must be in an acceptable format for the preprocessor to interpret. + # please refer to the preprocessor/model for the expected data format. + # Currently, the DataLabeler cannot take in Tabular data, but requires + # data to be ingested with two columns [X, y] where X is the samples and + # y is the labels. + model_results = data_labeler.fit(x=data['samples'], y=data['labels'], + validation_split=0.2, epochs=2) + + # final_results, final_confidences are a list of results for each epoch + epoch_id = 0 + final_results = model_results[epoch_id]["pred"] + final_confidences = model_results[epoch_id]["conf"] + + +Changing pipeline parameters: + +.. code-block:: python + + import dataprofiler as dp + + # load default Data Labeler + data_labeler = dp.DataLabeler(labeler_type='structured') + + # change parameters of specific component + data_labeler.preprocessor.set_params({'param1': 'value1'}) + + # change multiple simultaneously. 
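+    # set_params takes a dict keyed by pipeline component ('preprocessor', 'model', 'postprocessor')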
+ data_labeler.set_params({ + 'preprocessor': {'param1': 'value1'}, + 'model': {'param2': 'value2'}, + 'postprocessor': {'param3': 'value3'} + }) + + +Build Your Own Data Labeler +=========================== + +The DataLabeler has 3 main components: preprocessor, model, and postprocessor. +To create your own DataLabeler, each one would have to be created or an +existing component can be reused. + +Given a set of the 3 components, you can construct your own DataLabeler: + +.. code-block:: python + from dataprofiler.labelers.base_data_labeler import BaseDataLabeler, \ + TrainableDataLabeler + from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel + from dataprofiler.labelers.data_processing import \ + StructCharPreprocessor, StructCharPostprocessor + + # load a non-trainable data labeler + model = CharacterLevelCnnModel(...) + preprocessor = StructCharPreprocessor(...) + postprocessor = StructCharPostprocessor(...) + + data_labeler = BaseDataLabeler.load_with_components( + preprocessor=preprocessor, model=model, postprocessor=postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + + + # load trainable data labeler + data_labeler = TrainableDataLabeler.load_with_components( + preprocessor=preprocessor, model=model, postprocessor=postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + +Option for swapping out specific components of an existing labeler. + +.. code-block:: python + + import dataprofiler as dp + from dataprofiler.labelers.character_level_cnn_model import \ + CharacterLevelCnnModel + from dataprofiler.labelers.data_processing import \ + StructCharPreprocessor, StructCharPostprocessor + + model = CharacterLevelCnnModel(...) + preprocessor = StructCharPreprocessor(...) + postprocessor = StructCharPostprocessor(...) + + data_labeler = dp.DataLabeler(labeler_type='structured') + data_labeler.set_preprocessor(preprocessor) + data_labeler.set_model(model) + data_labeler.set_postprocessor(postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + + +Model Component +~~~~~~~~~~~~~~~ + +In order to create your own model component for data labeling, you can utilize +the `BaseModel` class from `dataprofiler.labelers.base_model` and +overriding the abstract class methods. + +Reviewing `CharacterLevelCnnModel` from +`dataprofiler.labelers.character_level_cnn_model` illustrates the functions +which need an override. + +#. `__init__`: specifying default parameters and calling base `__init__` +#. `_validate_parameters`: validating parameters given by user during setting +#. `_need_to_reconstruct_model`: flag for when to reconstruct a model (i.e. + parameters change or labels change require a model reconstruction) +#. `_construct_model`: initial construction of the model given the parameters +#. `_reconstruct_model`: updates model architecture for new label set while + maintaining current model weights +#. `fit`: mechanism for the model to learn given training data +#. `predict`: mechanism for model to make predictions on data +#. `details`: prints a summary of the model construction +#. `save_to_disk`: saves model and model parameters to disk +#. 
`load_from_disk`: loads model given a path on disk + + +Preprocessor Component +~~~~~~~~~~~~~~~~~~~~~~ + +In order to create your own preprocessor component for data labeling, you can +utilize the `BaseDataPreprocessor` class +from `dataprofiler.labelers.data_processing` and override the abstract class +methods. + +Reviewing `StructCharPreprocessor` from +`dataprofiler.labelers.data_processing` illustrates the functions which +need an override. + +#. `__init__`: passing parameters to the base class and executing any + extraneous calculations to be saved as parameters +#. `_validate_parameters`: validating parameters given by user during + setting +#. `process`: takes in the user data and converts it into an digestible, + iterable format for the model +#. `set_params` (optional): if a parameter requires processing before setting, + a user can override this function to assist with setting the parameter +#. `_save_processor` (optional): if a parameter is not JSON serializable, a + user can override this function to assist in saving the processor and its + parameters +#. `load_from_disk` (optional): if a parameter(s) is not JSON serializable, a + user can override this function to assist in loading the processor + +Postprocessor Component +~~~~~~~~~~~~~~~~~~~~~~~ + +The postprocessor is nearly identical to the preprocessor except it handles +the output of the model for processing. In order to create your own +postprocessor component for data labeling, you can utilize the +`BaseDataPostprocessor` class from `dataprofiler.labelers.data_processing` +and override the abstract class methods. + +Reviewing `StructCharPostprocessor` from +`dataprofiler.labelers.data_processing` illustrates the functions which +need an override. + +#. `__init__`: passing parameters to the base class and executing any + extraneous calculations to be saved as parameters +#. `_validate_parameters`: validating parameters given by user during + setting +#. `process`: takes in the output of the model and processes for output to + the user +#. `set_params` (optional): if a parameter requires processing before setting, + a user can override this function to assist with setting the parameter +#. `_save_processor` (optional): if a parameter is not JSON serializable, a + user can override this function to assist in saving the processor and its + parameters +#. `load_from_disk` (optional): if a parameter(s) is not JSON serializable, a + user can override this function to assist in loading the processor diff --git a/docs/0.12.0/html/_sources/data_reader.nblink.txt b/docs/0.12.0/html/_sources/data_reader.nblink.txt new file mode 100644 index 000000000..4722970da --- /dev/null +++ b/docs/0.12.0/html/_sources/data_reader.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/data_readers.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/data_readers.rst.txt b/docs/0.12.0/html/_sources/data_readers.rst.txt new file mode 100644 index 000000000..877ea56dd --- /dev/null +++ b/docs/0.12.0/html/_sources/data_readers.rst.txt @@ -0,0 +1,184 @@ +.. _data_readers: + +Data Readers +************ + +The `Data` class itself will identify then output one of the following `Data` class types. +Using the data reader is easy, just pass it through the Data object. + +.. 
code-block:: python + + import dataprofiler as dp + data = dp.Data("your_file.csv") + +The supported file types are: + +* CSV file (or any delimited file) +* JSON object +* Avro file +* Parquet file +* Graph data file +* Text file +* Pandas DataFrame +* A URL that points to one of the supported file types above + +It's also possible to specifically call one of the data classes such as the following command: + +.. code-block:: python + + from dataprofiler.data_readers.csv_data import CSVData + data = CSVData("your_file.csv", options={"delimiter": ","}) + +Additionally any of the data classes can be loaded using a URL: + +.. code-block:: python + + import dataprofiler as dp + data = dp.Data("https://you_website.com/your_file.file", options={"verify_ssl": "True"}) + +Below are descriptions of the various `Data` classes and the available options. + +CSVData +======= + +Data class for loading datasets of type CSV. Can be specified by passing +in memory data or via a file path. Options pertaining the CSV may also +be specified using the options dict parameter. + +`CSVData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* delimiter - Must be a string, for example `"delimiter": ","` +* data_format - Must be a string, possible choices: "dataframe", "records" +* selected_columns - Columns being selected from the entire dataset, must be a + list `["column 1", "ssn"]` +* sample_nrows - Reservoir sampling to sample `"n"` rows out of a total of `"M"` rows. + Specified for how many rows to sample, default None. +* header - Define the header, for example + + * `"header": 'auto'` for auto detection + * `"header": None` for no header + * `"header": ` to specify the header row (0 based index) + +JSONData +======== + +Data class for loading datasets of type JSON. Can be specified by +passing in memory data or via a file path. Options pertaining the JSON +may also be specified using the options dict parameter. JSON data can be +accessed via the "data" property, the "metadata" property, and the +"data_and_metadata" property. + +`JSONData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "json", "flattened_dataframe" + + * "flattened_dataframe" is best used for JSON structure typically found in data streams that contain + nested lists of dictionaries and a payload. For example: `{"data": [ columns ], "response": 200}` +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` +* payload_keys - The dictionary keys for the payload of the JSON, typically called "data" + or "payload". Defaults to ["data", "payload", "response"]. + + +AVROData +======== + +Data class for loading datasets of type AVRO. Can be specified by +passing in memory data or via a file path. Options pertaining the AVRO +may also be specified using the options dict parameter. + +`AVROData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "avro", "json", "flattened_dataframe" + + * "flattened_dataframe" is best used for AVROs with a JSON structure typically found in data streams that contain + nested lists of dictionaries and a payload. For example: `{"data": [ columns ], "response": 200}` +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` + +ParquetData +=========== + +Data class for loading datasets of type PARQUET. 
Can be specified by +passing in memory data or via a file path. Options pertaining the +PARQUET may also be specified using the options dict parameter. + +`ParquetData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "json" +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` +* sample_nrows - Random sampling to sample `"n"` rows out of a total of `"M"` rows. + Specified for how many rows to sample, default None. + +GraphData +========= + +Data Class for loading datasets of graph data. Currently takes CSV format, +further type formats will be supported. Can be specified by passing +in memory data (NetworkX Graph) or via a file path. Options pertaining the CSV file may also +be specified using the options dict parameter. Loads data from CSV into memory +as a NetworkX Graph. + +`GraphData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* delimiter - must be a string, for example `"delimiter": ","` +* data_format - must be a string, possible choices: "graph", "dataframe", "records" +* header - Define the header, for example + + * `"header": 'auto'` for auto detection + * `"header": None` for no header + * `"header": ` to specify the header row (0 based index) + +TextData +======== + +Data class for loading datasets of type TEXT. Can be specified by +passing in memory data or via a file path. Options pertaining the TEXT +may also be specified using the options dict parameter. + +`TextData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format: user selected format in which to return data. Currently only supports "text". +* samples_per_line - chunks by which to read in the specified dataset + + +Data Using a URL +================ + +Data class for loading datasets of any type using a URL. Specified by passing in +any valid URL that points to one of the valid data types. Options pertaining the +URL may also be specified using the options dict parameter. + +`Data(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* verify_ssl: must be a boolean string, choices: "True", "False". Set to "True" by default. + +Data Using an AWS S3 URI +======================== + +Data class for loading datasets from AWS S3 URI. Specified by passing in +any valid bucket path that points to one of the valid data types. + +`Data('s3a://my-bucket/file_name.txt')` + +Possible `options`: + +* `storage_options`: must be a dictionary where the keys for boto3 initialization are set + If `storage_options` is provided in `options`, the below variables are retrieved from the dictionary provided. Otherwise, will retrieve from `environment variables `_. + + * `AWS_ACCESS_KEY_ID` + * `AWS_SECRET_ACCESS_KEY` + * `AWS_SESSION_TOKEN` + * `AWS_REGION` (default `us-east-1`) diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.avro_data.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.avro_data.rst.txt new file mode 100644 index 000000000..d3227df29 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.avro_data.rst.txt @@ -0,0 +1,7 @@ +Avro Data +========= + +.. 
automodule:: dataprofiler.data_readers.avro_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.base_data.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.base_data.rst.txt new file mode 100644 index 000000000..b82883cb9 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.base_data.rst.txt @@ -0,0 +1,7 @@ +Base Data +========= + +.. automodule:: dataprofiler.data_readers.base_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.csv_data.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.csv_data.rst.txt new file mode 100644 index 000000000..85a625d69 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.csv_data.rst.txt @@ -0,0 +1,7 @@ +CSV Data +======== + +.. automodule:: dataprofiler.data_readers.csv_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.data.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.data.rst.txt new file mode 100644 index 000000000..813e81805 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.data.rst.txt @@ -0,0 +1,7 @@ +Data +==== + +.. automodule:: dataprofiler.data_readers.data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.data_utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.data_utils.rst.txt new file mode 100644 index 000000000..309208b73 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.data_utils.rst.txt @@ -0,0 +1,7 @@ +Data Utils +========== + +.. automodule:: dataprofiler.data_readers.data_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.filepath_or_buffer.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.filepath_or_buffer.rst.txt new file mode 100644 index 000000000..89e78cc2d --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.filepath_or_buffer.rst.txt @@ -0,0 +1,7 @@ +Filepath Or Buffer +================== + +.. automodule:: dataprofiler.data_readers.filepath_or_buffer + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.graph_data.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.graph_data.rst.txt new file mode 100644 index 000000000..b8b982f79 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.graph_data.rst.txt @@ -0,0 +1,7 @@ +Graph Data +========== + +.. automodule:: dataprofiler.data_readers.graph_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.json_data.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.json_data.rst.txt new file mode 100644 index 000000000..ae0a51d13 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.json_data.rst.txt @@ -0,0 +1,7 @@ +JSON Data +========= + +.. 
automodule:: dataprofiler.data_readers.json_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.parquet_data.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.parquet_data.rst.txt new file mode 100644 index 000000000..dfdcbe4bb --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.parquet_data.rst.txt @@ -0,0 +1,7 @@ +Parquet Data +============ + +.. automodule:: dataprofiler.data_readers.parquet_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.rst.txt new file mode 100644 index 000000000..56d68fe83 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.rst.txt @@ -0,0 +1,30 @@ +Data Readers +============ + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.data_readers.avro_data + dataprofiler.data_readers.base_data + dataprofiler.data_readers.csv_data + dataprofiler.data_readers.data + dataprofiler.data_readers.data_utils + dataprofiler.data_readers.filepath_or_buffer + dataprofiler.data_readers.graph_data + dataprofiler.data_readers.json_data + dataprofiler.data_readers.parquet_data + dataprofiler.data_readers.structured_mixins + dataprofiler.data_readers.text_data + +.. automodule:: dataprofiler.data_readers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.structured_mixins.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.structured_mixins.rst.txt new file mode 100644 index 000000000..157e03d2c --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.structured_mixins.rst.txt @@ -0,0 +1,7 @@ +Structured Mixins +================= + +.. automodule:: dataprofiler.data_readers.structured_mixins + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.data_readers.text_data.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.data_readers.text_data.rst.txt new file mode 100644 index 000000000..6ac6b9648 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.data_readers.text_data.rst.txt @@ -0,0 +1,7 @@ +Text Data +========= + +.. automodule:: dataprofiler.data_readers.text_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.dp_logging.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.dp_logging.rst.txt new file mode 100644 index 000000000..d1c6c910d --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.dp_logging.rst.txt @@ -0,0 +1,7 @@ +Dp Logging +========== + +.. automodule:: dataprofiler.dp_logging + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.base_data_labeler.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.base_data_labeler.rst.txt new file mode 100644 index 000000000..839a74157 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.base_data_labeler.rst.txt @@ -0,0 +1,7 @@ +Base Data Labeler +================= + +.. 
automodule:: dataprofiler.labelers.base_data_labeler + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.base_model.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.base_model.rst.txt new file mode 100644 index 000000000..c4bc9b08b --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.base_model.rst.txt @@ -0,0 +1,7 @@ +Base Model +========== + +.. automodule:: dataprofiler.labelers.base_model + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.char_load_tf_model.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.char_load_tf_model.rst.txt new file mode 100644 index 000000000..7dad3aa71 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.char_load_tf_model.rst.txt @@ -0,0 +1,7 @@ +Char Load Tf Model +================== + +.. automodule:: dataprofiler.labelers.char_load_tf_model + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.character_level_cnn_model.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.character_level_cnn_model.rst.txt new file mode 100644 index 000000000..80113a935 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.character_level_cnn_model.rst.txt @@ -0,0 +1,7 @@ +Character Level Cnn Model +========================= + +.. automodule:: dataprofiler.labelers.character_level_cnn_model + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.classification_report_utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.classification_report_utils.rst.txt new file mode 100644 index 000000000..4c3624869 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.classification_report_utils.rst.txt @@ -0,0 +1,7 @@ +Classification Report Utils +=========================== + +.. automodule:: dataprofiler.labelers.classification_report_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.column_name_model.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.column_name_model.rst.txt new file mode 100644 index 000000000..e4a5bc988 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.column_name_model.rst.txt @@ -0,0 +1,7 @@ +Column Name Model +================= + +.. automodule:: dataprofiler.labelers.column_name_model + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.data_labelers.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.data_labelers.rst.txt new file mode 100644 index 000000000..6ac45d9e6 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.data_labelers.rst.txt @@ -0,0 +1,7 @@ +Data Labelers +============= + +.. automodule:: dataprofiler.labelers.data_labelers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.data_processing.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.data_processing.rst.txt new file mode 100644 index 000000000..58d572a29 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.data_processing.rst.txt @@ -0,0 +1,7 @@ +Data Processing +=============== + +.. 
automodule:: dataprofiler.labelers.data_processing + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.labeler_utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.labeler_utils.rst.txt new file mode 100644 index 000000000..f14cabd5c --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.labeler_utils.rst.txt @@ -0,0 +1,7 @@ +Labeler Utils +============= + +.. automodule:: dataprofiler.labelers.labeler_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.regex_model.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.regex_model.rst.txt new file mode 100644 index 000000000..e85772ad2 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.regex_model.rst.txt @@ -0,0 +1,7 @@ +Regex Model +=========== + +.. automodule:: dataprofiler.labelers.regex_model + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.rst.txt new file mode 100644 index 000000000..7d660273e --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.rst.txt @@ -0,0 +1,30 @@ +Labelers +======== + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.labelers.base_data_labeler + dataprofiler.labelers.base_model + dataprofiler.labelers.char_load_tf_model + dataprofiler.labelers.character_level_cnn_model + dataprofiler.labelers.classification_report_utils + dataprofiler.labelers.column_name_model + dataprofiler.labelers.data_labelers + dataprofiler.labelers.data_processing + dataprofiler.labelers.labeler_utils + dataprofiler.labelers.regex_model + dataprofiler.labelers.utils + +.. automodule:: dataprofiler.labelers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.12.0/html/_sources/dataprofiler.labelers.utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.labelers.utils.rst.txt new file mode 100644 index 000000000..a0486f9be --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.labelers.utils.rst.txt @@ -0,0 +1,7 @@ +Utils +===== + +.. automodule:: dataprofiler.labelers.utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.plugins.decorators.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.plugins.decorators.rst.txt new file mode 100644 index 000000000..2c9dfb41d --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.plugins.decorators.rst.txt @@ -0,0 +1,7 @@ +Decorators +========== + +.. automodule:: dataprofiler.plugins.decorators + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.plugins.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.plugins.rst.txt new file mode 100644 index 000000000..8547dba2b --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.plugins.rst.txt @@ -0,0 +1,20 @@ +Plugins +======= + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.plugins.decorators + +.. 
automodule:: dataprofiler.plugins + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.base_column_profilers.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.base_column_profilers.rst.txt new file mode 100644 index 000000000..13cab9ff4 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.base_column_profilers.rst.txt @@ -0,0 +1,7 @@ +Base Column Profilers +===================== + +.. automodule:: dataprofiler.profilers.base_column_profilers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.categorical_column_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.categorical_column_profile.rst.txt new file mode 100644 index 000000000..a525d86c8 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.categorical_column_profile.rst.txt @@ -0,0 +1,7 @@ +Categorical Column Profile +========================== + +.. automodule:: dataprofiler.profilers.categorical_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.column_profile_compilers.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.column_profile_compilers.rst.txt new file mode 100644 index 000000000..9599deb9a --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.column_profile_compilers.rst.txt @@ -0,0 +1,7 @@ +Column Profile Compilers +======================== + +.. automodule:: dataprofiler.profilers.column_profile_compilers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.data_labeler_column_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.data_labeler_column_profile.rst.txt new file mode 100644 index 000000000..282408931 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.data_labeler_column_profile.rst.txt @@ -0,0 +1,7 @@ +Data Labeler Column Profile +=========================== + +.. automodule:: dataprofiler.profilers.data_labeler_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.datetime_column_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.datetime_column_profile.rst.txt new file mode 100644 index 000000000..d4467634f --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.datetime_column_profile.rst.txt @@ -0,0 +1,7 @@ +Datetime Column Profile +======================= + +.. automodule:: dataprofiler.profilers.datetime_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.float_column_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.float_column_profile.rst.txt new file mode 100644 index 000000000..d23bb4336 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.float_column_profile.rst.txt @@ -0,0 +1,7 @@ +Float Column Profile +==================== + +.. 
automodule:: dataprofiler.profilers.float_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.graph_profiler.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.graph_profiler.rst.txt new file mode 100644 index 000000000..196194771 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.graph_profiler.rst.txt @@ -0,0 +1,7 @@ +Graph Profiler +============== + +.. automodule:: dataprofiler.profilers.graph_profiler + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.helpers.report_helpers.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.helpers.report_helpers.rst.txt new file mode 100644 index 000000000..c20d1f391 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.helpers.report_helpers.rst.txt @@ -0,0 +1,7 @@ +Report Helpers +============== + +.. automodule:: dataprofiler.profilers.helpers.report_helpers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.helpers.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.helpers.rst.txt new file mode 100644 index 000000000..b82f660b5 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.helpers.rst.txt @@ -0,0 +1,20 @@ +Helpers +======= + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.profilers.helpers.report_helpers + +.. automodule:: dataprofiler.profilers.helpers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.histogram_utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.histogram_utils.rst.txt new file mode 100644 index 000000000..039bb08d8 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.histogram_utils.rst.txt @@ -0,0 +1,7 @@ +Histogram Utils +=============== + +.. automodule:: dataprofiler.profilers.histogram_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.int_column_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.int_column_profile.rst.txt new file mode 100644 index 000000000..b6d8d7921 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.int_column_profile.rst.txt @@ -0,0 +1,7 @@ +Int Column Profile +================== + +.. automodule:: dataprofiler.profilers.int_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.json_decoder.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.json_decoder.rst.txt new file mode 100644 index 000000000..761551996 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.json_decoder.rst.txt @@ -0,0 +1,7 @@ +JSON Decoder +============ + +.. automodule:: dataprofiler.profilers.json_decoder + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.json_encoder.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.json_encoder.rst.txt new file mode 100644 index 000000000..bef659ecd --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.json_encoder.rst.txt @@ -0,0 +1,7 @@ +JSON Encoder +============ + +.. 
automodule:: dataprofiler.profilers.json_encoder + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.numerical_column_stats.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.numerical_column_stats.rst.txt new file mode 100644 index 000000000..a8b6ac226 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.numerical_column_stats.rst.txt @@ -0,0 +1,7 @@ +Numerical Column Stats +====================== + +.. automodule:: dataprofiler.profilers.numerical_column_stats + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.order_column_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.order_column_profile.rst.txt new file mode 100644 index 000000000..7b1605659 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.order_column_profile.rst.txt @@ -0,0 +1,7 @@ +Order Column Profile +==================== + +.. automodule:: dataprofiler.profilers.order_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.profile_builder.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.profile_builder.rst.txt new file mode 100644 index 000000000..d73e6d9a2 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.profile_builder.rst.txt @@ -0,0 +1,7 @@ +Profile Builder +=============== + +.. automodule:: dataprofiler.profilers.profile_builder + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.profiler_options.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.profiler_options.rst.txt new file mode 100644 index 000000000..127e8e1ad --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.profiler_options.rst.txt @@ -0,0 +1,7 @@ +Profiler Options +================ + +.. automodule:: dataprofiler.profilers.profiler_options + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.profiler_utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.profiler_utils.rst.txt new file mode 100644 index 000000000..5e9c6533c --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.profiler_utils.rst.txt @@ -0,0 +1,7 @@ +Profiler Utils +============== + +.. automodule:: dataprofiler.profilers.profiler_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.rst.txt new file mode 100644 index 000000000..7a5b45394 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.rst.txt @@ -0,0 +1,39 @@ +Profilers +========= + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + dataprofiler.profilers.helpers + +.. 
toctree:: + :maxdepth: 4 + + dataprofiler.profilers.base_column_profilers + dataprofiler.profilers.categorical_column_profile + dataprofiler.profilers.column_profile_compilers + dataprofiler.profilers.data_labeler_column_profile + dataprofiler.profilers.datetime_column_profile + dataprofiler.profilers.float_column_profile + dataprofiler.profilers.graph_profiler + dataprofiler.profilers.histogram_utils + dataprofiler.profilers.int_column_profile + dataprofiler.profilers.json_decoder + dataprofiler.profilers.json_encoder + dataprofiler.profilers.numerical_column_stats + dataprofiler.profilers.order_column_profile + dataprofiler.profilers.profile_builder + dataprofiler.profilers.profiler_options + dataprofiler.profilers.profiler_utils + dataprofiler.profilers.text_column_profile + dataprofiler.profilers.unstructured_labeler_profile + dataprofiler.profilers.unstructured_text_profile + +.. automodule:: dataprofiler.profilers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.text_column_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.text_column_profile.rst.txt new file mode 100644 index 000000000..097e6e02c --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.text_column_profile.rst.txt @@ -0,0 +1,7 @@ +Text Column Profile +=================== + +.. automodule:: dataprofiler.profilers.text_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.unstructured_labeler_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.unstructured_labeler_profile.rst.txt new file mode 100644 index 000000000..c49f68004 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.unstructured_labeler_profile.rst.txt @@ -0,0 +1,7 @@ +Unstructured Labeler Profile +============================ + +.. automodule:: dataprofiler.profilers.unstructured_labeler_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.unstructured_text_profile.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.unstructured_text_profile.rst.txt new file mode 100644 index 000000000..27b56ea93 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.unstructured_text_profile.rst.txt @@ -0,0 +1,7 @@ +Unstructured Text Profile +========================= + +.. automodule:: dataprofiler.profilers.unstructured_text_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.profilers.utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.profilers.utils.rst.txt new file mode 100644 index 000000000..ddae85e63 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.profilers.utils.rst.txt @@ -0,0 +1,7 @@ +Utils +===== + +.. automodule:: dataprofiler.profilers.utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.reports.graphs.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.reports.graphs.rst.txt new file mode 100644 index 000000000..3a7adf900 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.reports.graphs.rst.txt @@ -0,0 +1,7 @@ +Graphs +====== + +.. 
automodule:: dataprofiler.reports.graphs + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.reports.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.reports.rst.txt new file mode 100644 index 000000000..59525629c --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.reports.rst.txt @@ -0,0 +1,21 @@ +Reports +======= + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.reports.graphs + dataprofiler.reports.utils + +.. automodule:: dataprofiler.reports + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.12.0/html/_sources/dataprofiler.reports.utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.reports.utils.rst.txt new file mode 100644 index 000000000..97e26cde0 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.reports.utils.rst.txt @@ -0,0 +1,7 @@ +Utils +===== + +.. automodule:: dataprofiler.reports.utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.rng_utils.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.rng_utils.rst.txt new file mode 100644 index 000000000..e4b84b46a --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.rng_utils.rst.txt @@ -0,0 +1,7 @@ +Rng Utils +========= + +.. automodule:: dataprofiler.rng_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.rst.txt new file mode 100644 index 000000000..c95eaad12 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.rst.txt @@ -0,0 +1,29 @@ +Dataprofiler +============ + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + dataprofiler.data_readers + dataprofiler.labelers + dataprofiler.plugins + dataprofiler.profilers + dataprofiler.reports + dataprofiler.validators + +.. toctree:: + :maxdepth: 4 + + dataprofiler.dp_logging + dataprofiler.rng_utils + dataprofiler.settings + dataprofiler.version + +.. automodule:: dataprofiler + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.12.0/html/_sources/dataprofiler.settings.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.settings.rst.txt new file mode 100644 index 000000000..81c664c07 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.settings.rst.txt @@ -0,0 +1,7 @@ +Settings +======== + +.. automodule:: dataprofiler.settings + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.validators.base_validators.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.validators.base_validators.rst.txt new file mode 100644 index 000000000..b8f328736 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.validators.base_validators.rst.txt @@ -0,0 +1,7 @@ +Base Validators +=============== + +.. automodule:: dataprofiler.validators.base_validators + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/dataprofiler.validators.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.validators.rst.txt new file mode 100644 index 000000000..704dfee21 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.validators.rst.txt @@ -0,0 +1,20 @@ +Validators +========== + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.validators.base_validators + +.. 
automodule:: dataprofiler.validators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.12.0/html/_sources/dataprofiler.version.rst.txt b/docs/0.12.0/html/_sources/dataprofiler.version.rst.txt new file mode 100644 index 000000000..3977b6379 --- /dev/null +++ b/docs/0.12.0/html/_sources/dataprofiler.version.rst.txt @@ -0,0 +1,7 @@ +Version +======= + +.. automodule:: dataprofiler.version + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/examples.rst.txt b/docs/0.12.0/html/_sources/examples.rst.txt new file mode 100644 index 000000000..3637da6ac --- /dev/null +++ b/docs/0.12.0/html/_sources/examples.rst.txt @@ -0,0 +1,24 @@ +.. _examples: + +Examples +******** + +These examples provide a more in-depth look into the details of the ``Data Profiler`` library. + +Basics +------ + +.. toctree:: + :maxdepth: 0 + + Overview of Data Profiler + Data Reader + Structured Profiler + Unstructured Profiler + Graph Profiler + Labeler + Adding Models to a Labeler Pipeline + Creating a Regex Labeler + Creating a ColumnName Labeler + Merge Profile List + Dataloader with Popmon Reports diff --git a/docs/0.12.0/html/_sources/graph_data_demo.nblink.txt b/docs/0.12.0/html/_sources/graph_data_demo.nblink.txt new file mode 100644 index 000000000..e627c61f8 --- /dev/null +++ b/docs/0.12.0/html/_sources/graph_data_demo.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/graph_data_demo.ipynb" +} diff --git a/docs/0.12.0/html/_sources/graphs.rst.txt b/docs/0.12.0/html/_sources/graphs.rst.txt new file mode 100644 index 000000000..23c2d316b --- /dev/null +++ b/docs/0.12.0/html/_sources/graphs.rst.txt @@ -0,0 +1,196 @@ +.. _reports: + +Graphs +****** + +Graph Your Data +=============== + +We can plot some of our data as seaborn histogram plots. + +The following plots are currently available to work directly with your profilers: + + * histogram (numeric columns only) + * missing values matrix + +The examples below show how to generate each of them. + +What we need to import +~~~~~~~~~~~~~~~~~~~~~~ +.. code-block:: python + + from dataprofiler.reports import graphs + +The main plotting functions live in `graphs`. **You will also need the `dataprofiler[reports]` requirement to be installed**: + +.. code-block:: console + + pip install 'dataprofiler[reports]' + +Plotting from a StructuredProfiler class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With a StructuredProfiler class variable, we can specify which columns we want plotted and plot them as histograms. + +.. code-block:: python + + graphs.plot_histograms(profiler, column_names, column_inds) + +These are what the variables mean: + + * **profiler** - StructuredProfiler class variable that contains the data we want + * **column_names** - (Optional) The list of IntColumn or FloatColumn *names* we want to specifically plot. If specified, `column_inds` cannot be specified. + * **column_inds** - (Optional) The list of IntColumn or FloatColumn *indexes* we want to specifically plot. If specified, `column_names` cannot be specified. + + +Additionally, we can also plot the missing values matrix for a StructuredProfiler: + +.. code-block:: python + + graphs.plot_missing_values_matrix(profiler, ax, title) + +These are what the variables mean: + + * **profiler** - StructuredProfiler class variable that contains the data we want + * **ax** - (Optional) MatPlotLib Axes to plot the matrix within. 
+ * **title** - (Optional) The title of the axes we want to define. + + +Plotting an individual IntColumn or FloatColumn +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With a column's Int or Float profile, we can plot their respective histograms. + +.. code-block:: python + + graphs.plot_col_histogram(column, axes, title) + +These are what the variables mean: + + * **column** - The IntColumn or FloatColumn we want to plot + * **axes** - (Optional) The MatPlotLib Axes to plot the histogram within. + * **title** - (Optional) The title of the axes we want to define. + + +Additionally, we can also plot the missing values bar graph for any column profile: + +.. code-block:: python + + graphs.plot_col_missing_values(profiler, ax, title) + +These are what the variables mean: + + * **profiler** - The StructuredColProfiler we want to plot + * **ax** - (Optional) MatPlotLib Axes to plot the matrix within. + * **title** - (Optional) The title of the axes we want to define. + +Examples +~~~~~~~~ + +Histograms +---------- + +1. This example demonstrates how we can take a StructuredProfiler class and plot histograms of the specified columns. + +.. code-block:: python + + import dataprofiler as dp + from dataprofiler.reports import graphs + + + data = [[1, 'a', 1.0], + [2, 'b', 2.2], + [3, 'c', 3.5], + [None, 'd', 10.0]] + profiler = dp.StructuredProfiler(data) + + # This will plot all IntColumn and FloatColumn as histograms (the first and last columns). + fig = graphs.plot_histograms(profiler) + fig.show() + + # This will only plot the specified column, 0. + column_names = [0] + fig = graphs.plot_histograms(profiler, column_names) + fig.show() + +.. image:: _static/images/histogram_example_0.png + :alt: First Histogram Example Image + +.. image:: _static/images/histogram_example_1.png + :alt: Second Histogram Example Image + +2. This example demonstrates how we can plot a low level profiler. + +.. code-block:: python + + import pandas as pd + + from dataprofiler.profilers import IntColumn + from dataprofiler.reports import graphs + + + data = pd.Series([1, 2, 3], dtype=str) + profiler = IntColumn('example') + profiler.update(data) + + # Plot the axes + ax = graphs.plot_col_histogram(profiler) + + # get and show the figure of the plotted histogram + fig = ax.get_figure() + fig.show() + +.. image:: _static/images/histogram_example_2.png + :alt: Histogram Column Only Example Image + + +Missing Values Matrix +--------------------- + +1. This example demonstrates how we can take a StructuredProfiler class and plot a missing values matrix. + +.. code-block:: python + + import pandas as pd + + import dataprofiler as dp + from dataprofiler.reports import graphs + + + data = pd.DataFrame( + [[None, '', 1.0, '1/2/2021'], + [3, None, 3.5, ''], + [1, None, 1.0, '2/5/2020'], + [None, 1, 10.0, '3/5/2020']], + columns=['integer', 'str', 'float', 'datetime'], + dtype=object + ) + profiler = dp.StructuredProfiler(data) + + # This will plot the missing values matrix for all columns. + fig = graphs.plot_missing_values_matrix(profiler) + fig.show() + +.. image:: _static/images/missing_value_matrix_example_0.png + :alt: Missing Values Matrix Example Image + +2. This example demonstrates how we can plot a bar chart of a column's missing values. + +.. 
code-block:: python + + import pandas as pd + + from dataprofiler.profilers.profile_builder import StructuredColProfiler + from dataprofiler.reports import graphs + + + data = pd.Series([1, 2, 3, None, None, 4], name='example', dtype=str) + profiler = StructuredColProfiler(data) + + # Plot the axes, can be a list of multiple columns + ax = graphs.plot_col_missing_values([profiler]) + + # get and show the figure of the plotted histogram + fig = ax.get_figure() + fig.show() + +.. image:: _static/images/missing_value_barchart_example_0.png + :alt: Missing Values Column Only Example Image \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/index.rst.txt b/docs/0.12.0/html/_sources/index.rst.txt new file mode 100644 index 000000000..a20aa5ff0 --- /dev/null +++ b/docs/0.12.0/html/_sources/index.rst.txt @@ -0,0 +1,605 @@ +.. _Data Profiler: + +==================================== +Data Profiler | What's in your data? +==================================== + +Purpose +======= + +The DataProfiler is a Python library designed to make data analysis, monitoring and **sensitive data detection** easy. + +Loading **Data** with a single command, the library automatically formats & loads files into a DataFrame. **Profiling** the Data, the library identifies the schema, statistics, entities and more. Data Profiles can then be used in downstream applications or reports. + +The Data Profiler comes with a cutting edge pre-trained deep learning model, used to efficiently identify **sensitive data** (or **PII**). If customization is needed, it's easy to add new entities to the existing pre-trained model or insert a new pipeline for entity recognition. + +The best part? Getting started only takes a few lines of code (`Example CSV`_): + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text + print(data.data.head(5)) # Access data directly via a compatible Pandas DataFrame + + profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc + readable_report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(readable_report, indent=4)) + + +To install the full package from pypi: + +.. code-block:: console + + pip install DataProfiler[ml] + +If the ML requirements are too strict (say, you don't want to install tensorflow), you can install a slimmer package. The slimmer package disables the default sensitive data detection / entity recognition (labler) + +Install from pypi: + +.. code-block:: console + + pip install DataProfiler + +If you have suggestions or find a bug, please open an `issue`_. + +Visit the :ref:`API` to explore Data Profiler's terminology. + + +What is a Data Profile? +======================= + +In the case of this library, a data profile is a dictionary containing statistics and predictions about the underlying dataset. There are "global statistics" or `global_stats`, which contain dataset level data and there are "column/row level statistics" or `data_stats` (each column is a new key-value entry). + +The format for a structured profile is below: + +.. 
code-block:: python + + "global_stats": { + "samples_used": int, + "column_count": int, + "row_count": int, + "row_has_null_ratio": float, + "row_is_null_ratio": float, + "unique_row_ratio": float, + "duplicate_row_count": int, + "file_type": string, + "encoding": string, + "correlation_matrix": list[list[int]], (*) + "chi2_matrix": list[list[float]], + "profile_schema": dict[string, list[int]] + }, + "data_stats": [ + { + "column_name": string, + "data_type": string, + "data_label": string, + "categorical": bool, + "order": string, + "samples": list[str], + "statistics": { + "sample_size": int, + "null_count": int, + "null_types": list[string], + "null_types_index": dict[string, list[int]], + "data_type_representation": dict[string, list[string]], + "min": [null, float], + "max": [null, float], + "sum": float, + "mode": list[float], + "median": float, + "median_absolute_deviation": float, + "mean": float, + "variance": float, + "stddev": float, + "skewness": float, + "kurtosis": float, + "num_zeros": int, + "num_negatives": int, + "histogram": { + "bin_counts": list[int], + "bin_edges": list[float], + }, + "quantiles": { + int: float + }, + "vocab": list[char], + "avg_predictions": dict[string, float], + "data_label_representation": dict[string, float], + "categories": list[str], + "unique_count": int, + "unique_ratio": float, + "categorical_count": dict[string, int], + "gini_impurity": float, + "unalikeability": float, + "precision": { + 'min': int, + 'max': int, + 'mean': float, + 'var': float, + 'std': float, + 'sample_size': int, + 'margin_of_error': float, + 'confidence_level': float + }, + "times": dict[string, float], + "format": string + }, + "null_replication_metrics": { + "class_prior": list[int], + "class_sum": list[list[int]], + "class_mean": list[list[int]] + } + } + ] + +(*) Currently the correlation matrix update is toggled off. It will be reset in a later update. Users can still use it as desired with the is_enable option set to True. + +The format for an unstructured profile is below: + +.. code-block:: python + + "global_stats": { + "samples_used": int, + "empty_line_count": int, + "file_type": string, + "encoding": string, + "memory_size": float, # in MB + }, + "data_stats": { + "data_label": { + "entity_counts": { + "word_level": dict[string, int], + "true_char_level": dict[string, int], + "postprocess_char_level": dict[string, int] + }, + "entity_percentages": { + "word_level": dict[string, float], + "true_char_level": dict[string, float], + "postprocess_char_level": dict[string, float] + }, + "times": dict[string, float] + }, + "statistics": { + "vocab": list[char], + "vocab_count": dict[string, int], + "words": list[string], + "word_count": dict[string, int], + "times": dict[string, float] + } + } + +The format for a graph profile is below: + +.. code-block:: python + + "num_nodes": int, + "num_edges": int, + "categorical_attributes": list[string], + "continuous_attributes": list[string], + "avg_node_degree": float, + "global_max_component_size": int, + "continuous_distribution": { + "": { + "name": string, + "scale": float, + "properties": list[float, np.array] + }, + "": None, + }, + "categorical_distribution": { + "": None, + "": { + "bin_counts": list[int], + "bin_edges": list[float] + }, + }, + "times": dict[string, float] + +Supported Data Formats +~~~~~~~~~~~~~~~~~~~~~~ + +* Any delimited file (CSV, TSV, etc.) 
+* JSON object +* Avro file +* Parquet file +* Text file +* Pandas DataFrame +* A URL that points to one of the supported file types above + + +Data Labels +~~~~~~~~~~~ + +*Data Labels* are determined per cell for structured data (column/row when the *profiler* is used) or at the character level for unstructured data. + +* UNKNOWN +* ADDRESS +* BAN (bank account number, 10-18 digits) +* CREDIT_CARD +* EMAIL_ADDRESS +* UUID +* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.) +* IPV4 +* IPV6 +* MAC_ADDRESS +* PERSON +* PHONE_NUMBER +* SSN +* URL +* US_STATE +* DRIVERS_LICENSE +* DATE +* TIME +* DATETIME +* INTEGER +* FLOAT +* QUANTITY +* ORDINAL + + +Get Started +=========== + +Load a File +~~~~~~~~~~~ + +The profiler should automatically identify the file type and load the data into a `Data Class`. + +Along with other attributes, the `Data` class enables structured data to be accessed via a valid Pandas DataFrame. + +.. code-block:: python + + # Load a csv file, return a CSVData object + csv_data = Data('your_file.csv') + + # Print the first 10 rows of the csv file + print(csv_data.data.head(10)) + + # Load a parquet file, return a ParquetData object + parquet_data = Data('your_file.parquet') + + # Sort the data by the name column + parquet_data.data.sort_values(by='name', inplace=True) + + # Print the sorted first 10 rows of the parquet data + print(parquet_data.data.head(10)) + + +If the file type is not automatically identified (rare), you can specify it +explicitly; see the Data Readers section. + +Profile a File +~~~~~~~~~~~~~~ + +This example uses a CSV file, but JSON, Avro, Parquet, or Text files should also work. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load file (CSV should be automatically identified) + data = Data("your_file.csv") + + # Profile the dataset + profile = Profiler(data) + + # Generate a report and use json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + + # Print the report + print(json.dumps(report, indent=4)) + +Updating Profiles +~~~~~~~~~~~~~~~~~ + +Currently, the data profiler is equipped to update its profile in batches. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load and profile a CSV file + data = Data("your_file.csv") + profile = Profiler(data) + + # Update the profile with new data: + new_data = Data("new_data.csv") + profile.update_profile(new_data) + + # Print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + +Merging Profiles +~~~~~~~~~~~~~~~~ + +If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator. + +This also enables profiles to be determined in a distributed manner. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file with a schema + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file with the same schema + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + profile3 = profile1 + profile2 + + # Print the report using json to prettify. + report = profile3.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + +Profile a Pandas DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import pandas as pd + import dataprofiler as dp + import json + + my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]]) + profile = dp.Profiler(my_dataframe) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + # read a specified column, in this case it is labeled 0: + print(json.dumps(report["data stats"][0], indent=4)) + + +Unstructured Profiler +~~~~~~~~~~~~~~~~~~~~~ + +In addition to the structured profiler, the Data Profiler provides unstructured +profiling for the TextData object or string. Unstructured profiling also works +with list(string), pd.Series(string) or pd.DataFrame(string) given profiler_type +option specified as `unstructured`. Below is an example of unstructured profile +with a text file. + +.. code-block:: python + + import dataprofiler as dp + import json + my_text = dp.Data('text_file.txt') + profile = dp.Profiler(my_text) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + +Another example of unstructured profile with pd.Series of string is given as below + +.. code-block:: python + + import dataprofiler as dp + import pandas as pd + import json + + text_data = pd.Series(['first string', 'second string']) + profile = dp.Profiler(text_data, profiler_type="unstructured") + + # print the report using json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(report, indent=4)) + + +Graph Profiler +~~~~~~~~~~~~~~ + +DataProfiler also provides the ability to profile graph data from a csv file. Below is an example of the graph profiler with a graph data csv file: + +.. code-block:: python + + import dataprofiler as dp + import pprint + + my_graph = dp.Data('graph_file.csv') + profile = dp.Profiler(my_graph) + + # print the report using pretty print (json dump does not work on numpy array values inside dict) + report = profile.report() + printer = pprint.PrettyPrinter(sort_dicts=False, compact=True) + printer.pprint(report) + + +Specifying a Filetype or Delimiter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a CSV data type, with a `,` delimiter. +In addition, it utilizes only the first 10,000 rows. + +.. code-block:: python + + import json + import os + from dataprofiler import Data, Profiler + from dataprofiler.data_readers.csv_data import CSVData + + # Load a CSV file, with "," as the delimiter + data = CSVData("your_file.csv", options={"delimiter": ","}) + + # Split the data, such that only the first 10,000 rows are used + data = data.data[0:10000] + + # Read in profile and print results + profile = Profiler(data) + print(json.dumps(profile.report(report_options={"output_format":"pretty"}), indent=4)) + + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Getting Started: + + Intro + install.rst + data_readers.rst + profiler.rst + data_labeling.rst + graphs.rst + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: User Guide: + + examples.rst + API.rst + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Community: + + roadmap.rst + Changelog + Feedback + GitHub + Contributing + +.. _Example CSV: https://raw.githubusercontent.com/capitalone/DataProfiler/main/dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv +.. 
_issue: https://github.com/capitalone/DataProfiler/issues/new/choose + +Versions +======== +* `0.12.0`_ +* `0.11.0`_ +* `0.10.9`_ +* `0.10.8`_ +* `0.10.7`_ +* `0.10.6`_ +* `0.10.5`_ +* `0.10.4`_ +* `0.10.3`_ +* `0.10.2`_ +* `0.10.1`_ +* `0.10.0`_ +* `0.9.0`_ +* `0.8.9`_ +* `0.8.8`_ +* `0.8.7`_ +* `0.8.6`_ +* `0.8.5`_ +* `0.8.4`_ +* `0.8.3`_ +* `0.8.2`_ +* `0.8.1`_ +* `0.8.0`_ +* `0.7.11`_ +* `0.7.10`_ +* `0.7.9`_ +* `0.7.8`_ +* `0.7.7`_ +* `0.7.6`_ +* `0.7.2`_ +* `0.7.1`_ +* `0.7.0`_ +* `0.6.0`_ +* `0.5.3`_ +* `0.5.2`_ +* `0.5.1`_ +* `0.5.0`_ +* `0.4.7`_ +* `0.4.6`_ +* `0.4.5`_ +* `0.4.4`_ +* `0.4.3`_ +* `0.3.0`_ + +.. _0.3.0: ../../v0.3/html/index.html +.. _0.4.3: ../../0.4.3/html/index.html + +.. _0.4.4: ../../0.4.4/html/index.html + +.. _0.4.5: ../../0.4.5/html/index.html + +.. _0.4.6: ../../0.4.6/html/index.html + +.. _0.4.7: ../../0.4.7/html/index.html + +.. _0.5.0: ../../0.5.0/html/index.html + +.. _0.5.1: ../../0.5.1/html/index.html + +.. _0.5.2: ../../0.5.2/html/index.html + +.. _0.5.3: ../../0.5.3/html/index.html +.. _0.6.0: ../../0.6.0/html/index.html + +.. _0.7.0: ../../0.7.0/html/index.html + +.. _0.7.1: ../../0.7.1/html/index.html +.. _0.7.2: ../../0.7.2/html/index.html + +.. _0.7.6: ../../0.7.6/html/index.html + +.. _0.7.7: ../../0.7.7/html/index.html + +.. _0.7.8: ../../0.7.8/html/index.html + +.. _0.7.9: ../../0.7.9/html/index.html + +.. _0.7.10: ../../0.7.10/html/index.html + +.. _0.7.11: ../../0.7.11/html/index.html + +.. _0.8.0: ../../0.8.0/html/index.html + +.. _0.8.1: ../../0.8.1/html/index.html + +.. _0.8.2: ../../0.8.2/html/index.html + +.. _0.8.3: ../../0.8.3/html/index.html + +.. _0.8.4: ../../0.8.4/html/index.html + +.. _0.8.5: ../../0.8.5/html/index.html + +.. _0.8.6: ../../0.8.6/html/index.html + +.. _0.8.7: ../../0.8.7/html/index.html + +.. _0.8.8: ../../0.8.8/html/index.html + +.. _0.8.9: ../../0.8.9/html/index.html + +.. _0.9.0: ../../0.9.0/html/index.html + +.. _0.10.0: ../../0.10.0/html/index.html + +.. _0.10.1: ../../0.10.1/html/index.html + +.. _0.10.2: ../../0.10.2/html/index.html + +.. _0.10.3: ../../0.10.3/html/index.html + +.. _0.10.4: ../../0.10.4/html/index.html + +.. _0.10.5: ../../0.10.5/html/index.html + +.. _0.10.6: ../../0.10.6/html/index.html + +.. _0.10.7: ../../0.10.7/html/index.html + +.. _0.10.8: ../../0.10.8/html/index.html + +.. _0.10.9: ../../0.10.9/html/index.html + +.. _0.11.0: ../../0.11.0/html/index.html + +.. _0.12.0: ../../0.12.0/html/index.html + diff --git a/docs/0.12.0/html/_sources/install.rst.txt b/docs/0.12.0/html/_sources/install.rst.txt new file mode 100644 index 000000000..bdf4c3bb4 --- /dev/null +++ b/docs/0.12.0/html/_sources/install.rst.txt @@ -0,0 +1,145 @@ +.. _install: + +Install +******* + +To install the full package from pypi: + +.. code-block:: console + + pip install DataProfiler[ml] + +If the ML requirements are too strict (say, you don't want to install +tensorflow), you can install a slimmer package. The slimmer package disables +the default sensitive data detection / entity recognition (labler) + +Install from pypi: + +.. code-block:: console + + pip install DataProfiler + +Snappy Installation +=================== + +This is required to profile parquet/avro datasets + +MacOS (intel chip) with homebrew: + +.. code-block:: console + + brew install snappy && CPPFLAGS="-I/usr/local/include -L/usr/local/lib" pip install python-snappy + + +MacOS (apple chip) with homebrew: + +.. 
code-block:: console + + brew install snappy && CPPFLAGS="-I/opt/homebrew/include -L/opt/homebrew/lib" pip install python-snappy + + +Linux install: + +.. code-block:: console + + sudo apt-get -y install libsnappy-dev + + +Build From Scratch +================== + +NOTE: Installation for python3 + +virtualenv install: + +.. code-block:: console + + python3 -m pip install virtualenv + + +Setup virtual env: + +.. code-block:: console + + python3 -m virtualenv --python=python3 venv3 + source venv3/bin/activate + + +Install requirements: + +.. code-block:: console + + pip3 install -r requirements.txt + +Install labeler dependencies: + +.. code-block:: console + + pip3 install -r requirements-ml.txt + + +Install via the repo -- Build setup.py and install locally: + +.. code-block:: console + + python3 setup.py sdist bdist bdist_wheel + pip3 install dist/DataProfiler*-py3-none-any.whl + + +If you see: + +.. code-block:: console + + ERROR: Double requirement given:dataprofiler==X.Y.Z from dataprofiler/dist/DataProfiler-X.Y.Z-py3-none-any.whl (already in dataprofiler==X2.Y2.Z2 from dataprofiler/dist/DataProfiler-X2.Y2.Z2-py3-none-any.whl, name='dataprofiler') + +This means that you have multiple versions of the DataProfiler distribution +in the dist folder. +To resolve, either remove the older one or delete the folder and rerun the steps +above. + +Install via github: + +.. code-block:: console + + pip3 install git+https://github.com/capitalone/dataprofiler.git#egg=dataprofiler + + + +Testing +======= + +For testing, install test requirements: + +.. code-block:: console + + pip3 install -r requirements-test.txt + + +To run all unit tests, use: + +.. code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py" + + +To run file of unit tests, use form: + +.. code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest discover -p test_profile_builder.py + + +To run a file with Pytest use: + +.. code-block:: console + + DATAPROFILER_SEED=0 pytest dataprofiler/tests/data_readers/test_csv_data.py -v + + +To run individual of unit test, use form: + +.. code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest dataprofiler.tests.profilers.test_profile_builder.TestProfiler + + diff --git a/docs/0.12.0/html/_sources/labeler.nblink.txt b/docs/0.12.0/html/_sources/labeler.nblink.txt new file mode 100644 index 000000000..bed6517bf --- /dev/null +++ b/docs/0.12.0/html/_sources/labeler.nblink.txt @@ -0,0 +1,6 @@ +{ + "path": "../../feature_branch/examples/labeler.ipynb", + "extra-media": [ + "../../feature_branch/examples/DL-Flowchart.png" + ] +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/merge_profile_list.nblink.txt b/docs/0.12.0/html/_sources/merge_profile_list.nblink.txt new file mode 100644 index 000000000..50698d637 --- /dev/null +++ b/docs/0.12.0/html/_sources/merge_profile_list.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/merge_profile_list.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/modules.rst.txt b/docs/0.12.0/html/_sources/modules.rst.txt new file mode 100644 index 000000000..0593459df --- /dev/null +++ b/docs/0.12.0/html/_sources/modules.rst.txt @@ -0,0 +1,7 @@ +dataprofiler +============ + +.. 
toctree:: + :maxdepth: 4 + + dataprofiler diff --git a/docs/0.12.0/html/_sources/overview.nblink.txt b/docs/0.12.0/html/_sources/overview.nblink.txt new file mode 100644 index 000000000..3d9f89d3d --- /dev/null +++ b/docs/0.12.0/html/_sources/overview.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/intro_data_profiler.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/popmon_dp_loader_example.nblink.txt b/docs/0.12.0/html/_sources/popmon_dp_loader_example.nblink.txt new file mode 100644 index 000000000..3beced0ab --- /dev/null +++ b/docs/0.12.0/html/_sources/popmon_dp_loader_example.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/popmon_dp_loader_example.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/profiler.rst.txt b/docs/0.12.0/html/_sources/profiler.rst.txt new file mode 100644 index 000000000..56d16a274 --- /dev/null +++ b/docs/0.12.0/html/_sources/profiler.rst.txt @@ -0,0 +1,965 @@ +.. _profiler: + +Profiler +******** + +Profile Your Data +================= + +Profiling your data is easy. Just use the data reader, send the data to the +profiler, and print out the report. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text + + profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc + + readable_report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(readable_report, indent=4)) + +If the data is structured, the profile will return global statistics as well as +column by column statistics. The vast amount of statistics are listed on the +intro page. + +Load a File +~~~~~~~~~~~ + +The profiler should automatically identify the file type and load the data into a `Data Class`. + +Along with other attributtes the `Data class` enables structured data to be accessed via a valid Pandas DataFrame. + +.. code-block:: python + + # Load a csv file, return a CSVData object + csv_data = Data('your_file.csv') + + # Print the first 10 rows of the csv file + print(csv_data.data.head(10)) + + # Load a parquet file, return a ParquetData object + parquet_data = Data('your_file.parquet') + + # Sort the data by the name column + parquet_data.data.sort_values(by='name', inplace=True) + + # Print the sorted first 10 rows of the parquet data + print(parquet_data.data.head(10)) + + +If the file type is not automatically identified (rare), you can specify them +specifically, see section Data Readers. + +Profile a File +~~~~~~~~~~~~~~ + +Example uses a CSV file for example, but CSV, JSON, Avro or Parquet should also work. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load file (CSV should be automatically identified) + data = Data("your_file.csv") + + # Profile the dataset + profile = Profiler(data) + + # Generate a report and use json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + + # Print the report + print(json.dumps(report, indent=4)) + +Updating Profiles +~~~~~~~~~~~~~~~~~ + +Currently, the data profiler is equipped to update its profile in batches. + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load and profile a CSV file + data = Data("your_file.csv") + profile = Profiler(data) + + # Update the profile with new data: + new_data = Data("new_data.csv") + profile.update_profile(new_data) + + # Print the report using json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Merging Profiles +~~~~~~~~~~~~~~~~ + +If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator. + +This also enables profiles to be determined in a distributed manner. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file with a schema + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file with the same schema + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + profile3 = profile1 + profile2 + + # Print the report using json to prettify. + report = profile3.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Profile Differences +~~~~~~~~~~~~~~~~~~~ + +Profile differences take two profiles and find the differences +between them. Create the difference report like this: + +.. code-block:: python + + from dataprofiler import Data, Profiler + + # Load a CSV file + data1 = Data("file_a.csv") + profile1 = Profiler(data1) + + # Load another CSV file + data2 = Data("file_b.csv") + profile2 = Profiler(data2) + + diff_report = profile1.diff(profile2) + print(diff_report) + +The `.diff()` operation is available between two profiles, although there are different +outputs depending on the type of profile being differenced. For example, for numerical +column profiles (e.g. integers and floats), three valuable calculations that +`.diff()` returns are `t-test`, `chi2-test`, and `psi` (Population Stability Index) +for understanding distributional changes. + +The difference report contains a dictionary that mirrors the profile report. +Each data type has its own difference: + +* **Int/Float** - One profile subtracts the value from the other. + +* **String** - The strings will be shown in a list: + + - [profile1 str, profile2 str] +* **List** - A list of 3 will be returned showing the unique values of + each profile and the shared values: + + - [profile 1 unique values, shared values, profile 2 unique values] +* **Dict** - Some dictionaries with varied keys will also return a list + of three in the format: + + - [profile 1 unique key-values, shared key differences, profile 2 unique key-values] + +Otherwise, when no differences occur: + +* **Any Type No Differences** - A string will report: "unchanged". + +Below is the structured difference report: + +.. 
code-block:: python + + { + 'global_stats': { + 'file_type': [str, str], + 'encoding': [str, str], + 'samples_used': int, + 'column_count': int, + 'row_count': int, + 'row_has_null_ratio': float, + 'row_is_null_ratio': float, + 'unique_row_ratio': float, + 'duplicate_row_count': int, + 'correlation_matrix': list[list[float]], + 'chi2_matrix': list[list[float]], + 'profile_schema': list[dict[str, int]] + }, + 'data_stats': [{ + 'column_name': str, + 'data_type': [str, str], + 'data_label': [list[str], list[str], list[str]], + 'categorical': [str, str], + 'order': [str, str], + 'statistics': { + 'min': float, + 'max': float, + 'sum': float, + 'mean': float, + 'median': float, + 'mode': [list[float], list[float], list[float]], + 'median_absolute_deviation': float, + 'variance': float, + 'stddev': float, + 't-test': { + 't-statistic': float, + 'conservative': {'deg_of_free': int, + 'p-value': float}, + 'welch': {'deg_of_free': float, + 'p-value': float}}, + 'psi': float, + "chi2-test": { + "chi2-statistic": float, + "deg_of_free": int, + "p-value": float + }, + 'unique_count': int, + 'unique_ratio': float, + 'categories': [list[str], list[str], list[str]], + 'gini_impurity': float, + 'unalikeability': float, + 'categorical_count': [dict[str, int], dict[str, int], dict[str, int]], + 'avg_predictions': [dict[str, float]], + 'label_representation': [dict[str, float]], + 'sample_size': int, + 'null_count': int, + 'null_types': [list[str], list[str], list[str]], + 'null_types_index': [dict[str, int], dict[str, int], dict[str, int]], + 'data_type_representation': [dict[str, float]] + }, + "null_replication_metrics": { + "class_prior": list[int], + "class_sum": list[list[int]], + "class_mean": list[list[int]] + } + } + +Below is the unstructured difference report: + +.. code-block:: python + + { + 'global_stats': { + 'file_type': [str, str], + 'encoding': [str, str], + 'samples_used': int, + 'empty_line_count': int, + 'memory_size': float + }, + 'data_stats': { + 'data_label': { + 'entity_counts': { + 'word_level': dict[str, int], + 'true_char_level': dict[str, int], + 'postprocess_char_level': dict[str, int] + }, + 'entity_percentages': { + 'word_level': dict[str, float], + 'true_char_level': dict[str, float], + 'postprocess_char_level': dict[str, float] + } + }, + 'statistics': { + 'vocab': [list[str], list[str], list[str]], + 'vocab_count': [dict[str, int], dict[str, int], dict[str, int]], + 'words': [list[str], list[str], list[str]], + 'word_count': [dict[str, int], dict[str, int], dict[str, int]] + } + } + } + + +Saving and Loading a Profile +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The profiles can easily be saved and loaded as shown below: + +**NOTE: Json saving and loading only supports Structured Profiles currently.** + +There are two save/load methods: + +* **Pickle save/load** + + * Save a profile as a `.pkl` file. + * Load a `.pkl` file as a profile object. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file, with "," as the delimiter + data = Data("your_file.csv") + + # Read data into profile + profile = Profiler(data) + + # save structured profile to pkl file + profile.save(filepath="my_profile.pkl") + + # load pkl file to structured profile + loaded_pkl_profile = dp.Profiler.load(filepath="my_profile.pkl") + + print(json.dumps(loaded_pkl_profile.report(report_options={"output_format": "compact"}), + indent=4)) + +* **Json save/load** + + * Save a profile as a human-readable `.json` file. + * Load a `.json` file as a profile object. + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load a CSV file, with "," as the delimiter + data = Data("your_file.csv") + + # Read data into profile + profile = Profiler(data) + + # save structured profile to json file + profile.save(filepath="my_profile.json", save_method="json") + + # load json file to structured profile + loaded_json_profile = dp.Profiler.load(filepath="my_profile.json", load_method="json") + + print(json.dumps(loaded_json_profile.report(report_options={"output_format": "compact"}), + indent=4)) + + +Structured vs Unstructured Profiles +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using the profiler, the data profiler will automatically infer whether to +create the structured profile or the unstructured profile. However, you can be +explicit as shown below: + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Creating a structured profile + data1 = Data("normal_csv_file.csv") + structured_profile = Profiler(data1, profiler_type="structured") + + structured_report = structured_profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(structured_report, indent=4)) + + # Creating an unstructured profile + data2 = Data("normal_text_file.txt") + unstructured_profile = Profiler(data2, profiler_type="unstructured") + + unstructured_report = unstructured_profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(unstructured_report, indent=4)) + + +Setting the Sample Size +~~~~~~~~~~~~~~~~~~~~~~~ + +There are two ways to set sample size in a profile: samples_per_update and +min_true_samples. Samples_per_update takes an integer as the exact amount that +will be sampled. Min_true_samples will set the minimum amount of samples that +are not null. For example: + +.. code-block:: python + + from dataprofiler import Profiler + + sample_array = [1.0, NULL, 2.0] + profile = dp.Profiler(sample_array, samples_per_update=2) + +The first two samples (1.0 and NULL) are used for the statistical analysis. + +In contrast, if we also set min_true_samples to 2 then the Data Reader will +continue to read until the minimum true samples were found for the given column. +For example: + +.. code-block:: python + + from dataprofiler import Profiler + + sample_array = [1.0, NULL, 2.0] + profile = dp.Profiler(sample_array, samples_per_update=2, min_true_samples=2) + +This will use all samples in the statistical analysis until the number of "true" +(non-NULL) values are reached. Both min_true_samples and +samples_per_update conditions must be met. In this case, the profile will grab +the first two samples (1.0 and NULL) to satisfy the samples_per_update, and then +it will grab the first two VALID samples (1.0 and 2.0) to satisfy the +min_true_samples. + +Profile a Pandas DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import pandas as pd + import dataprofiler as dp + import json + + my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]]) + profile = dp.Profiler(my_dataframe) + + # print the report using json to prettify. + report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + # read a specified column, in this case it is labeled 0: + print(json.dumps(report["data stats"][0], indent=4)) + + +Specifying a Filetype or Delimiter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a CSV data type, with a `,` delimiter. +In addition, it utilizes only the first 10,000 rows. + +.. 
code-block:: python + + import json + from dataprofiler import Data, Profiler + from dataprofiler.data_readers.csv_data import CSVData + + # Load a CSV file, with "," as the delimiter + data = CSVData("your_file.csv", options={"delimiter": ","}) + + # Split the data, such that only the first 10,000 rows are used + data = data.data[0:10000] + + # Read in profile and print results + profile = Profiler(data) + print(json.dumps(profile.report(report_options={"output_format": "pretty"}), indent=4)) + +Setting Profiler Seed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example of specifying a seed for reproducibility. + +.. code-block:: python + + import dataprofiler as dp + + # Set seed to non-negative integer value or None + dp.set_seed(0) + + +Profile Statistic Descriptions +============================== + +Structured Profile +~~~~~~~~~~~~~~~~~~ + +**global_stats**: + +* samples_used - number of input data samples used to generate this profile +* column_count - the number of columns contained in the input dataset +* row_count - the number of rows contained in the input dataset +* row_has_null_ratio - the proportion of rows that contain at least one null value to the total number of rows +* row_is_null_ratio - the proportion of rows that are fully comprised of null values (null rows) to the total number of rows +* unique_row_ratio - the proportion of distinct rows in the input dataset to the total number of rows +* duplicate_row_count - the number of rows that occur more than once in the input dataset +* file_type - the format of the file containing the input dataset (ex: .csv) +* encoding - the encoding of the file containing the input dataset (ex: UTF-8) +* correlation_matrix - matrix of shape `column_count` x `column_count` containing the correlation coefficients between each column in the dataset +* chi2_matrix - matrix of shape `column_count` x `column_count` containing the chi-square statistics between each column in the dataset +* profile_schema - a description of the format of the input dataset labeling each column and its index in the dataset + * string - the label of the column in question and its index in the profile schema +* times - the duration of time it took to generate the global statistics for this dataset in milliseconds + +**data_stats**: + +* column_name - the label/title of this column in the input dataset +* data_type - the primitive python data type that is contained within this column +* data_label - the label/entity of the data in this column as determined by the Labeler component +* categorical - 'true' if this column contains categorical data +* order - the way in which the data in this column is ordered, if any, otherwise “random” +* samples - a small subset of data entries from this column +* statistics - statistical information on the column + * sample_size - number of input data samples used to generate this profile + * null_count - the number of null entries in the sample + * null_types - a list of the different null types present within this sample + * null_types_index - a dict containing each null type and a respective list of the indicies that it is present within this sample + * data_type_representation - the percentage of samples used identifying as each data_type + * min - minimum value in the sample + * max - maximum value in the sample + * mode - mode of the entries in the sample + * median - median of the entries in the sample + * median_absolute_deviation - the median absolute deviation of the entries in the sample + * sum - the total of all sampled values 
from the column + * mean - the average of all entries in the sample + * variance - the variance of all entries in the sample + * stddev - the standard deviation of all entries in the sample + * skewness - the statistical skewness of all entries in the sample + * kurtosis - the statistical kurtosis of all entries in the sample + * num_zeros - the number of entries in this sample that have the value 0 + * num_negatives - the number of entries in this sample that have a value less than 0 + * histogram - contains histogram relevant information + * bin_counts - the number of entries within each bin + * bin_edges - the thresholds of each bin + * quantiles - the value at each percentile in the order they are listed based on the entries in the sample + * vocab - a list of the characters used within the entries in this sample + * avg_predictions - average of the data label prediction confidences across all data points sampled + * categories - a list of each distinct category within the sample if `categorial` = 'true' + * unique_count - the number of distinct entries in the sample + * unique_ratio - the proportion of the number of distinct entries in the sample to the total number of entries in the sample + * categorical_count - number of entries sampled for each category if `categorical` = 'true' + * gini_impurity - measure of how often a randomly chosen element from the set would be incorrectly labeled if it was randomly labeled according to the distribution of labels in the subset + * unalikeability - a value denoting how frequently entries differ from one another within the sample + * precision - a dict of statistics with respect to the number of digits in a number for each sample + * times - the duration of time it took to generate this sample's statistics in milliseconds + * format - list of possible datetime formats +* null_replication_metrics - statistics of data partitioned based on whether column value is null (index 1 of lists referenced by dict keys) or not (index 0) + * class_prior - a list containing probability of a column value being null and not null + * class_sum - a list containing sum of all other rows based on whether column value is null or not + * class_mean - a list containing mean of all other rows based on whether column value is null or not + +Unstructured Profile +~~~~~~~~~~~~~~~~~~~~ + +**global_stats**: + +* samples_used - number of input data samples used to generate this profile +* empty_line_count - the number of empty lines in the input data +* file_type - the file type of the input data (ex: .txt) +* encoding - file encoding of the input data file (ex: UTF-8) +* memory_size - size of the input data in MB +* times - duration of time it took to generate this profile in milliseconds + +**data_stats**: + +* data_label - labels and statistics on the labels of the input data + * entity_counts - the number of times a specific label or entity appears inside the input data + * word_level - the number of words counted within each label or entity + * true_char_level - the number of characters counted within each label or entity as determined by the model + * postprocess_char_level - the number of characters counted within each label or entity as determined by the postprocessor + * entity_percentages - the percentages of each label or entity within the input data + * word_level - the percentage of words in the input data that are contained within each label or entity + * true_char_level - the percentage of characters in the input data that are contained within each label or 
entity as determined by the model + * postprocess_char_level - the percentage of characters in the input data that are contained within each label or entity as determined by the postprocessor + * times - the duration of time it took for the data labeler to predict on the data +* statistics - statistics of the input data + * vocab - a list of each character in the input data + * vocab_count - the number of occurrences of each distinct character in the input data + * words - a list of each word in the input data + * word_count - the number of occurrences of each distinct word in the input data + * times - the duration of time it took to generate the vocab and words statistics in milliseconds + +Graph Profile +~~~~~~~~~~~~~~~~~~ + +* num_nodes - number of nodes in the graph +* num_edges - number of edges in the graph +* categorical_attributes - list of categorical edge attributes +* continuous_attributes - list of continuous edge attributes +* avg_node_degree - average degree of nodes in the graph +* global_max_component_size: size of the global max component + +**continuous_distribution**: + +* : name of N-th edge attribute in list of attributes + * name - name of distribution for attribute + * scale - negative log likelihood used to scale and compare distributions + * properties - list of statistical properties describing the distribution + * [shape (optional), loc, scale, mean, variance, skew, kurtosis] + +**categorical_distribution**: + +* : name of N-th edge attribute in list of attributes + * bin_counts: counts in each bin of the distribution histogram + * bin_edges: edges of each bin of the distribution histogram + +* times - duration of time it took to generate this profile in milliseconds + +Profile Options +=============== + +The data profiler accepts several options to toggle on and off +features. The 8 columns (int options, float options, datetime options, +text options, order options, category options, data labeler options) can be +enabled or disabled. By default, all options are toggled on. Below is an example +of how to alter these options. Options shared by structured and unstructured options +must be specified as structured or unstructured when setting (ie. datalabeler options). + + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler, ProfilerOptions + + # Load and profile a CSV file + data = Data("your_file.csv") + profile_options = ProfilerOptions() + + #All of these are different examples of adjusting the profile options + + # Options can be toggled directly like this: + profile_options.structured_options.text.is_enabled = False + profile_options.structured_options.text.vocab.is_enabled = True + profile_options.structured_options.int.variance.is_enabled = True + profile_options.structured_options.data_labeler.data_labeler_dirpath = \ + "Wheres/My/Datalabeler" + profile_options.structured_options.data_labeler.is_enabled = False + + # A dictionary can be sent in to set the properties for all the options + profile_options.set({"structured_options.data_labeler.is_enabled": False, "min.is_enabled": False}) + + # Specific columns can be set/disabled/enabled in the same way + profile_options.structured_options.text.set({"max.is_enabled":True, + "variance.is_enabled": True}) + + # numeric stats can be turned off/on entirely + profile_options.set({"is_numeric_stats_enabled": False}) + profile_options.set({"int.is_numeric_stats_enabled": False}) + + profile = Profiler(data, options=profile_options) + + # Print the report using json to prettify. 
+ report = profile.report(report_options={"output_format": "pretty"}) + print(json.dumps(report, indent=4)) + + +Below is an breakdown of all the options. + +* **ProfilerOptions** - The top-level options class that contains options for the Profiler class + + * **presets** - A pre-configured mapping of a string name to group of options: + + * **default is None** + + * **"complete"** + + .. code-block:: python + + options = ProfilerOptions(presets="complete") + + * **"data_types"** + + .. code-block:: python + + options = ProfilerOptions(presets="data_types") + + * **"numeric_stats_disabled"** + + .. code-block:: python + + options = ProfilerOptions(presets="numeric_stats_disabled") + + * **"lower_memory_sketching"** + + .. code-block:: python + + options = ProfilerOptions(presets="lower_memory_sketching") + + * **structured_options** - Options responsible for all structured data + + * **multiprocess** - Option to enable multiprocessing. If on, multiprocessing is toggled on if the dataset contains more than 750,000 rows or more than 20 columns. + Automatically selects the optimal number of pooling processes to utilize based on system constraints when toggled on. + + * is_enabled - (Boolean) Enables or disables multiprocessing + + * **sampling_ratio** - A percentage, as a decimal, ranging from greater than 0 to less than or equal to 1 indicating how much input data to sample. Default value set to 0.2. + + * **int** - Options for the integer columns + + * is_enabled - (Boolean) Enables or disables the integer operations + + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. 
+ methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. + * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **float** - Options for the float columns + + * is_enabled - (Boolean) Enables or disables the float operations + * precision - Finds the precision (significant figures) within the column + + * is_enabled - (Boolean) Enables or disables precision + * sample_ratio - (Float) The ratio of 0 to 1 how much data (identified as floats) to utilize as samples in determining precision + + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * is_numeric_stats_enabled - (Boolean) enable or disable all numeric stats + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. 
+ * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **text** - Options for the text columns + + * is_enabled - (Boolean) Enables or disables the text operations + * vocab - Finds all the unique characters used in a column + + * is_enabled - (Boolean) Enables or disables vocab + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * mode - Finds mode(s) in a column + + * is_enabled - (Boolean) Enables or disables mode + * top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes. + * median - Finds median value in a column + + * is_enabled - (Boolean) Enables or disables median + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * median_abs_deviation - Finds median absolute deviation of all values in a column + + * is_enabled - (Boolean) Enables or disables median absolute deviation + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * is_numeric_stats_enabled - (Boolean) enable or disable all numeric stats + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * num_quantiles - (Int) Number of quantiles to bin the data. + Default value is set to 1,000 quantiles. 
+ * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **datetime** - Options for the datetime columns + + * is_enabled - (Boolean) Enables or disables the datetime operations + * **order** - Options for the order columns + + * is_enabled - (Boolean) Enables or disables the order operations + * **category** - Options for the category columns + + * is_enabled - (Boolean) Enables or disables the category operations + * top_k_categories - (int) Number of categories to be displayed when reporting + * max_sample_size_to_check_stop_condition - (int) The maximum sample size before categorical stop conditions are checked + * stop_condition_unique_value_ratio - (float) The highest ratio of unique values to dataset size that is to be considered a categorical type + * cms - (Boolean) Enables or Disables the use of count min sketch / heavy hitters for approximate frequency counts + * cms_confidence - (float) Defines the number of hashes used in CMS, default 0.95 + * cms_relative_error - (float) Defines the number of buckets used in CMS, default 0.01 + * cms_max_num_heavy_hitters - (int) The value used to define the threshold for minimum frequency required by a category to be counted + * **data_labeler** - Options for the data labeler columns + + * is_enabled - (Boolean) Enables or disables the data labeler operations + * data_labeler_dirpath - (String) Directory path to data labeler + * data_labeler_object - (BaseDataLabeler) Datalabeler to replace + the default labeler + * max_sample_size - (Int) The max number of samples for the data + labeler + * **correlation** - Option set for correlation profiling + * is_enabled - (Boolean) Enables or disables performing correlation profiling + * columns - Columns considered to calculate correlation + * **row_statistics** - (Boolean) Option to enable/disable row statistics calculations + + * unique_count - (UniqueCountOptions) Option to enable/disable unique row count calculations + + * is_enabled - (Bool) Enables or disables options for unique row count + * hashing_method - (String) Property to specify row hashing method ("full" | "hll") + * hll - (HyperLogLogOptions) Options for alternative method of estimating unique row count (activated when `hll` is the selected hashing_method) + + * seed - (Int) Used to set HLL hashing function seed + * register_count - (Int) Number of registers is equal to 2^register_count + + * null_count - (Boolean) Option to enable/disable functionalities for row_has_null_ratio and row_is_null_ratio + * **chi2_homogeneity** - Options for the chi-squared test matrix + + * is_enabled - (Boolean) Enables or disables performing chi-squared tests for homogeneity between the categorical columns of the dataset. 
+ * **null_replication_metrics** - Options for calculating null replication metrics + + * is_enabled - (Boolean) Enables or disables calculation of null replication metrics + * **unstructured_options** - Options responsible for all unstructured data + + * **text** - Options for the text profile + + * is_case_sensitive - (Boolean) Specifies whether the profile is case sensitive + * stop_words - (List of Strings) List of stop words to be removed when profiling + * top_k_chars - (Int) Number of top characters to be retrieved when profiling + * top_k_words - (Int) Number of top words to be retrieved when profiling + * vocab - Options for vocab count + + * is_enabled - (Boolean) Enables or disables the vocab stats + * words - Options for word count + + * is_enabled - (Boolean) Enables or disables the word stats + * **data_labeler** - Options for the data labeler + + * is_enabled - (Boolean) Enables or disables the data labeler operations + * data_labeler_dirpath - (String) Directory path to data labeler + * data_labeler_object - (BaseDataLabeler) Data labeler to replace + the default labeler + * max_sample_size - (Int) The max number of samples for the data + labeler + + + +Statistical Dependency on Order of Updates +========================================== + +Some profile features/statistics are dependent on the order in which the profiler +is updated with new data. + +Order Profile +~~~~~~~~~~~~~ + +The order profiler utilizes the last value in the previous data batch to ensure +the subsequent dataset is above/below/equal to that value when predicting +non-random order. + +For instance, for a dataset to be predicted as ascending, each subsequent +batch update must itself be ascending and its first value must be `>=` the last +value of the previous batch. + +Ex. of ascending: + +.. code-block:: python + + batch_1 = [0, 1, 2] + batch_2 = [3, 4, 5] + +Ex. of random: + +.. code-block:: python + + batch_1 = [0, 1, 2] + batch_2 = [1, 2, 3] # notice how the first value is less than the last value in the previous batch + + +Reporting Structure +=================== + +For every profile, we can provide a report and customize it with a couple of optional parameters (a combined options-and-report sketch follows the format examples below): + +* output_format (string) + + * This allows the user to decide the output format for the report. + + * Options are one of [pretty, compact, serializable, flat]: + + * Pretty: floats are rounded to four decimal places, and lists are shortened. + * Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc. + * Serializable: Output is JSON serializable and not prettified + * Flat: Nested output is returned as a flattened dictionary +* num_quantile_groups (int) + + * You can choose how the data is sampled: the number of quantile groups can range from a minimum of one to a maximum of 1,000. + +.. code-block:: python + + report = profile.report(report_options={"output_format": "pretty"}) + report = profile.report(report_options={"output_format": "compact"}) + report = profile.report(report_options={"output_format": "serializable"}) + report = profile.report(report_options={"output_format": "flat"})
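+ +Putting the options and reporting pieces together, below is a minimal end-to-end sketch. It assumes the usual `dp` import alias, a placeholder CSV path, and that the `ProfilerOptions` attribute paths mirror the option names listed above; treat it as an illustration rather than a prescribed workflow. + +.. code-block:: python + + import json + + import dataprofiler as dp + + # placeholder path; any dataset readable by dp.Data works here + data = dp.Data("your_dataset.csv") + + # start from a preset, then toggle an individual option + # (attribute path assumed to mirror the option tree documented above) + options = dp.ProfilerOptions(presets="numeric_stats_disabled") + options.structured_options.multiprocess.is_enabled = False + + profile = dp.Profiler(data, options=options) + + # "serializable" keeps the report JSON serializable for printing or saving + report = profile.report(report_options={"output_format": "serializable"}) + print(json.dumps(report, indent=4))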
diff --git a/docs/0.12.0/html/_sources/profiler_example.nblink.txt b/docs/0.12.0/html/_sources/profiler_example.nblink.txt new file mode 100644 index 000000000..8b1612784 --- /dev/null +++ b/docs/0.12.0/html/_sources/profiler_example.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/structured_profilers.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/regex_labeler_from_scratch.nblink.txt b/docs/0.12.0/html/_sources/regex_labeler_from_scratch.nblink.txt new file mode 100644 index 000000000..a0d3fe033 --- /dev/null +++ b/docs/0.12.0/html/_sources/regex_labeler_from_scratch.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/regex_labeler_from_scratch/DataLabeler_from_scratch.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_sources/roadmap.rst.txt b/docs/0.12.0/html/_sources/roadmap.rst.txt new file mode 100644 index 000000000..93886690b --- /dev/null +++ b/docs/0.12.0/html/_sources/roadmap.rst.txt @@ -0,0 +1,58 @@ +.. _roadmap: + +Roadmap +******* + +For more detailed tasks, check out the repo's GitHub issues page here: +`Github Issues `_. + + +Data Reader Updates +=================== +- Read data from S3 bucket + - Within the current `dp.Data()` API paradigm, we want to enable passing an S3 bucket file path to read in data from AWS S3. +- Pass list of data file paths to data reader +- Pass in list of data frames to data reader + +New Model +========= +- Transformer model for sensitive data detection + +Historical Profiles +=================== +- Some questions about Historical Profiles / need to step back and rethink the design to start: + - Meta profile on top? + - Stored windowed info inside? Etc... +- Branch with current state of Historical Profiles +- Two example notebooks of current state: + - Notebook example `one `_. + - Notebook example `two `_. + + +Conditional Report Metric +========================= +- Based on what is populated on other metrics in the report, have "secondary" / "derivative" metrics of that number (or that number in conjunction with another number) populate in the report as well. +- For example, if null_count is not None, then populate a null_percent key with the value of (null_count / sample_count). + +Space / Time Testing +==================== +- Automatic comparison testing for space and time analysis on PRs + - Standardize a report for space/time analysis for future comparisons (create baseline numbers) + - Include those in integration tests that will automatically run on code when it is changed in PRs +- Could be an optional test, if the user thinks there is concern that the change degrades library performance + +Testing Suite Upgrades +====================== +- Add mocking to unit tests where mocking is not utilized +- Integration testing separated out from the unit testing suite.
Determine how to only run remotely during PRs +- Backward compatibility testing along with informative warnings and errors when a user is utilizing incompatible versions of the library and saved profile object + +Historical Versions +=================== +- Legacy version upgrades to enable patches to prior versions of the Data Profiler + +Miscellaneous +============== +- Refact/or Pandas to Polars DataFrames +- Spearman correlation calculation +- Workflow Profiles diff --git a/docs/0.12.0/html/_sources/unstructured_profiler_example.nblink.txt b/docs/0.12.0/html/_sources/unstructured_profiler_example.nblink.txt new file mode 100644 index 000000000..1589c41d4 --- /dev/null +++ b/docs/0.12.0/html/_sources/unstructured_profiler_example.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/unstructured_profilers.ipynb" +} \ No newline at end of file diff --git a/docs/0.12.0/html/_static/DataProfilerLogoLightTheme.png b/docs/0.12.0/html/_static/DataProfilerLogoLightTheme.png new file mode 100644 index 000000000..35e59c349 Binary files /dev/null and b/docs/0.12.0/html/_static/DataProfilerLogoLightTheme.png differ diff --git a/docs/0.12.0/html/_static/_sphinx_javascript_frameworks_compat.js b/docs/0.12.0/html/_static/_sphinx_javascript_frameworks_compat.js new file mode 100644 index 000000000..8549469dc --- /dev/null +++ b/docs/0.12.0/html/_static/_sphinx_javascript_frameworks_compat.js @@ -0,0 +1,134 @@ +/* + * _sphinx_javascript_frameworks_compat.js + * ~~~~~~~~~~ + * + * Compatability shim for jQuery and underscores.js. + * + * WILL BE REMOVED IN Sphinx 6.0 + * xref RemovedInSphinx60Warning + * + */ + +/** + * select a different prefix for underscore + */ +$u = _.noConflict(); + + +/** + * small helper function to urldecode strings + * + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL + */ +jQuery.urldecode = function(x) { + if (!x) { + return x + } + return decodeURIComponent(x.replace(/\+/g, ' ')); +}; + +/** + * small helper function to urlencode strings + */ +jQuery.urlencode = encodeURIComponent; + +/** + * This function returns the parsed url parameters of the + * current request. Multiple values per key are supported, + * it will always return arrays of strings for the value parts. + */ +jQuery.getQueryParameters = function(s) { + if (typeof s === 'undefined') + s = document.location.search; + var parts = s.substr(s.indexOf('?') + 1).split('&'); + var result = {}; + for (var i = 0; i < parts.length; i++) { + var tmp = parts[i].split('=', 2); + var key = jQuery.urldecode(tmp[0]); + var value = jQuery.urldecode(tmp[1]); + if (key in result) + result[key].push(value); + else + result[key] = [value]; + } + return result; +}; + +/** + * highlight a given string on a jquery object by wrapping it in + * span elements with the given class name. 
+ */ +jQuery.fn.highlightText = function(text, className) { + function highlight(node, addItems) { + if (node.nodeType === 3) { + var val = node.nodeValue; + var pos = val.toLowerCase().indexOf(text); + if (pos >= 0 && + !jQuery(node.parentNode).hasClass(className) && + !jQuery(node.parentNode).hasClass("nohighlight")) { + var span; + var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.className = className; + } + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + node.parentNode.insertBefore(span, node.parentNode.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling)); + node.nodeValue = val.substr(0, pos); + if (isInSVG) { + var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); + var bbox = node.parentElement.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute('class', className); + addItems.push({ + "parent": node.parentNode, + "target": rect}); + } + } + } + else if (!jQuery(node).is("button, select, textarea")) { + jQuery.each(node.childNodes, function() { + highlight(this, addItems); + }); + } + } + var addItems = []; + var result = this.each(function() { + highlight(this, addItems); + }); + for (var i = 0; i < addItems.length; ++i) { + jQuery(addItems[i].parent).before(addItems[i].target); + } + return result; +}; + +/* + * backward compatibility for jQuery.browser + * This will be supported until firefox bug is fixed. + */ +if (!jQuery.browser) { + jQuery.uaMatch = function(ua) { + ua = ua.toLowerCase(); + + var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || + /(webkit)[ \/]([\w.]+)/.exec(ua) || + /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || + /(msie) ([\w.]+)/.exec(ua) || + ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || + []; + + return { + browser: match[ 1 ] || "", + version: match[ 2 ] || "0" + }; + }; + jQuery.browser = {}; + jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; +} diff --git a/docs/0.12.0/html/_static/basic.css b/docs/0.12.0/html/_static/basic.css new file mode 100644 index 000000000..7243282d2 --- /dev/null +++ b/docs/0.12.0/html/_static/basic.css @@ -0,0 +1,930 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 230px; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + 
+div.body { + min-width: 360px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +a.brackets:before, +span.brackets > a:before{ + content: "["; +} + +a.brackets:after, +span.brackets > a:after { + content: "]"; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + 
border-right: 0; + border-bottom: 1px solid #aaa; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +/* Docutils 0.17 and older (footnotes & citations) */ +dl.footnote > dt, +dl.citation > dt { + float: left; + margin-right: 0.5em; +} + +dl.footnote > dd, +dl.citation > dd { + margin-bottom: 0em; +} + +dl.footnote > dd:after, +dl.citation > dd:after { + content: ""; + clear: both; +} + +/* Docutils 0.18+ (footnotes & citations) */ +aside.footnote > span, +div.citation > span { + float: left; +} +aside.footnote > span:last-of-type, +div.citation > span:last-of-type { + padding-right: 0.5em; +} +aside.footnote > p { + margin-left: 2em; +} +div.citation > p { + margin-left: 4em; +} +aside.footnote > 
p:last-of-type, +div.citation > p:last-of-type { + margin-bottom: 0em; +} +aside.footnote > p:last-of-type:after, +div.citation > p:last-of-type:after { + content: ""; + clear: both; +} + +/* Footnotes & citations ends */ + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; + word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dt:after { + content: ":"; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: 
transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/docs/0.12.0/html/_static/custom.css b/docs/0.12.0/html/_static/custom.css new file mode 100644 index 000000000..8a7c7cb54 --- /dev/null +++ b/docs/0.12.0/html/_static/custom.css @@ -0,0 +1,50 @@ +/* + the ipython3 code blocks coming from the notebooks + were not getting the dark theme styles applied, so + manually overriding them +*/ +@media (prefers-color-scheme: dark) { + .highlight-ipython3 { + border: none !important; + border-radius: 2px !important; + background: #202020 !important; + color: #d0d0d0 !important; + } +} + +@media (prefers-color-scheme: dark) { + tr:nth-child(odd) { + background-color: #202020 !important; + } +} + +@media (prefers-color-scheme: dark) { + .dataframe { + color: white !important; + } +} + +.hidden { + display: none; +} + +.version { + text-align: right; + font-size: 24px; + margin-top: -47px; + margin-right: 3px; +} + +.sidebar-brand { + margin-bottom: -10px; + margin-top: 10px; +} + +/* unknown warning was showing, manually hiding */ +#Visualizing-Logged-Dataframes .admonition.warning { + display: none; +} + +div.output_area.stderr { + display: none; +} diff --git a/docs/0.12.0/html/_static/doctools.js b/docs/0.12.0/html/_static/doctools.js new file mode 100644 index 000000000..c3db08d1c --- /dev/null +++ b/docs/0.12.0/html/_static/doctools.js @@ -0,0 +1,264 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Base JavaScript utilities for all Sphinx HTML documentation. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +const _ready = (callback) => { + if (document.readyState !== "loading") { + callback(); + } else { + document.addEventListener("DOMContentLoaded", callback); + } +}; + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. 
+ */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + parent.insertBefore( + span, + parent.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. + */ +const Documentation = { + init: () => { + Documentation.highlightSearchWords(); + Documentation.initDomainIndexTable(); + Documentation.initOnKeyListeners(); + }, + + /** + * i18n support + */ + TRANSLATIONS: {}, + PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), + LOCALE: "unknown", + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext: (string) => { + const translated = Documentation.TRANSLATIONS[string]; + switch (typeof translated) { + case "undefined": + return string; // no translation + case "string": + return translated; // translation exists + default: + return translated[0]; // (singular, plural) translation tuple exists + } + }, + + ngettext: (singular, plural, n) => { + const translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated !== "undefined") + return translated[Documentation.PLURAL_EXPR(n)]; + return n === 1 ? singular : plural; + }, + + addTranslations: (catalog) => { + Object.assign(Documentation.TRANSLATIONS, catalog.messages); + Documentation.PLURAL_EXPR = new Function( + "n", + `return (${catalog.plural_expr})` + ); + Documentation.LOCALE = catalog.locale; + }, + + /** + * highlight the search words provided in the url in the text + */ + highlightSearchWords: () => { + const highlight = + new URLSearchParams(window.location.search).get("highlight") || ""; + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? 
divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + const url = new URL(window.location); + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + }, + + /** + * helper function to focus on search bar + */ + focusSearchBar: () => { + document.querySelectorAll("input[name=q]")[0]?.focus(); + }, + + /** + * Initialise the domain index toggle buttons + */ + initDomainIndexTable: () => { + const toggler = (el) => { + const idNumber = el.id.substr(7); + const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); + if (el.src.substr(-9) === "minus.png") { + el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; + toggledRows.forEach((el) => (el.style.display = "none")); + } else { + el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; + toggledRows.forEach((el) => (el.style.display = "")); + } + }; + + const togglerElements = document.querySelectorAll("img.toggler"); + togglerElements.forEach((el) => + el.addEventListener("click", (event) => toggler(event.currentTarget)) + ); + togglerElements.forEach((el) => (el.style.display = "")); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); + }, + + initOnKeyListeners: () => { + // only install a listener if it is really needed + if ( + !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS + ) + return; + + const blacklistedElements = new Set([ + "TEXTAREA", + "INPUT", + "SELECT", + "BUTTON", + ]); + document.addEventListener("keydown", (event) => { + if (blacklistedElements.has(document.activeElement.tagName)) return; // bail for input elements + if (event.altKey || event.ctrlKey || event.metaKey) return; // bail with special keys + + if (!event.shiftKey) { + switch (event.key) { + case "ArrowLeft": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const prevLink = document.querySelector('link[rel="prev"]'); + if (prevLink && prevLink.href) { + window.location.href = prevLink.href; + event.preventDefault(); + } + break; + case "ArrowRight": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const nextLink = document.querySelector('link[rel="next"]'); + if (nextLink && nextLink.href) { + window.location.href = nextLink.href; + event.preventDefault(); + } + break; + case "Escape": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.hideSearchWords(); + event.preventDefault(); + } + } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case "/": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.focusSearchBar(); + event.preventDefault(); + } + }); + }, +}; + +// quick alias for translations +const _ = Documentation.gettext; + +_ready(Documentation.init); diff --git a/docs/0.12.0/html/_static/documentation_options.js b/docs/0.12.0/html/_static/documentation_options.js new file mode 100644 index 000000000..a750e4d5e --- /dev/null +++ 
b/docs/0.12.0/html/_static/documentation_options.js @@ -0,0 +1,14 @@ +var DOCUMENTATION_OPTIONS = { + URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), + VERSION: '', + LANGUAGE: 'en', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false, + SHOW_SEARCH_SUMMARY: true, + ENABLE_SEARCH_SHORTCUTS: false, +}; \ No newline at end of file diff --git a/docs/0.12.0/html/_static/file.png b/docs/0.12.0/html/_static/file.png new file mode 100644 index 000000000..a858a410e Binary files /dev/null and b/docs/0.12.0/html/_static/file.png differ diff --git a/docs/0.12.0/html/_static/images/DataProfilerDarkLogoLong.png b/docs/0.12.0/html/_static/images/DataProfilerDarkLogoLong.png new file mode 100644 index 000000000..a339e0f6a Binary files /dev/null and b/docs/0.12.0/html/_static/images/DataProfilerDarkLogoLong.png differ diff --git a/docs/0.12.0/html/_static/images/DataProfilerLogoLightTheme.png b/docs/0.12.0/html/_static/images/DataProfilerLogoLightTheme.png new file mode 100644 index 000000000..35e59c349 Binary files /dev/null and b/docs/0.12.0/html/_static/images/DataProfilerLogoLightTheme.png differ diff --git a/docs/0.12.0/html/_static/images/DataProfilerLogoLightThemeLong.png b/docs/0.12.0/html/_static/images/DataProfilerLogoLightThemeLong.png new file mode 100644 index 000000000..ca86fe167 Binary files /dev/null and b/docs/0.12.0/html/_static/images/DataProfilerLogoLightThemeLong.png differ diff --git a/docs/0.12.0/html/_static/images/branching_workflow_diagram.png b/docs/0.12.0/html/_static/images/branching_workflow_diagram.png new file mode 100644 index 000000000..60a9515d0 Binary files /dev/null and b/docs/0.12.0/html/_static/images/branching_workflow_diagram.png differ diff --git a/docs/0.12.0/html/_static/images/histogram_example_0.png b/docs/0.12.0/html/_static/images/histogram_example_0.png new file mode 100644 index 000000000..9b8301363 Binary files /dev/null and b/docs/0.12.0/html/_static/images/histogram_example_0.png differ diff --git a/docs/0.12.0/html/_static/images/histogram_example_1.png b/docs/0.12.0/html/_static/images/histogram_example_1.png new file mode 100644 index 000000000..062dfdbb9 Binary files /dev/null and b/docs/0.12.0/html/_static/images/histogram_example_1.png differ diff --git a/docs/0.12.0/html/_static/images/histogram_example_2.png b/docs/0.12.0/html/_static/images/histogram_example_2.png new file mode 100644 index 000000000..1aedf7549 Binary files /dev/null and b/docs/0.12.0/html/_static/images/histogram_example_2.png differ diff --git a/docs/0.12.0/html/_static/images/missing_value_barchart_example_0.png b/docs/0.12.0/html/_static/images/missing_value_barchart_example_0.png new file mode 100644 index 000000000..33cb7afd2 Binary files /dev/null and b/docs/0.12.0/html/_static/images/missing_value_barchart_example_0.png differ diff --git a/docs/0.12.0/html/_static/images/missing_value_matrix_example_0.png b/docs/0.12.0/html/_static/images/missing_value_matrix_example_0.png new file mode 100644 index 000000000..21799cddf Binary files /dev/null and b/docs/0.12.0/html/_static/images/missing_value_matrix_example_0.png differ diff --git a/docs/0.12.0/html/_static/jquery-3.6.0.js b/docs/0.12.0/html/_static/jquery-3.6.0.js new file mode 100644 index 000000000..fc6c299b7 --- /dev/null +++ b/docs/0.12.0/html/_static/jquery-3.6.0.js @@ -0,0 +1,10881 @@ +/*! 
+ * jQuery JavaScript Library v3.6.0 + * https://jquery.com/ + * + * Includes Sizzle.js + * https://sizzlejs.com/ + * + * Copyright OpenJS Foundation and other contributors + * Released under the MIT license + * https://jquery.org/license + * + * Date: 2021-03-02T17:08Z + */ +( function( global, factory ) { + + "use strict"; + + if ( typeof module === "object" && typeof module.exports === "object" ) { + + // For CommonJS and CommonJS-like environments where a proper `window` + // is present, execute the factory and get jQuery. + // For environments that do not have a `window` with a `document` + // (such as Node.js), expose a factory as module.exports. + // This accentuates the need for the creation of a real `window`. + // e.g. var jQuery = require("jquery")(window); + // See ticket #14549 for more info. + module.exports = global.document ? + factory( global, true ) : + function( w ) { + if ( !w.document ) { + throw new Error( "jQuery requires a window with a document" ); + } + return factory( w ); + }; + } else { + factory( global ); + } + +// Pass this if window is not defined yet +} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { + +// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 +// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode +// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common +// enough that all such attempts are guarded in a try block. +"use strict"; + +var arr = []; + +var getProto = Object.getPrototypeOf; + +var slice = arr.slice; + +var flat = arr.flat ? function( array ) { + return arr.flat.call( array ); +} : function( array ) { + return arr.concat.apply( [], array ); +}; + + +var push = arr.push; + +var indexOf = arr.indexOf; + +var class2type = {}; + +var toString = class2type.toString; + +var hasOwn = class2type.hasOwnProperty; + +var fnToString = hasOwn.toString; + +var ObjectFunctionString = fnToString.call( Object ); + +var support = {}; + +var isFunction = function isFunction( obj ) { + + // Support: Chrome <=57, Firefox <=52 + // In some browsers, typeof returns "function" for HTML elements + // (i.e., `typeof document.createElement( "object" ) === "function"`). + // We don't want to classify *any* DOM node as a function. + // Support: QtWeb <=3.8.5, WebKit <=534.34, wkhtmltopdf tool <=0.12.5 + // Plus for old WebKit, typeof returns "function" for HTML collections + // (e.g., `typeof document.getElementsByTagName("div") === "function"`). (gh-4756) + return typeof obj === "function" && typeof obj.nodeType !== "number" && + typeof obj.item !== "function"; + }; + + +var isWindow = function isWindow( obj ) { + return obj != null && obj === obj.window; + }; + + +var document = window.document; + + + + var preservedScriptAttributes = { + type: true, + src: true, + nonce: true, + noModule: true + }; + + function DOMEval( code, node, doc ) { + doc = doc || document; + + var i, val, + script = doc.createElement( "script" ); + + script.text = code; + if ( node ) { + for ( i in preservedScriptAttributes ) { + + // Support: Firefox 64+, Edge 18+ + // Some browsers don't support the "nonce" property on scripts. + // On the other hand, just using `getAttribute` is not enough as + // the `nonce` attribute is reset to an empty string whenever it + // becomes browsing-context connected. 
+ // See https://github.com/whatwg/html/issues/2369 + // See https://html.spec.whatwg.org/#nonce-attributes + // The `node.getAttribute` check was added for the sake of + // `jQuery.globalEval` so that it can fake a nonce-containing node + // via an object. + val = node[ i ] || node.getAttribute && node.getAttribute( i ); + if ( val ) { + script.setAttribute( i, val ); + } + } + } + doc.head.appendChild( script ).parentNode.removeChild( script ); + } + + +function toType( obj ) { + if ( obj == null ) { + return obj + ""; + } + + // Support: Android <=2.3 only (functionish RegExp) + return typeof obj === "object" || typeof obj === "function" ? + class2type[ toString.call( obj ) ] || "object" : + typeof obj; +} +/* global Symbol */ +// Defining this global in .eslintrc.json would create a danger of using the global +// unguarded in another place, it seems safer to define global only for this module + + + +var + version = "3.6.0", + + // Define a local copy of jQuery + jQuery = function( selector, context ) { + + // The jQuery object is actually just the init constructor 'enhanced' + // Need init if jQuery is called (just allow error to be thrown if not included) + return new jQuery.fn.init( selector, context ); + }; + +jQuery.fn = jQuery.prototype = { + + // The current version of jQuery being used + jquery: version, + + constructor: jQuery, + + // The default length of a jQuery object is 0 + length: 0, + + toArray: function() { + return slice.call( this ); + }, + + // Get the Nth element in the matched element set OR + // Get the whole matched element set as a clean array + get: function( num ) { + + // Return all the elements in a clean array + if ( num == null ) { + return slice.call( this ); + } + + // Return just the one element from the set + return num < 0 ? this[ num + this.length ] : this[ num ]; + }, + + // Take an array of elements and push it onto the stack + // (returning the new matched element set) + pushStack: function( elems ) { + + // Build a new jQuery matched element set + var ret = jQuery.merge( this.constructor(), elems ); + + // Add the old object onto the stack (as a reference) + ret.prevObject = this; + + // Return the newly-formed element set + return ret; + }, + + // Execute a callback for every element in the matched set. + each: function( callback ) { + return jQuery.each( this, callback ); + }, + + map: function( callback ) { + return this.pushStack( jQuery.map( this, function( elem, i ) { + return callback.call( elem, i, elem ); + } ) ); + }, + + slice: function() { + return this.pushStack( slice.apply( this, arguments ) ); + }, + + first: function() { + return this.eq( 0 ); + }, + + last: function() { + return this.eq( -1 ); + }, + + even: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return ( i + 1 ) % 2; + } ) ); + }, + + odd: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return i % 2; + } ) ); + }, + + eq: function( i ) { + var len = this.length, + j = +i + ( i < 0 ? len : 0 ); + return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); + }, + + end: function() { + return this.prevObject || this.constructor(); + }, + + // For internal use only. + // Behaves like an Array's method, not like a jQuery method. 
+ push: push, + sort: arr.sort, + splice: arr.splice +}; + +jQuery.extend = jQuery.fn.extend = function() { + var options, name, src, copy, copyIsArray, clone, + target = arguments[ 0 ] || {}, + i = 1, + length = arguments.length, + deep = false; + + // Handle a deep copy situation + if ( typeof target === "boolean" ) { + deep = target; + + // Skip the boolean and the target + target = arguments[ i ] || {}; + i++; + } + + // Handle case when target is a string or something (possible in deep copy) + if ( typeof target !== "object" && !isFunction( target ) ) { + target = {}; + } + + // Extend jQuery itself if only one argument is passed + if ( i === length ) { + target = this; + i--; + } + + for ( ; i < length; i++ ) { + + // Only deal with non-null/undefined values + if ( ( options = arguments[ i ] ) != null ) { + + // Extend the base object + for ( name in options ) { + copy = options[ name ]; + + // Prevent Object.prototype pollution + // Prevent never-ending loop + if ( name === "__proto__" || target === copy ) { + continue; + } + + // Recurse if we're merging plain objects or arrays + if ( deep && copy && ( jQuery.isPlainObject( copy ) || + ( copyIsArray = Array.isArray( copy ) ) ) ) { + src = target[ name ]; + + // Ensure proper type for the source value + if ( copyIsArray && !Array.isArray( src ) ) { + clone = []; + } else if ( !copyIsArray && !jQuery.isPlainObject( src ) ) { + clone = {}; + } else { + clone = src; + } + copyIsArray = false; + + // Never move original objects, clone them + target[ name ] = jQuery.extend( deep, clone, copy ); + + // Don't bring in undefined values + } else if ( copy !== undefined ) { + target[ name ] = copy; + } + } + } + } + + // Return the modified object + return target; +}; + +jQuery.extend( { + + // Unique for each copy of jQuery on the page + expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), + + // Assume jQuery is ready without the ready module + isReady: true, + + error: function( msg ) { + throw new Error( msg ); + }, + + noop: function() {}, + + isPlainObject: function( obj ) { + var proto, Ctor; + + // Detect obvious negatives + // Use toString instead of jQuery.type to catch host objects + if ( !obj || toString.call( obj ) !== "[object Object]" ) { + return false; + } + + proto = getProto( obj ); + + // Objects with no prototype (e.g., `Object.create( null )`) are plain + if ( !proto ) { + return true; + } + + // Objects with prototype are plain iff they were constructed by a global Object function + Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; + return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; + }, + + isEmptyObject: function( obj ) { + var name; + + for ( name in obj ) { + return false; + } + return true; + }, + + // Evaluates a script in a provided context; falls back to the global one + // if not specified. 
+ globalEval: function( code, options, doc ) { + DOMEval( code, { nonce: options && options.nonce }, doc ); + }, + + each: function( obj, callback ) { + var length, i = 0; + + if ( isArrayLike( obj ) ) { + length = obj.length; + for ( ; i < length; i++ ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } else { + for ( i in obj ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } + + return obj; + }, + + // results is for internal usage only + makeArray: function( arr, results ) { + var ret = results || []; + + if ( arr != null ) { + if ( isArrayLike( Object( arr ) ) ) { + jQuery.merge( ret, + typeof arr === "string" ? + [ arr ] : arr + ); + } else { + push.call( ret, arr ); + } + } + + return ret; + }, + + inArray: function( elem, arr, i ) { + return arr == null ? -1 : indexOf.call( arr, elem, i ); + }, + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + merge: function( first, second ) { + var len = +second.length, + j = 0, + i = first.length; + + for ( ; j < len; j++ ) { + first[ i++ ] = second[ j ]; + } + + first.length = i; + + return first; + }, + + grep: function( elems, callback, invert ) { + var callbackInverse, + matches = [], + i = 0, + length = elems.length, + callbackExpect = !invert; + + // Go through the array, only saving the items + // that pass the validator function + for ( ; i < length; i++ ) { + callbackInverse = !callback( elems[ i ], i ); + if ( callbackInverse !== callbackExpect ) { + matches.push( elems[ i ] ); + } + } + + return matches; + }, + + // arg is for internal usage only + map: function( elems, callback, arg ) { + var length, value, + i = 0, + ret = []; + + // Go through the array, translating each of the items to their new values + if ( isArrayLike( elems ) ) { + length = elems.length; + for ( ; i < length; i++ ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + + // Go through every key on the object, + } else { + for ( i in elems ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + } + + // Flatten any nested arrays + return flat( ret ); + }, + + // A global GUID counter for objects + guid: 1, + + // jQuery.support is not used in Core but other projects attach their + // properties to it so it needs to exist. + support: support +} ); + +if ( typeof Symbol === "function" ) { + jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; +} + +// Populate the class2type map +jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), + function( _i, name ) { + class2type[ "[object " + name + "]" ] = name.toLowerCase(); + } ); + +function isArrayLike( obj ) { + + // Support: real iOS 8.2 only (not reproducible in simulator) + // `in` check used to prevent JIT error (gh-2145) + // hasOwn isn't used here due to false negatives + // regarding Nodelist length in IE + var length = !!obj && "length" in obj && obj.length, + type = toType( obj ); + + if ( isFunction( obj ) || isWindow( obj ) ) { + return false; + } + + return type === "array" || length === 0 || + typeof length === "number" && length > 0 && ( length - 1 ) in obj; +} +var Sizzle = +/*! 
+ * Sizzle CSS Selector Engine v2.3.6 + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://js.foundation/ + * + * Date: 2021-02-16 + */ +( function( window ) { +var i, + support, + Expr, + getText, + isXML, + tokenize, + compile, + select, + outermostContext, + sortInput, + hasDuplicate, + + // Local document vars + setDocument, + document, + docElem, + documentIsHTML, + rbuggyQSA, + rbuggyMatches, + matches, + contains, + + // Instance-specific data + expando = "sizzle" + 1 * new Date(), + preferredDoc = window.document, + dirruns = 0, + done = 0, + classCache = createCache(), + tokenCache = createCache(), + compilerCache = createCache(), + nonnativeSelectorCache = createCache(), + sortOrder = function( a, b ) { + if ( a === b ) { + hasDuplicate = true; + } + return 0; + }, + + // Instance methods + hasOwn = ( {} ).hasOwnProperty, + arr = [], + pop = arr.pop, + pushNative = arr.push, + push = arr.push, + slice = arr.slice, + + // Use a stripped-down indexOf as it's faster than native + // https://jsperf.com/thor-indexof-vs-for/5 + indexOf = function( list, elem ) { + var i = 0, + len = list.length; + for ( ; i < len; i++ ) { + if ( list[ i ] === elem ) { + return i; + } + } + return -1; + }, + + booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|" + + "ismap|loop|multiple|open|readonly|required|scoped", + + // Regular expressions + + // http://www.w3.org/TR/css3-selectors/#whitespace + whitespace = "[\\x20\\t\\r\\n\\f]", + + // https://www.w3.org/TR/css-syntax-3/#ident-token-diagram + identifier = "(?:\\\\[\\da-fA-F]{1,6}" + whitespace + + "?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+", + + // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors + attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + + + // Operator (capture 2) + "*([*^$|!~]?=)" + whitespace + + + // "Attribute values must be CSS identifiers [capture 5] + // or strings [capture 3 or capture 4]" + "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + + whitespace + "*\\]", + + pseudos = ":(" + identifier + ")(?:\\((" + + + // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: + // 1. quoted (capture 3; capture 4 or capture 5) + "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + + + // 2. simple (capture 6) + "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + + + // 3. 
anything else (capture 2) + ".*" + + ")\\)|)", + + // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter + rwhitespace = new RegExp( whitespace + "+", "g" ), + rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + + whitespace + "+$", "g" ), + + rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), + rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + + "*" ), + rdescend = new RegExp( whitespace + "|>" ), + + rpseudo = new RegExp( pseudos ), + ridentifier = new RegExp( "^" + identifier + "$" ), + + matchExpr = { + "ID": new RegExp( "^#(" + identifier + ")" ), + "CLASS": new RegExp( "^\\.(" + identifier + ")" ), + "TAG": new RegExp( "^(" + identifier + "|[*])" ), + "ATTR": new RegExp( "^" + attributes ), + "PSEUDO": new RegExp( "^" + pseudos ), + "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + + whitespace + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + + whitespace + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), + "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), + + // For use in libraries implementing .is() + // We use this for POS matching in `select` + "needsContext": new RegExp( "^" + whitespace + + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + whitespace + + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) + }, + + rhtml = /HTML$/i, + rinputs = /^(?:input|select|textarea|button)$/i, + rheader = /^h\d$/i, + + rnative = /^[^{]+\{\s*\[native \w/, + + // Easily-parseable/retrievable ID or TAG or CLASS selectors + rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, + + rsibling = /[+~]/, + + // CSS escapes + // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters + runescape = new RegExp( "\\\\[\\da-fA-F]{1,6}" + whitespace + "?|\\\\([^\\r\\n\\f])", "g" ), + funescape = function( escape, nonHex ) { + var high = "0x" + escape.slice( 1 ) - 0x10000; + + return nonHex ? + + // Strip the backslash prefix from a non-hex escape sequence + nonHex : + + // Replace a hexadecimal escape sequence with the encoded Unicode code point + // Support: IE <=11+ + // For values outside the Basic Multilingual Plane (BMP), manually construct a + // surrogate pair + high < 0 ? 
+ String.fromCharCode( high + 0x10000 ) : + String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); + }, + + // CSS string/identifier serialization + // https://drafts.csswg.org/cssom/#common-serializing-idioms + rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, + fcssescape = function( ch, asCodePoint ) { + if ( asCodePoint ) { + + // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER + if ( ch === "\0" ) { + return "\uFFFD"; + } + + // Control characters and (dependent upon position) numbers get escaped as code points + return ch.slice( 0, -1 ) + "\\" + + ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; + } + + // Other potentially-special ASCII characters get backslash-escaped + return "\\" + ch; + }, + + // Used for iframes + // See setDocument() + // Removing the function wrapper causes a "Permission Denied" + // error in IE + unloadHandler = function() { + setDocument(); + }, + + inDisabledFieldset = addCombinator( + function( elem ) { + return elem.disabled === true && elem.nodeName.toLowerCase() === "fieldset"; + }, + { dir: "parentNode", next: "legend" } + ); + +// Optimize for push.apply( _, NodeList ) +try { + push.apply( + ( arr = slice.call( preferredDoc.childNodes ) ), + preferredDoc.childNodes + ); + + // Support: Android<4.0 + // Detect silently failing push.apply + // eslint-disable-next-line no-unused-expressions + arr[ preferredDoc.childNodes.length ].nodeType; +} catch ( e ) { + push = { apply: arr.length ? + + // Leverage slice if possible + function( target, els ) { + pushNative.apply( target, slice.call( els ) ); + } : + + // Support: IE<9 + // Otherwise append directly + function( target, els ) { + var j = target.length, + i = 0; + + // Can't trust NodeList.length + while ( ( target[ j++ ] = els[ i++ ] ) ) {} + target.length = j - 1; + } + }; +} + +function Sizzle( selector, context, results, seed ) { + var m, i, elem, nid, match, groups, newSelector, + newContext = context && context.ownerDocument, + + // nodeType defaults to 9, since context defaults to document + nodeType = context ? 
context.nodeType : 9; + + results = results || []; + + // Return early from calls with invalid selector or context + if ( typeof selector !== "string" || !selector || + nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { + + return results; + } + + // Try to shortcut find operations (as opposed to filters) in HTML documents + if ( !seed ) { + setDocument( context ); + context = context || document; + + if ( documentIsHTML ) { + + // If the selector is sufficiently simple, try using a "get*By*" DOM method + // (excepting DocumentFragment context, where the methods don't exist) + if ( nodeType !== 11 && ( match = rquickExpr.exec( selector ) ) ) { + + // ID selector + if ( ( m = match[ 1 ] ) ) { + + // Document context + if ( nodeType === 9 ) { + if ( ( elem = context.getElementById( m ) ) ) { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( elem.id === m ) { + results.push( elem ); + return results; + } + } else { + return results; + } + + // Element context + } else { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( newContext && ( elem = newContext.getElementById( m ) ) && + contains( context, elem ) && + elem.id === m ) { + + results.push( elem ); + return results; + } + } + + // Type selector + } else if ( match[ 2 ] ) { + push.apply( results, context.getElementsByTagName( selector ) ); + return results; + + // Class selector + } else if ( ( m = match[ 3 ] ) && support.getElementsByClassName && + context.getElementsByClassName ) { + + push.apply( results, context.getElementsByClassName( m ) ); + return results; + } + } + + // Take advantage of querySelectorAll + if ( support.qsa && + !nonnativeSelectorCache[ selector + " " ] && + ( !rbuggyQSA || !rbuggyQSA.test( selector ) ) && + + // Support: IE 8 only + // Exclude object elements + ( nodeType !== 1 || context.nodeName.toLowerCase() !== "object" ) ) { + + newSelector = selector; + newContext = context; + + // qSA considers elements outside a scoping root when evaluating child or + // descendant combinators, which is not what we want. + // In such cases, we work around the behavior by prefixing every selector in the + // list with an ID selector referencing the scope context. + // The technique has to be used as well when a leading combinator is used + // as such selectors are not recognized by querySelectorAll. + // Thanks to Andrew Dupont for this technique. + if ( nodeType === 1 && + ( rdescend.test( selector ) || rcombinators.test( selector ) ) ) { + + // Expand context for sibling selectors + newContext = rsibling.test( selector ) && testContext( context.parentNode ) || + context; + + // We can use :scope instead of the ID hack if the browser + // supports it & if we're not changing the context. + if ( newContext !== context || !support.scope ) { + + // Capture the context ID, setting it first if necessary + if ( ( nid = context.getAttribute( "id" ) ) ) { + nid = nid.replace( rcssescape, fcssescape ); + } else { + context.setAttribute( "id", ( nid = expando ) ); + } + } + + // Prefix every selector in the list + groups = tokenize( selector ); + i = groups.length; + while ( i-- ) { + groups[ i ] = ( nid ? 
"#" + nid : ":scope" ) + " " + + toSelector( groups[ i ] ); + } + newSelector = groups.join( "," ); + } + + try { + push.apply( results, + newContext.querySelectorAll( newSelector ) + ); + return results; + } catch ( qsaError ) { + nonnativeSelectorCache( selector, true ); + } finally { + if ( nid === expando ) { + context.removeAttribute( "id" ); + } + } + } + } + } + + // All others + return select( selector.replace( rtrim, "$1" ), context, results, seed ); +} + +/** + * Create key-value caches of limited size + * @returns {function(string, object)} Returns the Object data after storing it on itself with + * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) + * deleting the oldest entry + */ +function createCache() { + var keys = []; + + function cache( key, value ) { + + // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) + if ( keys.push( key + " " ) > Expr.cacheLength ) { + + // Only keep the most recent entries + delete cache[ keys.shift() ]; + } + return ( cache[ key + " " ] = value ); + } + return cache; +} + +/** + * Mark a function for special use by Sizzle + * @param {Function} fn The function to mark + */ +function markFunction( fn ) { + fn[ expando ] = true; + return fn; +} + +/** + * Support testing using an element + * @param {Function} fn Passed the created element and returns a boolean result + */ +function assert( fn ) { + var el = document.createElement( "fieldset" ); + + try { + return !!fn( el ); + } catch ( e ) { + return false; + } finally { + + // Remove from its parent by default + if ( el.parentNode ) { + el.parentNode.removeChild( el ); + } + + // release memory in IE + el = null; + } +} + +/** + * Adds the same handler for all of the specified attrs + * @param {String} attrs Pipe-separated list of attributes + * @param {Function} handler The method that will be applied + */ +function addHandle( attrs, handler ) { + var arr = attrs.split( "|" ), + i = arr.length; + + while ( i-- ) { + Expr.attrHandle[ arr[ i ] ] = handler; + } +} + +/** + * Checks document order of two siblings + * @param {Element} a + * @param {Element} b + * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b + */ +function siblingCheck( a, b ) { + var cur = b && a, + diff = cur && a.nodeType === 1 && b.nodeType === 1 && + a.sourceIndex - b.sourceIndex; + + // Use IE sourceIndex if available on both nodes + if ( diff ) { + return diff; + } + + // Check if b follows a + if ( cur ) { + while ( ( cur = cur.nextSibling ) ) { + if ( cur === b ) { + return -1; + } + } + } + + return a ? 
1 : -1; +} + +/** + * Returns a function to use in pseudos for input types + * @param {String} type + */ +function createInputPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for buttons + * @param {String} type + */ +function createButtonPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return ( name === "input" || name === "button" ) && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for :enabled/:disabled + * @param {Boolean} disabled true for :disabled; false for :enabled + */ +function createDisabledPseudo( disabled ) { + + // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable + return function( elem ) { + + // Only certain elements can match :enabled or :disabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled + if ( "form" in elem ) { + + // Check for inherited disabledness on relevant non-disabled elements: + // * listed form-associated elements in a disabled fieldset + // https://html.spec.whatwg.org/multipage/forms.html#category-listed + // https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled + // * option elements in a disabled optgroup + // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled + // All such elements have a "form" property. + if ( elem.parentNode && elem.disabled === false ) { + + // Option elements defer to a parent optgroup if present + if ( "label" in elem ) { + if ( "label" in elem.parentNode ) { + return elem.parentNode.disabled === disabled; + } else { + return elem.disabled === disabled; + } + } + + // Support: IE 6 - 11 + // Use the isDisabled shortcut property to check for disabled fieldset ancestors + return elem.isDisabled === disabled || + + // Where there is no isDisabled, check manually + /* jshint -W018 */ + elem.isDisabled !== !disabled && + inDisabledFieldset( elem ) === disabled; + } + + return elem.disabled === disabled; + + // Try to winnow out elements that can't be disabled before trusting the disabled property. + // Some victims get caught in our net (label, legend, menu, track), but it shouldn't + // even exist on them, let alone have a boolean value. 
+ } else if ( "label" in elem ) { + return elem.disabled === disabled; + } + + // Remaining elements are neither :enabled nor :disabled + return false; + }; +} + +/** + * Returns a function to use in pseudos for positionals + * @param {Function} fn + */ +function createPositionalPseudo( fn ) { + return markFunction( function( argument ) { + argument = +argument; + return markFunction( function( seed, matches ) { + var j, + matchIndexes = fn( [], seed.length, argument ), + i = matchIndexes.length; + + // Match elements found at the specified indexes + while ( i-- ) { + if ( seed[ ( j = matchIndexes[ i ] ) ] ) { + seed[ j ] = !( matches[ j ] = seed[ j ] ); + } + } + } ); + } ); +} + +/** + * Checks a node for validity as a Sizzle context + * @param {Element|Object=} context + * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value + */ +function testContext( context ) { + return context && typeof context.getElementsByTagName !== "undefined" && context; +} + +// Expose support vars for convenience +support = Sizzle.support = {}; + +/** + * Detects XML nodes + * @param {Element|Object} elem An element or a document + * @returns {Boolean} True iff elem is a non-HTML XML node + */ +isXML = Sizzle.isXML = function( elem ) { + var namespace = elem && elem.namespaceURI, + docElem = elem && ( elem.ownerDocument || elem ).documentElement; + + // Support: IE <=8 + // Assume HTML when documentElement doesn't yet exist, such as inside loading iframes + // https://bugs.jquery.com/ticket/4833 + return !rhtml.test( namespace || docElem && docElem.nodeName || "HTML" ); +}; + +/** + * Sets document-related variables once based on the current document + * @param {Element|Object} [doc] An element or document object to use to set the document + * @returns {Object} Returns the current document + */ +setDocument = Sizzle.setDocument = function( node ) { + var hasCompare, subWindow, + doc = node ? node.ownerDocument || node : preferredDoc; + + // Return early if doc is invalid or already selected + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( doc == document || doc.nodeType !== 9 || !doc.documentElement ) { + return document; + } + + // Update global variables + document = doc; + docElem = document.documentElement; + documentIsHTML = !isXML( document ); + + // Support: IE 9 - 11+, Edge 12 - 18+ + // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( preferredDoc != document && + ( subWindow = document.defaultView ) && subWindow.top !== subWindow ) { + + // Support: IE 11, Edge + if ( subWindow.addEventListener ) { + subWindow.addEventListener( "unload", unloadHandler, false ); + + // Support: IE 9 - 10 only + } else if ( subWindow.attachEvent ) { + subWindow.attachEvent( "onunload", unloadHandler ); + } + } + + // Support: IE 8 - 11+, Edge 12 - 18+, Chrome <=16 - 25 only, Firefox <=3.6 - 31 only, + // Safari 4 - 5 only, Opera <=11.6 - 12.x only + // IE/Edge & older browsers don't support the :scope pseudo-class. + // Support: Safari 6.0 only + // Safari 6.0 supports :scope but it's an alias of :root there. 
+ support.scope = assert( function( el ) { + docElem.appendChild( el ).appendChild( document.createElement( "div" ) ); + return typeof el.querySelectorAll !== "undefined" && + !el.querySelectorAll( ":scope fieldset div" ).length; + } ); + + /* Attributes + ---------------------------------------------------------------------- */ + + // Support: IE<8 + // Verify that getAttribute really returns attributes and not properties + // (excepting IE8 booleans) + support.attributes = assert( function( el ) { + el.className = "i"; + return !el.getAttribute( "className" ); + } ); + + /* getElement(s)By* + ---------------------------------------------------------------------- */ + + // Check if getElementsByTagName("*") returns only elements + support.getElementsByTagName = assert( function( el ) { + el.appendChild( document.createComment( "" ) ); + return !el.getElementsByTagName( "*" ).length; + } ); + + // Support: IE<9 + support.getElementsByClassName = rnative.test( document.getElementsByClassName ); + + // Support: IE<10 + // Check if getElementById returns elements by name + // The broken getElementById methods don't pick up programmatically-set names, + // so use a roundabout getElementsByName test + support.getById = assert( function( el ) { + docElem.appendChild( el ).id = expando; + return !document.getElementsByName || !document.getElementsByName( expando ).length; + } ); + + // ID filter and find + if ( support.getById ) { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + return elem.getAttribute( "id" ) === attrId; + }; + }; + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var elem = context.getElementById( id ); + return elem ? [ elem ] : []; + } + }; + } else { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + var node = typeof elem.getAttributeNode !== "undefined" && + elem.getAttributeNode( "id" ); + return node && node.value === attrId; + }; + }; + + // Support: IE 6 - 7 only + // getElementById is not reliable as a find shortcut + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var node, i, elems, + elem = context.getElementById( id ); + + if ( elem ) { + + // Verify the id attribute + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + + // Fall back on getElementsByName + elems = context.getElementsByName( id ); + i = 0; + while ( ( elem = elems[ i++ ] ) ) { + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + } + } + + return []; + } + }; + } + + // Tag + Expr.find[ "TAG" ] = support.getElementsByTagName ? 
+ function( tag, context ) {
+ if ( typeof context.getElementsByTagName !== "undefined" ) {
+ return context.getElementsByTagName( tag );
+
+ // DocumentFragment nodes don't have gEBTN
+ } else if ( support.qsa ) {
+ return context.querySelectorAll( tag );
+ }
+ } :
+
+ function( tag, context ) {
+ var elem,
+ tmp = [],
+ i = 0,
+
+ // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too
+ results = context.getElementsByTagName( tag );
+
+ // Filter out possible comments
+ if ( tag === "*" ) {
+ while ( ( elem = results[ i++ ] ) ) {
+ if ( elem.nodeType === 1 ) {
+ tmp.push( elem );
+ }
+ }
+
+ return tmp;
+ }
+ return results;
+ };
+
+ // Class
+ Expr.find[ "CLASS" ] = support.getElementsByClassName && function( className, context ) {
+ if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) {
+ return context.getElementsByClassName( className );
+ }
+ };
+
+ /* QSA/matchesSelector
+ ---------------------------------------------------------------------- */
+
+ // QSA and matchesSelector support
+
+ // matchesSelector(:active) reports false when true (IE9/Opera 11.5)
+ rbuggyMatches = [];
+
+ // qSa(:focus) reports false when true (Chrome 21)
+ // We allow this because of a bug in IE8/9 that throws an error
+ // whenever `document.activeElement` is accessed on an iframe
+ // So, we allow :focus to pass through QSA all the time to avoid the IE error
+ // See https://bugs.jquery.com/ticket/13378
+ rbuggyQSA = [];
+
+ if ( ( support.qsa = rnative.test( document.querySelectorAll ) ) ) {
+
+ // Build QSA regex
+ // Regex strategy adopted from Diego Perini
+ assert( function( el ) {
+
+ var input;
+
+ // Select is set to empty string on purpose
+ // This is to test IE's treatment of not explicitly
+ // setting a boolean content attribute,
+ // since its presence should be enough
+ // https://bugs.jquery.com/ticket/12359
+ docElem.appendChild( el ).innerHTML = "<a id='" + expando + "'></a>" +
+ "<select id='" + expando + "-\r\\' msallowcapture=''>" +
+ "<option selected=''></option></select>";
+
+ // Support: IE8, Opera 11-12.16
+ // Nothing should be selected when empty strings follow ^= or $= or *=
+ // The test attribute must be unknown in Opera but "safe" for WinRT
+ // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section
+ if ( el.querySelectorAll( "[msallowcapture^='']" ).length ) {
+ rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" );
+ }
+
+ // Support: IE8
+ // Boolean attributes and "value" are not treated correctly
+ if ( !el.querySelectorAll( "[selected]" ).length ) {
+ rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" );
+ }
+
+ // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+
+ if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) {
+ rbuggyQSA.push( "~=" );
+ }
+
+ // Support: IE 11+, Edge 15 - 18+
+ // IE 11/Edge don't find elements on a `[name='']` query in some cases.
+ // Adding a temporary attribute to the document before the selection works
+ // around the issue.
+ // Interestingly, IE 10 & older don't seem to have the issue.
+ input = document.createElement( "input" );
+ input.setAttribute( "name", "" );
+ el.appendChild( input );
+ if ( !el.querySelectorAll( "[name='']" ).length ) {
+ rbuggyQSA.push( "\\[" + whitespace + "*name" + whitespace + "*=" +
+ whitespace + "*(?:''|\"\")" );
+ }
+
+ // Webkit/Opera - :checked should return selected option elements
+ // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked
+ // IE8 throws error here and will not see later tests
+ if ( !el.querySelectorAll( ":checked" ).length ) {
+ rbuggyQSA.push( ":checked" );
+ }
+
+ // Support: Safari 8+, iOS 8+
+ // https://bugs.webkit.org/show_bug.cgi?id=136851
+ // In-page `selector#id sibling-combinator selector` fails
+ if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) {
+ rbuggyQSA.push( ".#.+[+~]" );
+ }
+
+ // Support: Firefox <=3.6 - 5 only
+ // Old Firefox doesn't throw on a badly-escaped identifier.
+ el.querySelectorAll( "\\\f" );
+ rbuggyQSA.push( "[\\r\\n\\f]" );
+ } );
+
+ assert( function( el ) {
+ el.innerHTML = "<a href='' disabled='disabled'></a>" +
+ "<select disabled='disabled'><option/></select>";
+
+ // Support: Windows 8 Native Apps
+ // The type and name attributes are restricted during .innerHTML assignment
+ var input = document.createElement( "input" );
+ input.setAttribute( "type", "hidden" );
+ el.appendChild( input ).setAttribute( "name", "D" );
+
+ // Support: IE8
+ // Enforce case-sensitivity of name attribute
+ if ( el.querySelectorAll( "[name=d]" ).length ) {
+ rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" );
+ }
+
+ // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled)
+ // IE8 throws error here and will not see later tests
+ if ( el.querySelectorAll( ":enabled" ).length !== 2 ) {
+ rbuggyQSA.push( ":enabled", ":disabled" );
+ }
+
+ // Support: IE9-11+
+ // IE's :disabled selector does not pick up the children of disabled fieldsets
+ docElem.appendChild( el ).disabled = true;
+ if ( el.querySelectorAll( ":disabled" ).length !== 2 ) {
+ rbuggyQSA.push( ":enabled", ":disabled" );
+ }
+
+ // Support: Opera 10 - 11 only
+ // Opera 10-11 does not throw on post-comma invalid pseudos
+ el.querySelectorAll( "*,:x" );
+ rbuggyQSA.push( ",.*:" );
+ } );
+ }
+
+ if ( ( support.matchesSelector = rnative.test( ( matches = docElem.matches ||
+ docElem.webkitMatchesSelector ||
+ docElem.mozMatchesSelector ||
+ docElem.oMatchesSelector ||
+ docElem.msMatchesSelector ) ) ) ) {
+
+ assert( function( el ) {
+
+ // Check to see if it's possible to do matchesSelector
+ // on a disconnected node (IE 9)
+ support.disconnectedMatch = matches.call( el, "*" );
+
+ // This should fail with an exception
+ // Gecko does not error, returns false instead
+ matches.call( el, "[s!='']:x" );
+ rbuggyMatches.push( "!=", pseudos );
+ } );
+ }
+
+ rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join( "|" ) );
+ rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join( "|" ) );
+
+ /* Contains
+ ---------------------------------------------------------------------- */
+ hasCompare = rnative.test( docElem.compareDocumentPosition );
+
+ // Element contains another
+ // Purposefully self-exclusive
+ // As in, an element does not contain itself
+ contains = hasCompare || rnative.test( docElem.contains ) ?
+ function( a, b ) {
+ var adown = a.nodeType === 9 ? a.documentElement : a,
+ bup = b && b.parentNode;
+ return a === bup || !!( bup && bup.nodeType === 1 && (
+ adown.contains ?
+ adown.contains( bup ) : + a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 + ) ); + } : + function( a, b ) { + if ( b ) { + while ( ( b = b.parentNode ) ) { + if ( b === a ) { + return true; + } + } + } + return false; + }; + + /* Sorting + ---------------------------------------------------------------------- */ + + // Document order sorting + sortOrder = hasCompare ? + function( a, b ) { + + // Flag for duplicate removal + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + // Sort on method existence if only one input has compareDocumentPosition + var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; + if ( compare ) { + return compare; + } + + // Calculate position if both inputs belong to the same document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + compare = ( a.ownerDocument || a ) == ( b.ownerDocument || b ) ? + a.compareDocumentPosition( b ) : + + // Otherwise we know they are disconnected + 1; + + // Disconnected nodes + if ( compare & 1 || + ( !support.sortDetached && b.compareDocumentPosition( a ) === compare ) ) { + + // Choose the first element that is related to our preferred document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( a == document || a.ownerDocument == preferredDoc && + contains( preferredDoc, a ) ) { + return -1; + } + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( b == document || b.ownerDocument == preferredDoc && + contains( preferredDoc, b ) ) { + return 1; + } + + // Maintain original order + return sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + } + + return compare & 4 ? -1 : 1; + } : + function( a, b ) { + + // Exit early if the nodes are identical + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + var cur, + i = 0, + aup = a.parentNode, + bup = b.parentNode, + ap = [ a ], + bp = [ b ]; + + // Parentless nodes are either documents or disconnected + if ( !aup || !bup ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + /* eslint-disable eqeqeq */ + return a == document ? -1 : + b == document ? 1 : + /* eslint-enable eqeqeq */ + aup ? -1 : + bup ? 1 : + sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + + // If the nodes are siblings, we can do a quick check + } else if ( aup === bup ) { + return siblingCheck( a, b ); + } + + // Otherwise we need full lists of their ancestors for comparison + cur = a; + while ( ( cur = cur.parentNode ) ) { + ap.unshift( cur ); + } + cur = b; + while ( ( cur = cur.parentNode ) ) { + bp.unshift( cur ); + } + + // Walk down the tree looking for a discrepancy + while ( ap[ i ] === bp[ i ] ) { + i++; + } + + return i ? + + // Do a sibling check if the nodes have a common ancestor + siblingCheck( ap[ i ], bp[ i ] ) : + + // Otherwise nodes in our document sort first + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ /* eslint-disable eqeqeq */ + ap[ i ] == preferredDoc ? -1 : + bp[ i ] == preferredDoc ? 1 : + /* eslint-enable eqeqeq */ + 0; + }; + + return document; +}; + +Sizzle.matches = function( expr, elements ) { + return Sizzle( expr, null, null, elements ); +}; + +Sizzle.matchesSelector = function( elem, expr ) { + setDocument( elem ); + + if ( support.matchesSelector && documentIsHTML && + !nonnativeSelectorCache[ expr + " " ] && + ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && + ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { + + try { + var ret = matches.call( elem, expr ); + + // IE 9's matchesSelector returns false on disconnected nodes + if ( ret || support.disconnectedMatch || + + // As well, disconnected nodes are said to be in a document + // fragment in IE 9 + elem.document && elem.document.nodeType !== 11 ) { + return ret; + } + } catch ( e ) { + nonnativeSelectorCache( expr, true ); + } + } + + return Sizzle( expr, document, null, [ elem ] ).length > 0; +}; + +Sizzle.contains = function( context, elem ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( ( context.ownerDocument || context ) != document ) { + setDocument( context ); + } + return contains( context, elem ); +}; + +Sizzle.attr = function( elem, name ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( ( elem.ownerDocument || elem ) != document ) { + setDocument( elem ); + } + + var fn = Expr.attrHandle[ name.toLowerCase() ], + + // Don't get fooled by Object.prototype properties (jQuery #13807) + val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? + fn( elem, name, !documentIsHTML ) : + undefined; + + return val !== undefined ? + val : + support.attributes || !documentIsHTML ? + elem.getAttribute( name ) : + ( val = elem.getAttributeNode( name ) ) && val.specified ? 
+ val.value : + null; +}; + +Sizzle.escape = function( sel ) { + return ( sel + "" ).replace( rcssescape, fcssescape ); +}; + +Sizzle.error = function( msg ) { + throw new Error( "Syntax error, unrecognized expression: " + msg ); +}; + +/** + * Document sorting and removing duplicates + * @param {ArrayLike} results + */ +Sizzle.uniqueSort = function( results ) { + var elem, + duplicates = [], + j = 0, + i = 0; + + // Unless we *know* we can detect duplicates, assume their presence + hasDuplicate = !support.detectDuplicates; + sortInput = !support.sortStable && results.slice( 0 ); + results.sort( sortOrder ); + + if ( hasDuplicate ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem === results[ i ] ) { + j = duplicates.push( i ); + } + } + while ( j-- ) { + results.splice( duplicates[ j ], 1 ); + } + } + + // Clear input after sorting to release objects + // See https://github.com/jquery/sizzle/pull/225 + sortInput = null; + + return results; +}; + +/** + * Utility function for retrieving the text value of an array of DOM nodes + * @param {Array|Element} elem + */ +getText = Sizzle.getText = function( elem ) { + var node, + ret = "", + i = 0, + nodeType = elem.nodeType; + + if ( !nodeType ) { + + // If no nodeType, this is expected to be an array + while ( ( node = elem[ i++ ] ) ) { + + // Do not traverse comment nodes + ret += getText( node ); + } + } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { + + // Use textContent for elements + // innerText usage removed for consistency of new lines (jQuery #11153) + if ( typeof elem.textContent === "string" ) { + return elem.textContent; + } else { + + // Traverse its children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + ret += getText( elem ); + } + } + } else if ( nodeType === 3 || nodeType === 4 ) { + return elem.nodeValue; + } + + // Do not include comment or processing instruction nodes + + return ret; +}; + +Expr = Sizzle.selectors = { + + // Can be adjusted by the user + cacheLength: 50, + + createPseudo: markFunction, + + match: matchExpr, + + attrHandle: {}, + + find: {}, + + relative: { + ">": { dir: "parentNode", first: true }, + " ": { dir: "parentNode" }, + "+": { dir: "previousSibling", first: true }, + "~": { dir: "previousSibling" } + }, + + preFilter: { + "ATTR": function( match ) { + match[ 1 ] = match[ 1 ].replace( runescape, funescape ); + + // Move the given value to match[3] whether quoted or unquoted + match[ 3 ] = ( match[ 3 ] || match[ 4 ] || + match[ 5 ] || "" ).replace( runescape, funescape ); + + if ( match[ 2 ] === "~=" ) { + match[ 3 ] = " " + match[ 3 ] + " "; + } + + return match.slice( 0, 4 ); + }, + + "CHILD": function( match ) { + + /* matches from matchExpr["CHILD"] + 1 type (only|nth|...) + 2 what (child|of-type) + 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) + 4 xn-component of xn+y argument ([+-]?\d*n|) + 5 sign of xn-component + 6 x of xn-component + 7 sign of y-component + 8 y of y-component + */ + match[ 1 ] = match[ 1 ].toLowerCase(); + + if ( match[ 1 ].slice( 0, 3 ) === "nth" ) { + + // nth-* requires argument + if ( !match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + // numeric x and y parameters for Expr.filter.CHILD + // remember that false/true cast respectively to 0/1 + match[ 4 ] = +( match[ 4 ] ? 
+ match[ 5 ] + ( match[ 6 ] || 1 ) : + 2 * ( match[ 3 ] === "even" || match[ 3 ] === "odd" ) ); + match[ 5 ] = +( ( match[ 7 ] + match[ 8 ] ) || match[ 3 ] === "odd" ); + + // other types prohibit arguments + } else if ( match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + return match; + }, + + "PSEUDO": function( match ) { + var excess, + unquoted = !match[ 6 ] && match[ 2 ]; + + if ( matchExpr[ "CHILD" ].test( match[ 0 ] ) ) { + return null; + } + + // Accept quoted arguments as-is + if ( match[ 3 ] ) { + match[ 2 ] = match[ 4 ] || match[ 5 ] || ""; + + // Strip excess characters from unquoted arguments + } else if ( unquoted && rpseudo.test( unquoted ) && + + // Get excess from tokenize (recursively) + ( excess = tokenize( unquoted, true ) ) && + + // advance to the next closing parenthesis + ( excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length ) ) { + + // excess is a negative index + match[ 0 ] = match[ 0 ].slice( 0, excess ); + match[ 2 ] = unquoted.slice( 0, excess ); + } + + // Return only captures needed by the pseudo filter method (type and argument) + return match.slice( 0, 3 ); + } + }, + + filter: { + + "TAG": function( nodeNameSelector ) { + var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); + return nodeNameSelector === "*" ? + function() { + return true; + } : + function( elem ) { + return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; + }; + }, + + "CLASS": function( className ) { + var pattern = classCache[ className + " " ]; + + return pattern || + ( pattern = new RegExp( "(^|" + whitespace + + ")" + className + "(" + whitespace + "|$)" ) ) && classCache( + className, function( elem ) { + return pattern.test( + typeof elem.className === "string" && elem.className || + typeof elem.getAttribute !== "undefined" && + elem.getAttribute( "class" ) || + "" + ); + } ); + }, + + "ATTR": function( name, operator, check ) { + return function( elem ) { + var result = Sizzle.attr( elem, name ); + + if ( result == null ) { + return operator === "!="; + } + if ( !operator ) { + return true; + } + + result += ""; + + /* eslint-disable max-len */ + + return operator === "=" ? result === check : + operator === "!=" ? result !== check : + operator === "^=" ? check && result.indexOf( check ) === 0 : + operator === "*=" ? check && result.indexOf( check ) > -1 : + operator === "$=" ? check && result.slice( -check.length ) === check : + operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : + operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : + false; + /* eslint-enable max-len */ + + }; + }, + + "CHILD": function( type, what, _argument, first, last ) { + var simple = type.slice( 0, 3 ) !== "nth", + forward = type.slice( -4 ) !== "last", + ofType = what === "of-type"; + + return first === 1 && last === 0 ? + + // Shortcut for :nth-*(n) + function( elem ) { + return !!elem.parentNode; + } : + + function( elem, _context, xml ) { + var cache, uniqueCache, outerCache, node, nodeIndex, start, + dir = simple !== forward ? "nextSibling" : "previousSibling", + parent = elem.parentNode, + name = ofType && elem.nodeName.toLowerCase(), + useCache = !xml && !ofType, + diff = false; + + if ( parent ) { + + // :(first|last|only)-(child|of-type) + if ( simple ) { + while ( dir ) { + node = elem; + while ( ( node = node[ dir ] ) ) { + if ( ofType ? 
+ node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) { + + return false; + } + } + + // Reverse direction for :only-* (if we haven't yet done so) + start = dir = type === "only" && !start && "nextSibling"; + } + return true; + } + + start = [ forward ? parent.firstChild : parent.lastChild ]; + + // non-xml :nth-child(...) stores cache data on `parent` + if ( forward && useCache ) { + + // Seek `elem` from a previously-cached index + + // ...in a gzip-friendly way + node = parent; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex && cache[ 2 ]; + node = nodeIndex && parent.childNodes[ nodeIndex ]; + + while ( ( node = ++nodeIndex && node && node[ dir ] || + + // Fallback to seeking `elem` from the start + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + // When found, cache indexes on `parent` and break + if ( node.nodeType === 1 && ++diff && node === elem ) { + uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; + break; + } + } + + } else { + + // Use previously-cached element index if available + if ( useCache ) { + + // ...in a gzip-friendly way + node = elem; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex; + } + + // xml :nth-child(...) + // or :nth-last-child(...) or :nth(-last)?-of-type(...) + if ( diff === false ) { + + // Use the same loop as above to seek `elem` from the start + while ( ( node = ++nodeIndex && node && node[ dir ] || + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + if ( ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) && + ++diff ) { + + // Cache the index of each encountered element + if ( useCache ) { + outerCache = node[ expando ] || + ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + uniqueCache[ type ] = [ dirruns, diff ]; + } + + if ( node === elem ) { + break; + } + } + } + } + } + + // Incorporate the offset, then check against cycle size + diff -= last; + return diff === first || ( diff % first === 0 && diff / first >= 0 ); + } + }; + }, + + "PSEUDO": function( pseudo, argument ) { + + // pseudo-class names are case-insensitive + // http://www.w3.org/TR/selectors/#pseudo-classes + // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters + // Remember that setFilters inherits from pseudos + var args, + fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || + Sizzle.error( "unsupported pseudo: " + pseudo ); + + // The user may use createPseudo to indicate that + // arguments are needed to create the filter function + // just as Sizzle does + if ( fn[ expando ] ) { + return fn( argument ); + } + + // But maintain support for old signatures + if ( fn.length > 1 ) { + args = [ pseudo, pseudo, "", argument ]; + return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
+ markFunction( function( seed, matches ) { + var idx, + matched = fn( seed, argument ), + i = matched.length; + while ( i-- ) { + idx = indexOf( seed, matched[ i ] ); + seed[ idx ] = !( matches[ idx ] = matched[ i ] ); + } + } ) : + function( elem ) { + return fn( elem, 0, args ); + }; + } + + return fn; + } + }, + + pseudos: { + + // Potentially complex pseudos + "not": markFunction( function( selector ) { + + // Trim the selector passed to compile + // to avoid treating leading and trailing + // spaces as combinators + var input = [], + results = [], + matcher = compile( selector.replace( rtrim, "$1" ) ); + + return matcher[ expando ] ? + markFunction( function( seed, matches, _context, xml ) { + var elem, + unmatched = matcher( seed, null, xml, [] ), + i = seed.length; + + // Match elements unmatched by `matcher` + while ( i-- ) { + if ( ( elem = unmatched[ i ] ) ) { + seed[ i ] = !( matches[ i ] = elem ); + } + } + } ) : + function( elem, _context, xml ) { + input[ 0 ] = elem; + matcher( input, null, xml, results ); + + // Don't keep the element (issue #299) + input[ 0 ] = null; + return !results.pop(); + }; + } ), + + "has": markFunction( function( selector ) { + return function( elem ) { + return Sizzle( selector, elem ).length > 0; + }; + } ), + + "contains": markFunction( function( text ) { + text = text.replace( runescape, funescape ); + return function( elem ) { + return ( elem.textContent || getText( elem ) ).indexOf( text ) > -1; + }; + } ), + + // "Whether an element is represented by a :lang() selector + // is based solely on the element's language value + // being equal to the identifier C, + // or beginning with the identifier C immediately followed by "-". + // The matching of C against the element's language value is performed case-insensitively. + // The identifier C does not have to be a valid language name." + // http://www.w3.org/TR/selectors/#lang-pseudo + "lang": markFunction( function( lang ) { + + // lang value must be a valid identifier + if ( !ridentifier.test( lang || "" ) ) { + Sizzle.error( "unsupported lang: " + lang ); + } + lang = lang.replace( runescape, funescape ).toLowerCase(); + return function( elem ) { + var elemLang; + do { + if ( ( elemLang = documentIsHTML ? 
+ elem.lang : + elem.getAttribute( "xml:lang" ) || elem.getAttribute( "lang" ) ) ) { + + elemLang = elemLang.toLowerCase(); + return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; + } + } while ( ( elem = elem.parentNode ) && elem.nodeType === 1 ); + return false; + }; + } ), + + // Miscellaneous + "target": function( elem ) { + var hash = window.location && window.location.hash; + return hash && hash.slice( 1 ) === elem.id; + }, + + "root": function( elem ) { + return elem === docElem; + }, + + "focus": function( elem ) { + return elem === document.activeElement && + ( !document.hasFocus || document.hasFocus() ) && + !!( elem.type || elem.href || ~elem.tabIndex ); + }, + + // Boolean properties + "enabled": createDisabledPseudo( false ), + "disabled": createDisabledPseudo( true ), + + "checked": function( elem ) { + + // In CSS3, :checked should return both checked and selected elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + var nodeName = elem.nodeName.toLowerCase(); + return ( nodeName === "input" && !!elem.checked ) || + ( nodeName === "option" && !!elem.selected ); + }, + + "selected": function( elem ) { + + // Accessing this property makes selected-by-default + // options in Safari work properly + if ( elem.parentNode ) { + // eslint-disable-next-line no-unused-expressions + elem.parentNode.selectedIndex; + } + + return elem.selected === true; + }, + + // Contents + "empty": function( elem ) { + + // http://www.w3.org/TR/selectors/#empty-pseudo + // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), + // but not by others (comment: 8; processing instruction: 7; etc.) + // nodeType < 6 works because attributes (2) do not appear as children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + if ( elem.nodeType < 6 ) { + return false; + } + } + return true; + }, + + "parent": function( elem ) { + return !Expr.pseudos[ "empty" ]( elem ); + }, + + // Element/input types + "header": function( elem ) { + return rheader.test( elem.nodeName ); + }, + + "input": function( elem ) { + return rinputs.test( elem.nodeName ); + }, + + "button": function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === "button" || name === "button"; + }, + + "text": function( elem ) { + var attr; + return elem.nodeName.toLowerCase() === "input" && + elem.type === "text" && + + // Support: IE<8 + // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" + ( ( attr = elem.getAttribute( "type" ) ) == null || + attr.toLowerCase() === "text" ); + }, + + // Position-in-collection + "first": createPositionalPseudo( function() { + return [ 0 ]; + } ), + + "last": createPositionalPseudo( function( _matchIndexes, length ) { + return [ length - 1 ]; + } ), + + "eq": createPositionalPseudo( function( _matchIndexes, length, argument ) { + return [ argument < 0 ? argument + length : argument ]; + } ), + + "even": createPositionalPseudo( function( matchIndexes, length ) { + var i = 0; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "odd": createPositionalPseudo( function( matchIndexes, length ) { + var i = 1; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "lt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? + argument + length : + argument > length ? 
+ length : + argument; + for ( ; --i >= 0; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "gt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? argument + length : argument; + for ( ; ++i < length; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ) + } +}; + +Expr.pseudos[ "nth" ] = Expr.pseudos[ "eq" ]; + +// Add button/input type pseudos +for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { + Expr.pseudos[ i ] = createInputPseudo( i ); +} +for ( i in { submit: true, reset: true } ) { + Expr.pseudos[ i ] = createButtonPseudo( i ); +} + +// Easy API for creating new setFilters +function setFilters() {} +setFilters.prototype = Expr.filters = Expr.pseudos; +Expr.setFilters = new setFilters(); + +tokenize = Sizzle.tokenize = function( selector, parseOnly ) { + var matched, match, tokens, type, + soFar, groups, preFilters, + cached = tokenCache[ selector + " " ]; + + if ( cached ) { + return parseOnly ? 0 : cached.slice( 0 ); + } + + soFar = selector; + groups = []; + preFilters = Expr.preFilter; + + while ( soFar ) { + + // Comma and first run + if ( !matched || ( match = rcomma.exec( soFar ) ) ) { + if ( match ) { + + // Don't consume trailing commas as valid + soFar = soFar.slice( match[ 0 ].length ) || soFar; + } + groups.push( ( tokens = [] ) ); + } + + matched = false; + + // Combinators + if ( ( match = rcombinators.exec( soFar ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + + // Cast descendant combinators to space + type: match[ 0 ].replace( rtrim, " " ) + } ); + soFar = soFar.slice( matched.length ); + } + + // Filters + for ( type in Expr.filter ) { + if ( ( match = matchExpr[ type ].exec( soFar ) ) && ( !preFilters[ type ] || + ( match = preFilters[ type ]( match ) ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + type: type, + matches: match + } ); + soFar = soFar.slice( matched.length ); + } + } + + if ( !matched ) { + break; + } + } + + // Return the length of the invalid excess + // if we're just parsing + // Otherwise, throw an error or return tokens + return parseOnly ? + soFar.length : + soFar ? + Sizzle.error( selector ) : + + // Cache the tokens + tokenCache( selector, groups ).slice( 0 ); +}; + +function toSelector( tokens ) { + var i = 0, + len = tokens.length, + selector = ""; + for ( ; i < len; i++ ) { + selector += tokens[ i ].value; + } + return selector; +} + +function addCombinator( matcher, combinator, base ) { + var dir = combinator.dir, + skip = combinator.next, + key = skip || dir, + checkNonElements = base && key === "parentNode", + doneName = done++; + + return combinator.first ? 
+ + // Check against closest ancestor/preceding element + function( elem, context, xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + return matcher( elem, context, xml ); + } + } + return false; + } : + + // Check against all ancestor/preceding elements + function( elem, context, xml ) { + var oldCache, uniqueCache, outerCache, + newCache = [ dirruns, doneName ]; + + // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching + if ( xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + if ( matcher( elem, context, xml ) ) { + return true; + } + } + } + } else { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + outerCache = elem[ expando ] || ( elem[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ elem.uniqueID ] || + ( outerCache[ elem.uniqueID ] = {} ); + + if ( skip && skip === elem.nodeName.toLowerCase() ) { + elem = elem[ dir ] || elem; + } else if ( ( oldCache = uniqueCache[ key ] ) && + oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { + + // Assign to newCache so results back-propagate to previous elements + return ( newCache[ 2 ] = oldCache[ 2 ] ); + } else { + + // Reuse newcache so results back-propagate to previous elements + uniqueCache[ key ] = newCache; + + // A match means we're done; a fail means we have to keep checking + if ( ( newCache[ 2 ] = matcher( elem, context, xml ) ) ) { + return true; + } + } + } + } + } + return false; + }; +} + +function elementMatcher( matchers ) { + return matchers.length > 1 ? + function( elem, context, xml ) { + var i = matchers.length; + while ( i-- ) { + if ( !matchers[ i ]( elem, context, xml ) ) { + return false; + } + } + return true; + } : + matchers[ 0 ]; +} + +function multipleContexts( selector, contexts, results ) { + var i = 0, + len = contexts.length; + for ( ; i < len; i++ ) { + Sizzle( selector, contexts[ i ], results ); + } + return results; +} + +function condense( unmatched, map, filter, context, xml ) { + var elem, + newUnmatched = [], + i = 0, + len = unmatched.length, + mapped = map != null; + + for ( ; i < len; i++ ) { + if ( ( elem = unmatched[ i ] ) ) { + if ( !filter || filter( elem, context, xml ) ) { + newUnmatched.push( elem ); + if ( mapped ) { + map.push( i ); + } + } + } + } + + return newUnmatched; +} + +function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { + if ( postFilter && !postFilter[ expando ] ) { + postFilter = setMatcher( postFilter ); + } + if ( postFinder && !postFinder[ expando ] ) { + postFinder = setMatcher( postFinder, postSelector ); + } + return markFunction( function( seed, results, context, xml ) { + var temp, i, elem, + preMap = [], + postMap = [], + preexisting = results.length, + + // Get initial elements from seed or context + elems = seed || multipleContexts( + selector || "*", + context.nodeType ? [ context ] : context, + [] + ), + + // Prefilter to get matcher input, preserving a map for seed-results synchronization + matcherIn = preFilter && ( seed || !selector ) ? + condense( elems, preMap, preFilter, context, xml ) : + elems, + + matcherOut = matcher ? + + // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, + postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
+ + // ...intermediate processing is necessary + [] : + + // ...otherwise use results directly + results : + matcherIn; + + // Find primary matches + if ( matcher ) { + matcher( matcherIn, matcherOut, context, xml ); + } + + // Apply postFilter + if ( postFilter ) { + temp = condense( matcherOut, postMap ); + postFilter( temp, [], context, xml ); + + // Un-match failing elements by moving them back to matcherIn + i = temp.length; + while ( i-- ) { + if ( ( elem = temp[ i ] ) ) { + matcherOut[ postMap[ i ] ] = !( matcherIn[ postMap[ i ] ] = elem ); + } + } + } + + if ( seed ) { + if ( postFinder || preFilter ) { + if ( postFinder ) { + + // Get the final matcherOut by condensing this intermediate into postFinder contexts + temp = []; + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) ) { + + // Restore matcherIn since elem is not yet a final match + temp.push( ( matcherIn[ i ] = elem ) ); + } + } + postFinder( null, ( matcherOut = [] ), temp, xml ); + } + + // Move matched elements from seed to results to keep them synchronized + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) && + ( temp = postFinder ? indexOf( seed, elem ) : preMap[ i ] ) > -1 ) { + + seed[ temp ] = !( results[ temp ] = elem ); + } + } + } + + // Add elements to results, through postFinder if defined + } else { + matcherOut = condense( + matcherOut === results ? + matcherOut.splice( preexisting, matcherOut.length ) : + matcherOut + ); + if ( postFinder ) { + postFinder( null, results, matcherOut, xml ); + } else { + push.apply( results, matcherOut ); + } + } + } ); +} + +function matcherFromTokens( tokens ) { + var checkContext, matcher, j, + len = tokens.length, + leadingRelative = Expr.relative[ tokens[ 0 ].type ], + implicitRelative = leadingRelative || Expr.relative[ " " ], + i = leadingRelative ? 1 : 0, + + // The foundational matcher ensures that elements are reachable from top-level context(s) + matchContext = addCombinator( function( elem ) { + return elem === checkContext; + }, implicitRelative, true ), + matchAnyContext = addCombinator( function( elem ) { + return indexOf( checkContext, elem ) > -1; + }, implicitRelative, true ), + matchers = [ function( elem, context, xml ) { + var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( + ( checkContext = context ).nodeType ? + matchContext( elem, context, xml ) : + matchAnyContext( elem, context, xml ) ); + + // Avoid hanging onto element (issue #299) + checkContext = null; + return ret; + } ]; + + for ( ; i < len; i++ ) { + if ( ( matcher = Expr.relative[ tokens[ i ].type ] ) ) { + matchers = [ addCombinator( elementMatcher( matchers ), matcher ) ]; + } else { + matcher = Expr.filter[ tokens[ i ].type ].apply( null, tokens[ i ].matches ); + + // Return special upon seeing a positional matcher + if ( matcher[ expando ] ) { + + // Find the next relative operator (if any) for proper handling + j = ++i; + for ( ; j < len; j++ ) { + if ( Expr.relative[ tokens[ j ].type ] ) { + break; + } + } + return setMatcher( + i > 1 && elementMatcher( matchers ), + i > 1 && toSelector( + + // If the preceding token was a descendant combinator, insert an implicit any-element `*` + tokens + .slice( 0, i - 1 ) + .concat( { value: tokens[ i - 2 ].type === " " ? 
"*" : "" } ) + ).replace( rtrim, "$1" ), + matcher, + i < j && matcherFromTokens( tokens.slice( i, j ) ), + j < len && matcherFromTokens( ( tokens = tokens.slice( j ) ) ), + j < len && toSelector( tokens ) + ); + } + matchers.push( matcher ); + } + } + + return elementMatcher( matchers ); +} + +function matcherFromGroupMatchers( elementMatchers, setMatchers ) { + var bySet = setMatchers.length > 0, + byElement = elementMatchers.length > 0, + superMatcher = function( seed, context, xml, results, outermost ) { + var elem, j, matcher, + matchedCount = 0, + i = "0", + unmatched = seed && [], + setMatched = [], + contextBackup = outermostContext, + + // We must always have either seed elements or outermost context + elems = seed || byElement && Expr.find[ "TAG" ]( "*", outermost ), + + // Use integer dirruns iff this is the outermost matcher + dirrunsUnique = ( dirruns += contextBackup == null ? 1 : Math.random() || 0.1 ), + len = elems.length; + + if ( outermost ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + outermostContext = context == document || context || outermost; + } + + // Add elements passing elementMatchers directly to results + // Support: IE<9, Safari + // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id + for ( ; i !== len && ( elem = elems[ i ] ) != null; i++ ) { + if ( byElement && elem ) { + j = 0; + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( !context && elem.ownerDocument != document ) { + setDocument( elem ); + xml = !documentIsHTML; + } + while ( ( matcher = elementMatchers[ j++ ] ) ) { + if ( matcher( elem, context || document, xml ) ) { + results.push( elem ); + break; + } + } + if ( outermost ) { + dirruns = dirrunsUnique; + } + } + + // Track unmatched elements for set filters + if ( bySet ) { + + // They will have gone through all possible matchers + if ( ( elem = !matcher && elem ) ) { + matchedCount--; + } + + // Lengthen the array for every element, matched or not + if ( seed ) { + unmatched.push( elem ); + } + } + } + + // `i` is now the count of elements visited above, and adding it to `matchedCount` + // makes the latter nonnegative. + matchedCount += i; + + // Apply set filters to unmatched elements + // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` + // equals `i`), unless we didn't visit _any_ elements in the above loop because we have + // no element matchers and no seed. + // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that + // case, which will result in a "00" `matchedCount` that differs from `i` but is also + // numerically zero. 
+ if ( bySet && i !== matchedCount ) { + j = 0; + while ( ( matcher = setMatchers[ j++ ] ) ) { + matcher( unmatched, setMatched, context, xml ); + } + + if ( seed ) { + + // Reintegrate element matches to eliminate the need for sorting + if ( matchedCount > 0 ) { + while ( i-- ) { + if ( !( unmatched[ i ] || setMatched[ i ] ) ) { + setMatched[ i ] = pop.call( results ); + } + } + } + + // Discard index placeholder values to get only actual matches + setMatched = condense( setMatched ); + } + + // Add matches to results + push.apply( results, setMatched ); + + // Seedless set matches succeeding multiple successful matchers stipulate sorting + if ( outermost && !seed && setMatched.length > 0 && + ( matchedCount + setMatchers.length ) > 1 ) { + + Sizzle.uniqueSort( results ); + } + } + + // Override manipulation of globals by nested matchers + if ( outermost ) { + dirruns = dirrunsUnique; + outermostContext = contextBackup; + } + + return unmatched; + }; + + return bySet ? + markFunction( superMatcher ) : + superMatcher; +} + +compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { + var i, + setMatchers = [], + elementMatchers = [], + cached = compilerCache[ selector + " " ]; + + if ( !cached ) { + + // Generate a function of recursive functions that can be used to check each element + if ( !match ) { + match = tokenize( selector ); + } + i = match.length; + while ( i-- ) { + cached = matcherFromTokens( match[ i ] ); + if ( cached[ expando ] ) { + setMatchers.push( cached ); + } else { + elementMatchers.push( cached ); + } + } + + // Cache the compiled function + cached = compilerCache( + selector, + matcherFromGroupMatchers( elementMatchers, setMatchers ) + ); + + // Save selector and tokenization + cached.selector = selector; + } + return cached; +}; + +/** + * A low-level selection function that works with Sizzle's compiled + * selector functions + * @param {String|Function} selector A selector or a pre-compiled + * selector function built with Sizzle.compile + * @param {Element} context + * @param {Array} [results] + * @param {Array} [seed] A set of elements to match against + */ +select = Sizzle.select = function( selector, context, results, seed ) { + var i, tokens, token, type, find, + compiled = typeof selector === "function" && selector, + match = !seed && tokenize( ( selector = compiled.selector || selector ) ); + + results = results || []; + + // Try to minimize operations if there is only one selector in the list and no seed + // (the latter of which guarantees us context) + if ( match.length === 1 ) { + + // Reduce context if the leading compound selector is an ID + tokens = match[ 0 ] = match[ 0 ].slice( 0 ); + if ( tokens.length > 2 && ( token = tokens[ 0 ] ).type === "ID" && + context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[ 1 ].type ] ) { + + context = ( Expr.find[ "ID" ]( token.matches[ 0 ] + .replace( runescape, funescape ), context ) || [] )[ 0 ]; + if ( !context ) { + return results; + + // Precompiled matchers will still verify ancestry, so step up a level + } else if ( compiled ) { + context = context.parentNode; + } + + selector = selector.slice( tokens.shift().value.length ); + } + + // Fetch a seed set for right-to-left matching + i = matchExpr[ "needsContext" ].test( selector ) ? 
0 : tokens.length; + while ( i-- ) { + token = tokens[ i ]; + + // Abort if we hit a combinator + if ( Expr.relative[ ( type = token.type ) ] ) { + break; + } + if ( ( find = Expr.find[ type ] ) ) { + + // Search, expanding context for leading sibling combinators + if ( ( seed = find( + token.matches[ 0 ].replace( runescape, funescape ), + rsibling.test( tokens[ 0 ].type ) && testContext( context.parentNode ) || + context + ) ) ) { + + // If seed is empty or no tokens remain, we can return early + tokens.splice( i, 1 ); + selector = seed.length && toSelector( tokens ); + if ( !selector ) { + push.apply( results, seed ); + return results; + } + + break; + } + } + } + } + + // Compile and execute a filtering function if one is not provided + // Provide `match` to avoid retokenization if we modified the selector above + ( compiled || compile( selector, match ) )( + seed, + context, + !documentIsHTML, + results, + !context || rsibling.test( selector ) && testContext( context.parentNode ) || context + ); + return results; +}; + +// One-time assignments + +// Sort stability +support.sortStable = expando.split( "" ).sort( sortOrder ).join( "" ) === expando; + +// Support: Chrome 14-35+ +// Always assume duplicates if they aren't passed to the comparison function +support.detectDuplicates = !!hasDuplicate; + +// Initialize against the default document +setDocument(); + +// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) +// Detached nodes confoundingly follow *each other* +support.sortDetached = assert( function( el ) { + + // Should return 1, but returns 4 (following) + return el.compareDocumentPosition( document.createElement( "fieldset" ) ) & 1; +} ); + +// Support: IE<8 +// Prevent attribute/property "interpolation" +// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx +if ( !assert( function( el ) { + el.innerHTML = "<a href='#'></a>"; + return el.firstChild.getAttribute( "href" ) === "#"; +} ) ) { + addHandle( "type|href|height|width", function( elem, name, isXML ) { + if ( !isXML ) { + return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); + } + } ); +} + +// Support: IE<9 +// Use defaultValue in place of getAttribute("value") +if ( !support.attributes || !assert( function( el ) { + el.innerHTML = "<input/>"; + el.firstChild.setAttribute( "value", "" ); + return el.firstChild.getAttribute( "value" ) === ""; +} ) ) { + addHandle( "value", function( elem, _name, isXML ) { + if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { + return elem.defaultValue; + } + } ); +} + +// Support: IE<9 +// Use getAttributeNode to fetch booleans when getAttribute lies +if ( !assert( function( el ) { + return el.getAttribute( "disabled" ) == null; +} ) ) { + addHandle( booleans, function( elem, name, isXML ) { + var val; + if ( !isXML ) { + return elem[ name ] === true ? name.toLowerCase() : + ( val = elem.getAttributeNode( name ) ) && val.specified ?
+ val.value : + null; + } + } ); +} + +return Sizzle; + +} )( window ); + + + +jQuery.find = Sizzle; +jQuery.expr = Sizzle.selectors; + +// Deprecated +jQuery.expr[ ":" ] = jQuery.expr.pseudos; +jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; +jQuery.text = Sizzle.getText; +jQuery.isXMLDoc = Sizzle.isXML; +jQuery.contains = Sizzle.contains; +jQuery.escapeSelector = Sizzle.escape; + + + + +var dir = function( elem, dir, until ) { + var matched = [], + truncate = until !== undefined; + + while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { + if ( elem.nodeType === 1 ) { + if ( truncate && jQuery( elem ).is( until ) ) { + break; + } + matched.push( elem ); + } + } + return matched; +}; + + +var siblings = function( n, elem ) { + var matched = []; + + for ( ; n; n = n.nextSibling ) { + if ( n.nodeType === 1 && n !== elem ) { + matched.push( n ); + } + } + + return matched; +}; + + +var rneedsContext = jQuery.expr.match.needsContext; + + + +function nodeName( elem, name ) { + + return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); + +} +var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); + + + +// Implement the identical functionality for filter and not +function winnow( elements, qualifier, not ) { + if ( isFunction( qualifier ) ) { + return jQuery.grep( elements, function( elem, i ) { + return !!qualifier.call( elem, i, elem ) !== not; + } ); + } + + // Single element + if ( qualifier.nodeType ) { + return jQuery.grep( elements, function( elem ) { + return ( elem === qualifier ) !== not; + } ); + } + + // Arraylike of elements (jQuery, arguments, Array) + if ( typeof qualifier !== "string" ) { + return jQuery.grep( elements, function( elem ) { + return ( indexOf.call( qualifier, elem ) > -1 ) !== not; + } ); + } + + // Filtered directly for both simple and complex selectors + return jQuery.filter( qualifier, elements, not ); +} + +jQuery.filter = function( expr, elems, not ) { + var elem = elems[ 0 ]; + + if ( not ) { + expr = ":not(" + expr + ")"; + } + + if ( elems.length === 1 && elem.nodeType === 1 ) { + return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; + } + + return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { + return elem.nodeType === 1; + } ) ); +}; + +jQuery.fn.extend( { + find: function( selector ) { + var i, ret, + len = this.length, + self = this; + + if ( typeof selector !== "string" ) { + return this.pushStack( jQuery( selector ).filter( function() { + for ( i = 0; i < len; i++ ) { + if ( jQuery.contains( self[ i ], this ) ) { + return true; + } + } + } ) ); + } + + ret = this.pushStack( [] ); + + for ( i = 0; i < len; i++ ) { + jQuery.find( selector, self[ i ], ret ); + } + + return len > 1 ? jQuery.uniqueSort( ret ) : ret; + }, + filter: function( selector ) { + return this.pushStack( winnow( this, selector || [], false ) ); + }, + not: function( selector ) { + return this.pushStack( winnow( this, selector || [], true ) ); + }, + is: function( selector ) { + return !!winnow( + this, + + // If this is a positional/relative selector, check membership in the returned set + // so $("p:first").is("p:last") won't return true for a doc with two "p". + typeof selector === "string" && rneedsContext.test( selector ) ? 
+ jQuery( selector ) : + selector || [], + false + ).length; + } +} ); + + +// Initialize a jQuery object + + +// A central reference to the root jQuery(document) +var rootjQuery, + + // A simple way to check for HTML strings + // Prioritize #id over to avoid XSS via location.hash (#9521) + // Strict HTML recognition (#11290: must start with <) + // Shortcut simple #id case for speed + rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/, + + init = jQuery.fn.init = function( selector, context, root ) { + var match, elem; + + // HANDLE: $(""), $(null), $(undefined), $(false) + if ( !selector ) { + return this; + } + + // Method init() accepts an alternate rootjQuery + // so migrate can support jQuery.sub (gh-2101) + root = root || rootjQuery; + + // Handle HTML strings + if ( typeof selector === "string" ) { + if ( selector[ 0 ] === "<" && + selector[ selector.length - 1 ] === ">" && + selector.length >= 3 ) { + + // Assume that strings that start and end with <> are HTML and skip the regex check + match = [ null, selector, null ]; + + } else { + match = rquickExpr.exec( selector ); + } + + // Match html or make sure no context is specified for #id + if ( match && ( match[ 1 ] || !context ) ) { + + // HANDLE: $(html) -> $(array) + if ( match[ 1 ] ) { + context = context instanceof jQuery ? context[ 0 ] : context; + + // Option to run scripts is true for back-compat + // Intentionally let the error be thrown if parseHTML is not present + jQuery.merge( this, jQuery.parseHTML( + match[ 1 ], + context && context.nodeType ? context.ownerDocument || context : document, + true + ) ); + + // HANDLE: $(html, props) + if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) { + for ( match in context ) { + + // Properties of context are called as methods if possible + if ( isFunction( this[ match ] ) ) { + this[ match ]( context[ match ] ); + + // ...and otherwise set as attributes + } else { + this.attr( match, context[ match ] ); + } + } + } + + return this; + + // HANDLE: $(#id) + } else { + elem = document.getElementById( match[ 2 ] ); + + if ( elem ) { + + // Inject the element directly into the jQuery object + this[ 0 ] = elem; + this.length = 1; + } + return this; + } + + // HANDLE: $(expr, $(...)) + } else if ( !context || context.jquery ) { + return ( context || root ).find( selector ); + + // HANDLE: $(expr, context) + // (which is just equivalent to: $(context).find(expr) + } else { + return this.constructor( context ).find( selector ); + } + + // HANDLE: $(DOMElement) + } else if ( selector.nodeType ) { + this[ 0 ] = selector; + this.length = 1; + return this; + + // HANDLE: $(function) + // Shortcut for document ready + } else if ( isFunction( selector ) ) { + return root.ready !== undefined ? 
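+ // Illustrative calls only (not part of the library source): a quick map of the
+ // branches handled above; names like `container` are placeholders.
+ //     jQuery( "<div class='box'></div>" );          // $(html) -> parseHTML
+ //     jQuery( "<img/>", { alt: "logo" } );          // $(html, props)
+ //     jQuery( "#main" );                            // $(#id) via getElementById
+ //     jQuery( ".item", container );                 // $(expr, context)
+ //     jQuery( document.body );                      // $(DOMElement)
+ //     jQuery( function() { /* DOM is ready */ } );  // $(function)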
+ root.ready( selector ) : + + // Execute immediately if ready is not present + selector( jQuery ); + } + + return jQuery.makeArray( selector, this ); + }; + +// Give the init function the jQuery prototype for later instantiation +init.prototype = jQuery.fn; + +// Initialize central reference +rootjQuery = jQuery( document ); + + +var rparentsprev = /^(?:parents|prev(?:Until|All))/, + + // Methods guaranteed to produce a unique set when starting from a unique set + guaranteedUnique = { + children: true, + contents: true, + next: true, + prev: true + }; + +jQuery.fn.extend( { + has: function( target ) { + var targets = jQuery( target, this ), + l = targets.length; + + return this.filter( function() { + var i = 0; + for ( ; i < l; i++ ) { + if ( jQuery.contains( this, targets[ i ] ) ) { + return true; + } + } + } ); + }, + + closest: function( selectors, context ) { + var cur, + i = 0, + l = this.length, + matched = [], + targets = typeof selectors !== "string" && jQuery( selectors ); + + // Positional selectors never match, since there's no _selection_ context + if ( !rneedsContext.test( selectors ) ) { + for ( ; i < l; i++ ) { + for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { + + // Always skip document fragments + if ( cur.nodeType < 11 && ( targets ? + targets.index( cur ) > -1 : + + // Don't pass non-elements to Sizzle + cur.nodeType === 1 && + jQuery.find.matchesSelector( cur, selectors ) ) ) { + + matched.push( cur ); + break; + } + } + } + } + + return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); + }, + + // Determine the position of an element within the set + index: function( elem ) { + + // No argument, return index in parent + if ( !elem ) { + return ( this[ 0 ] && this[ 0 ].parentNode ) ? this.first().prevAll().length : -1; + } + + // Index in selector + if ( typeof elem === "string" ) { + return indexOf.call( jQuery( elem ), this[ 0 ] ); + } + + // Locate the position of the desired element + return indexOf.call( this, + + // If it receives a jQuery object, the first element is used + elem.jquery ? elem[ 0 ] : elem + ); + }, + + add: function( selector, context ) { + return this.pushStack( + jQuery.uniqueSort( + jQuery.merge( this.get(), jQuery( selector, context ) ) + ) + ); + }, + + addBack: function( selector ) { + return this.add( selector == null ? + this.prevObject : this.prevObject.filter( selector ) + ); + } +} ); + +function sibling( cur, dir ) { + while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} + return cur; +} + +jQuery.each( { + parent: function( elem ) { + var parent = elem.parentNode; + return parent && parent.nodeType !== 11 ? 
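+ // A small, hypothetical usage sketch for the set methods defined above
+ // (selectors and markup are placeholders):
+ //     jQuery( "li.active" ).closest( "ul" );  // nearest matching ancestor, self included
+ //     jQuery( "li" ).has( "a" );              // keep only items containing an <a>
+ //     jQuery( "p" ).add( "span" );            // merged, uniqueSort-ed set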
parent : null; + }, + parents: function( elem ) { + return dir( elem, "parentNode" ); + }, + parentsUntil: function( elem, _i, until ) { + return dir( elem, "parentNode", until ); + }, + next: function( elem ) { + return sibling( elem, "nextSibling" ); + }, + prev: function( elem ) { + return sibling( elem, "previousSibling" ); + }, + nextAll: function( elem ) { + return dir( elem, "nextSibling" ); + }, + prevAll: function( elem ) { + return dir( elem, "previousSibling" ); + }, + nextUntil: function( elem, _i, until ) { + return dir( elem, "nextSibling", until ); + }, + prevUntil: function( elem, _i, until ) { + return dir( elem, "previousSibling", until ); + }, + siblings: function( elem ) { + return siblings( ( elem.parentNode || {} ).firstChild, elem ); + }, + children: function( elem ) { + return siblings( elem.firstChild ); + }, + contents: function( elem ) { + if ( elem.contentDocument != null && + + // Support: IE 11+ + // elements with no `data` attribute has an object + // `contentDocument` with a `null` prototype. + getProto( elem.contentDocument ) ) { + + return elem.contentDocument; + } + + // Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only + // Treat the template element as a regular one in browsers that + // don't support it. + if ( nodeName( elem, "template" ) ) { + elem = elem.content || elem; + } + + return jQuery.merge( [], elem.childNodes ); + } +}, function( name, fn ) { + jQuery.fn[ name ] = function( until, selector ) { + var matched = jQuery.map( this, fn, until ); + + if ( name.slice( -5 ) !== "Until" ) { + selector = until; + } + + if ( selector && typeof selector === "string" ) { + matched = jQuery.filter( selector, matched ); + } + + if ( this.length > 1 ) { + + // Remove duplicates + if ( !guaranteedUnique[ name ] ) { + jQuery.uniqueSort( matched ); + } + + // Reverse order for parents* and prev-derivatives + if ( rparentsprev.test( name ) ) { + matched.reverse(); + } + } + + return this.pushStack( matched ); + }; +} ); +var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g ); + + + +// Convert String-formatted options into Object-formatted ones +function createOptions( options ) { + var object = {}; + jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) { + object[ flag ] = true; + } ); + return object; +} + +/* + * Create a callback list using the following parameters: + * + * options: an optional list of space-separated options that will change how + * the callback list behaves or a more traditional option object + * + * By default a callback list will act like an event callback list and can be + * "fired" multiple times. + * + * Possible options: + * + * once: will ensure the callback list can only be fired once (like a Deferred) + * + * memory: will keep track of previous values and will call any callback added + * after the list has been fired right away with the latest "memorized" + * values (like a Deferred) + * + * unique: will ensure a callback can only be added once (no duplicate in the list) + * + * stopOnFalse: interrupt callings when a callback returns false + * + */ +jQuery.Callbacks = function( options ) { + + // Convert options from String-formatted to Object-formatted if needed + // (we check in cache first) + options = typeof options === "string" ? 
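+ // A minimal usage sketch for jQuery.Callbacks, assuming only the flag strings
+ // documented in the comment block above (illustrative, not part of the source):
+ //     var callbacks = jQuery.Callbacks( "once memory" );
+ //     callbacks.add( function( msg ) { console.log( msg ); } );
+ //     callbacks.fire( "ready" );               // logs "ready"; "once" blocks re-firing
+ //     callbacks.add( function( msg ) { console.log( msg, "(replayed)" ); } );
+ //                                              // "memory" replays "ready" to late adds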
+ createOptions( options ) : + jQuery.extend( {}, options ); + + var // Flag to know if list is currently firing + firing, + + // Last fire value for non-forgettable lists + memory, + + // Flag to know if list was already fired + fired, + + // Flag to prevent firing + locked, + + // Actual callback list + list = [], + + // Queue of execution data for repeatable lists + queue = [], + + // Index of currently firing callback (modified by add/remove as needed) + firingIndex = -1, + + // Fire callbacks + fire = function() { + + // Enforce single-firing + locked = locked || options.once; + + // Execute callbacks for all pending executions, + // respecting firingIndex overrides and runtime changes + fired = firing = true; + for ( ; queue.length; firingIndex = -1 ) { + memory = queue.shift(); + while ( ++firingIndex < list.length ) { + + // Run callback and check for early termination + if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && + options.stopOnFalse ) { + + // Jump to end and forget the data so .add doesn't re-fire + firingIndex = list.length; + memory = false; + } + } + } + + // Forget the data if we're done with it + if ( !options.memory ) { + memory = false; + } + + firing = false; + + // Clean up if we're done firing for good + if ( locked ) { + + // Keep an empty list if we have data for future add calls + if ( memory ) { + list = []; + + // Otherwise, this object is spent + } else { + list = ""; + } + } + }, + + // Actual Callbacks object + self = { + + // Add a callback or a collection of callbacks to the list + add: function() { + if ( list ) { + + // If we have memory from a past run, we should fire after adding + if ( memory && !firing ) { + firingIndex = list.length - 1; + queue.push( memory ); + } + + ( function add( args ) { + jQuery.each( args, function( _, arg ) { + if ( isFunction( arg ) ) { + if ( !options.unique || !self.has( arg ) ) { + list.push( arg ); + } + } else if ( arg && arg.length && toType( arg ) !== "string" ) { + + // Inspect recursively + add( arg ); + } + } ); + } )( arguments ); + + if ( memory && !firing ) { + fire(); + } + } + return this; + }, + + // Remove a callback from the list + remove: function() { + jQuery.each( arguments, function( _, arg ) { + var index; + while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { + list.splice( index, 1 ); + + // Handle firing indexes + if ( index <= firingIndex ) { + firingIndex--; + } + } + } ); + return this; + }, + + // Check if a given callback is in the list. + // If no argument is given, return whether or not list has callbacks attached. + has: function( fn ) { + return fn ? + jQuery.inArray( fn, list ) > -1 : + list.length > 0; + }, + + // Remove all callbacks from the list + empty: function() { + if ( list ) { + list = []; + } + return this; + }, + + // Disable .fire and .add + // Abort any current/pending executions + // Clear all callbacks and values + disable: function() { + locked = queue = []; + list = memory = ""; + return this; + }, + disabled: function() { + return !list; + }, + + // Disable .fire + // Also disable .add unless we have memory (since it would have no effect) + // Abort any pending executions + lock: function() { + locked = queue = []; + if ( !memory && !firing ) { + list = memory = ""; + } + return this; + }, + locked: function() { + return !!locked; + }, + + // Call all callbacks with the given context and arguments + fireWith: function( context, args ) { + if ( !locked ) { + args = args || []; + args = [ context, args.slice ? 
args.slice() : args ]; + queue.push( args ); + if ( !firing ) { + fire(); + } + } + return this; + }, + + // Call all the callbacks with the given arguments + fire: function() { + self.fireWith( this, arguments ); + return this; + }, + + // To know if the callbacks have already been called at least once + fired: function() { + return !!fired; + } + }; + + return self; +}; + + +function Identity( v ) { + return v; +} +function Thrower( ex ) { + throw ex; +} + +function adoptValue( value, resolve, reject, noValue ) { + var method; + + try { + + // Check for promise aspect first to privilege synchronous behavior + if ( value && isFunction( ( method = value.promise ) ) ) { + method.call( value ).done( resolve ).fail( reject ); + + // Other thenables + } else if ( value && isFunction( ( method = value.then ) ) ) { + method.call( value, resolve, reject ); + + // Other non-thenables + } else { + + // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: + // * false: [ value ].slice( 0 ) => resolve( value ) + // * true: [ value ].slice( 1 ) => resolve() + resolve.apply( undefined, [ value ].slice( noValue ) ); + } + + // For Promises/A+, convert exceptions into rejections + // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in + // Deferred#then to conditionally suppress rejection. + } catch ( value ) { + + // Support: Android 4.0 only + // Strict mode functions invoked without .call/.apply get global-object context + reject.apply( undefined, [ value ] ); + } +} + +jQuery.extend( { + + Deferred: function( func ) { + var tuples = [ + + // action, add listener, callbacks, + // ... .then handlers, argument index, [final state] + [ "notify", "progress", jQuery.Callbacks( "memory" ), + jQuery.Callbacks( "memory" ), 2 ], + [ "resolve", "done", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 0, "resolved" ], + [ "reject", "fail", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 1, "rejected" ] + ], + state = "pending", + promise = { + state: function() { + return state; + }, + always: function() { + deferred.done( arguments ).fail( arguments ); + return this; + }, + "catch": function( fn ) { + return promise.then( null, fn ); + }, + + // Keep pipe for back-compat + pipe: function( /* fnDone, fnFail, fnProgress */ ) { + var fns = arguments; + + return jQuery.Deferred( function( newDefer ) { + jQuery.each( tuples, function( _i, tuple ) { + + // Map tuples (progress, done, fail) to arguments (done, fail, progress) + var fn = isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; + + // deferred.progress(function() { bind to newDefer or newDefer.notify }) + // deferred.done(function() { bind to newDefer or newDefer.resolve }) + // deferred.fail(function() { bind to newDefer or newDefer.reject }) + deferred[ tuple[ 1 ] ]( function() { + var returned = fn && fn.apply( this, arguments ); + if ( returned && isFunction( returned.promise ) ) { + returned.promise() + .progress( newDefer.notify ) + .done( newDefer.resolve ) + .fail( newDefer.reject ); + } else { + newDefer[ tuple[ 0 ] + "With" ]( + this, + fn ? 
[ returned ] : arguments + ); + } + } ); + } ); + fns = null; + } ).promise(); + }, + then: function( onFulfilled, onRejected, onProgress ) { + var maxDepth = 0; + function resolve( depth, deferred, handler, special ) { + return function() { + var that = this, + args = arguments, + mightThrow = function() { + var returned, then; + + // Support: Promises/A+ section 2.3.3.3.3 + // https://promisesaplus.com/#point-59 + // Ignore double-resolution attempts + if ( depth < maxDepth ) { + return; + } + + returned = handler.apply( that, args ); + + // Support: Promises/A+ section 2.3.1 + // https://promisesaplus.com/#point-48 + if ( returned === deferred.promise() ) { + throw new TypeError( "Thenable self-resolution" ); + } + + // Support: Promises/A+ sections 2.3.3.1, 3.5 + // https://promisesaplus.com/#point-54 + // https://promisesaplus.com/#point-75 + // Retrieve `then` only once + then = returned && + + // Support: Promises/A+ section 2.3.4 + // https://promisesaplus.com/#point-64 + // Only check objects and functions for thenability + ( typeof returned === "object" || + typeof returned === "function" ) && + returned.then; + + // Handle a returned thenable + if ( isFunction( then ) ) { + + // Special processors (notify) just wait for resolution + if ( special ) { + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ) + ); + + // Normal processors (resolve) also hook into progress + } else { + + // ...and disregard older resolution values + maxDepth++; + + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ), + resolve( maxDepth, deferred, Identity, + deferred.notifyWith ) + ); + } + + // Handle all other returned values + } else { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Identity ) { + that = undefined; + args = [ returned ]; + } + + // Process the value(s) + // Default process is resolve + ( special || deferred.resolveWith )( that, args ); + } + }, + + // Only normal processors (resolve) catch and reject exceptions + process = special ? + mightThrow : + function() { + try { + mightThrow(); + } catch ( e ) { + + if ( jQuery.Deferred.exceptionHook ) { + jQuery.Deferred.exceptionHook( e, + process.stackTrace ); + } + + // Support: Promises/A+ section 2.3.3.3.4.1 + // https://promisesaplus.com/#point-61 + // Ignore post-resolution exceptions + if ( depth + 1 >= maxDepth ) { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Thrower ) { + that = undefined; + args = [ e ]; + } + + deferred.rejectWith( that, args ); + } + } + }; + + // Support: Promises/A+ section 2.3.3.3.1 + // https://promisesaplus.com/#point-57 + // Re-resolve promises immediately to dodge false rejection from + // subsequent errors + if ( depth ) { + process(); + } else { + + // Call an optional hook to record the stack, in case of exception + // since it's otherwise lost when execution goes async + if ( jQuery.Deferred.getStackHook ) { + process.stackTrace = jQuery.Deferred.getStackHook(); + } + window.setTimeout( process ); + } + }; + } + + return jQuery.Deferred( function( newDefer ) { + + // progress_handlers.add( ... ) + tuples[ 0 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onProgress ) ? + onProgress : + Identity, + newDefer.notifyWith + ) + ); + + // fulfilled_handlers.add( ... 
) + tuples[ 1 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onFulfilled ) ? + onFulfilled : + Identity + ) + ); + + // rejected_handlers.add( ... ) + tuples[ 2 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onRejected ) ? + onRejected : + Thrower + ) + ); + } ).promise(); + }, + + // Get a promise for this deferred + // If obj is provided, the promise aspect is added to the object + promise: function( obj ) { + return obj != null ? jQuery.extend( obj, promise ) : promise; + } + }, + deferred = {}; + + // Add list-specific methods + jQuery.each( tuples, function( i, tuple ) { + var list = tuple[ 2 ], + stateString = tuple[ 5 ]; + + // promise.progress = list.add + // promise.done = list.add + // promise.fail = list.add + promise[ tuple[ 1 ] ] = list.add; + + // Handle state + if ( stateString ) { + list.add( + function() { + + // state = "resolved" (i.e., fulfilled) + // state = "rejected" + state = stateString; + }, + + // rejected_callbacks.disable + // fulfilled_callbacks.disable + tuples[ 3 - i ][ 2 ].disable, + + // rejected_handlers.disable + // fulfilled_handlers.disable + tuples[ 3 - i ][ 3 ].disable, + + // progress_callbacks.lock + tuples[ 0 ][ 2 ].lock, + + // progress_handlers.lock + tuples[ 0 ][ 3 ].lock + ); + } + + // progress_handlers.fire + // fulfilled_handlers.fire + // rejected_handlers.fire + list.add( tuple[ 3 ].fire ); + + // deferred.notify = function() { deferred.notifyWith(...) } + // deferred.resolve = function() { deferred.resolveWith(...) } + // deferred.reject = function() { deferred.rejectWith(...) } + deferred[ tuple[ 0 ] ] = function() { + deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); + return this; + }; + + // deferred.notifyWith = list.fireWith + // deferred.resolveWith = list.fireWith + // deferred.rejectWith = list.fireWith + deferred[ tuple[ 0 ] + "With" ] = list.fireWith; + } ); + + // Make the deferred a promise + promise.promise( deferred ); + + // Call given func if any + if ( func ) { + func.call( deferred, deferred ); + } + + // All done! + return deferred; + }, + + // Deferred helper + when: function( singleValue ) { + var + + // count of uncompleted subordinates + remaining = arguments.length, + + // count of unprocessed arguments + i = remaining, + + // subordinate fulfillment data + resolveContexts = Array( i ), + resolveValues = slice.call( arguments ), + + // the primary Deferred + primary = jQuery.Deferred(), + + // subordinate callback factory + updateFunc = function( i ) { + return function( value ) { + resolveContexts[ i ] = this; + resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; + if ( !( --remaining ) ) { + primary.resolveWith( resolveContexts, resolveValues ); + } + }; + }; + + // Single- and empty arguments are adopted like Promise.resolve + if ( remaining <= 1 ) { + adoptValue( singleValue, primary.done( updateFunc( i ) ).resolve, primary.reject, + !remaining ); + + // Use .then() to unwrap secondary thenables (cf. gh-3000) + if ( primary.state() === "pending" || + isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { + + return primary.then(); + } + } + + // Multiple arguments are aggregated like Promise.all array elements + while ( i-- ) { + adoptValue( resolveValues[ i ], updateFunc( i ), primary.reject ); + } + + return primary.promise(); + } +} ); + + +// These usually indicate a programmer mistake during development, +// warn about them ASAP rather than swallowing them by default. 
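+ // Hypothetical usage of the Deferred/when API assembled above (variable names
+ // are illustrative only):
+ //     var a = jQuery.Deferred(), b = jQuery.Deferred();
+ //     jQuery.when( a, b ).done( function( va, vb ) { console.log( va, vb ); } );
+ //     a.resolve( 1 );
+ //     b.resolve( 2 );                          // logs: 1 2
+ //     a.state();                               // "resolved"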
+var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; + +jQuery.Deferred.exceptionHook = function( error, stack ) { + + // Support: IE 8 - 9 only + // Console exists when dev tools are open, which can happen at any time + if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { + window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); + } +}; + + + + +jQuery.readyException = function( error ) { + window.setTimeout( function() { + throw error; + } ); +}; + + + + +// The deferred used on DOM ready +var readyList = jQuery.Deferred(); + +jQuery.fn.ready = function( fn ) { + + readyList + .then( fn ) + + // Wrap jQuery.readyException in a function so that the lookup + // happens at the time of error handling instead of callback + // registration. + .catch( function( error ) { + jQuery.readyException( error ); + } ); + + return this; +}; + +jQuery.extend( { + + // Is the DOM ready to be used? Set to true once it occurs. + isReady: false, + + // A counter to track how many items to wait for before + // the ready event fires. See #6781 + readyWait: 1, + + // Handle when the DOM is ready + ready: function( wait ) { + + // Abort if there are pending holds or we're already ready + if ( wait === true ? --jQuery.readyWait : jQuery.isReady ) { + return; + } + + // Remember that the DOM is ready + jQuery.isReady = true; + + // If a normal DOM Ready event fired, decrement, and wait if need be + if ( wait !== true && --jQuery.readyWait > 0 ) { + return; + } + + // If there are functions bound, to execute + readyList.resolveWith( document, [ jQuery ] ); + } +} ); + +jQuery.ready.then = readyList.then; + +// The ready event handler and self cleanup method +function completed() { + document.removeEventListener( "DOMContentLoaded", completed ); + window.removeEventListener( "load", completed ); + jQuery.ready(); +} + +// Catch cases where $(document).ready() is called +// after the browser event has already occurred. +// Support: IE <=9 - 10 only +// Older IE sometimes signals "interactive" too soon +if ( document.readyState === "complete" || + ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { + + // Handle it asynchronously to allow scripts the opportunity to delay ready + window.setTimeout( jQuery.ready ); + +} else { + + // Use the handy event callback + document.addEventListener( "DOMContentLoaded", completed ); + + // A fallback to window.onload, that will always work + window.addEventListener( "load", completed ); +} + + + + +// Multifunctional method to get and set values of a collection +// The value/s can optionally be executed if it's a function +var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { + var i = 0, + len = elems.length, + bulk = key == null; + + // Sets many values + if ( toType( key ) === "object" ) { + chainable = true; + for ( i in key ) { + access( elems, fn, i, key[ i ], true, emptyGet, raw ); + } + + // Sets one value + } else if ( value !== undefined ) { + chainable = true; + + if ( !isFunction( value ) ) { + raw = true; + } + + if ( bulk ) { + + // Bulk operations run against the entire set + if ( raw ) { + fn.call( elems, value ); + fn = null; + + // ...except when executing function values + } else { + bulk = fn; + fn = function( elem, _key, value ) { + return bulk.call( jQuery( elem ), value ); + }; + } + } + + if ( fn ) { + for ( ; i < len; i++ ) { + fn( + elems[ i ], key, raw ? 
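+ // Sketch (illustrative): jQuery( function() { ... } ) and jQuery( document ).ready( fn )
+ // both funnel through readyList above; handlers added after the DOM is already
+ // ready still run, asynchronously, because readyList is a resolved Deferred.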
+ value : + value.call( elems[ i ], i, fn( elems[ i ], key ) ) + ); + } + } + } + + if ( chainable ) { + return elems; + } + + // Gets + if ( bulk ) { + return fn.call( elems ); + } + + return len ? fn( elems[ 0 ], key ) : emptyGet; +}; + + +// Matches dashed string for camelizing +var rmsPrefix = /^-ms-/, + rdashAlpha = /-([a-z])/g; + +// Used by camelCase as callback to replace() +function fcamelCase( _all, letter ) { + return letter.toUpperCase(); +} + +// Convert dashed to camelCase; used by the css and data modules +// Support: IE <=9 - 11, Edge 12 - 15 +// Microsoft forgot to hump their vendor prefix (#9572) +function camelCase( string ) { + return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); +} +var acceptData = function( owner ) { + + // Accepts only: + // - Node + // - Node.ELEMENT_NODE + // - Node.DOCUMENT_NODE + // - Object + // - Any + return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); +}; + + + + +function Data() { + this.expando = jQuery.expando + Data.uid++; +} + +Data.uid = 1; + +Data.prototype = { + + cache: function( owner ) { + + // Check if the owner object already has a cache + var value = owner[ this.expando ]; + + // If not, create one + if ( !value ) { + value = {}; + + // We can accept data for non-element nodes in modern browsers, + // but we should not, see #8335. + // Always return an empty object. + if ( acceptData( owner ) ) { + + // If it is a node unlikely to be stringify-ed or looped over + // use plain assignment + if ( owner.nodeType ) { + owner[ this.expando ] = value; + + // Otherwise secure it in a non-enumerable property + // configurable must be true to allow the property to be + // deleted when data is removed + } else { + Object.defineProperty( owner, this.expando, { + value: value, + configurable: true + } ); + } + } + } + + return value; + }, + set: function( owner, data, value ) { + var prop, + cache = this.cache( owner ); + + // Handle: [ owner, key, value ] args + // Always use camelCase key (gh-2257) + if ( typeof data === "string" ) { + cache[ camelCase( data ) ] = value; + + // Handle: [ owner, { properties } ] args + } else { + + // Copy the properties one-by-one to the cache object + for ( prop in data ) { + cache[ camelCase( prop ) ] = data[ prop ]; + } + } + return cache; + }, + get: function( owner, key ) { + return key === undefined ? + this.cache( owner ) : + + // Always use camelCase key (gh-2257) + owner[ this.expando ] && owner[ this.expando ][ camelCase( key ) ]; + }, + access: function( owner, key, value ) { + + // In cases where either: + // + // 1. No key was specified + // 2. A string key was specified, but no value provided + // + // Take the "read" path and allow the get method to determine + // which value to return, respectively either: + // + // 1. The entire cache object + // 2. The data stored at the key + // + if ( key === undefined || + ( ( key && typeof key === "string" ) && value === undefined ) ) { + + return this.get( owner, key ); + } + + // When the key is not a string, or both a key and value + // are specified, set or extend (existing objects) with either: + // + // 1. An object of properties + // 2. A key and value + // + this.set( owner, key, value ); + + // Since the "set" path can have two possible entry points + // return the expected data based on which path was taken[*] + return value !== undefined ? 
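+ // Concretely (assuming `cacheInstance` is an instance of the Data class above
+ // and `el` is some owner node; both names are placeholders):
+ //     cacheInstance.access( el, "foo" );       // read path: returns the stored value
+ //     cacheInstance.access( el, "foo", 42 );   // set path: stores and returns 42
+ //     cacheInstance.access( el, { a: 1 } );    // set path: copies props, returns { a: 1 }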
value : key; + }, + remove: function( owner, key ) { + var i, + cache = owner[ this.expando ]; + + if ( cache === undefined ) { + return; + } + + if ( key !== undefined ) { + + // Support array or space separated string of keys + if ( Array.isArray( key ) ) { + + // If key is an array of keys... + // We always set camelCase keys, so remove that. + key = key.map( camelCase ); + } else { + key = camelCase( key ); + + // If a key with the spaces exists, use it. + // Otherwise, create an array by matching non-whitespace + key = key in cache ? + [ key ] : + ( key.match( rnothtmlwhite ) || [] ); + } + + i = key.length; + + while ( i-- ) { + delete cache[ key[ i ] ]; + } + } + + // Remove the expando if there's no more data + if ( key === undefined || jQuery.isEmptyObject( cache ) ) { + + // Support: Chrome <=35 - 45 + // Webkit & Blink performance suffers when deleting properties + // from DOM nodes, so set to undefined instead + // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) + if ( owner.nodeType ) { + owner[ this.expando ] = undefined; + } else { + delete owner[ this.expando ]; + } + } + }, + hasData: function( owner ) { + var cache = owner[ this.expando ]; + return cache !== undefined && !jQuery.isEmptyObject( cache ); + } +}; +var dataPriv = new Data(); + +var dataUser = new Data(); + + + +// Implementation Summary +// +// 1. Enforce API surface and semantic compatibility with 1.9.x branch +// 2. Improve the module's maintainability by reducing the storage +// paths to a single mechanism. +// 3. Use the same single mechanism to support "private" and "user" data. +// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) +// 5. Avoid exposing implementation details on user objects (eg. expando properties) +// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 + +var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, + rmultiDash = /[A-Z]/g; + +function getData( data ) { + if ( data === "true" ) { + return true; + } + + if ( data === "false" ) { + return false; + } + + if ( data === "null" ) { + return null; + } + + // Only convert to a number if it doesn't change the string + if ( data === +data + "" ) { + return +data; + } + + if ( rbrace.test( data ) ) { + return JSON.parse( data ); + } + + return data; +} + +function dataAttr( elem, key, data ) { + var name; + + // If nothing was found internally, try to fetch any + // data from the HTML5 data-* attribute + if ( data === undefined && elem.nodeType === 1 ) { + name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); + data = elem.getAttribute( name ); + + if ( typeof data === "string" ) { + try { + data = getData( data ); + } catch ( e ) {} + + // Make sure we set the data so it isn't changed later + dataUser.set( elem, key, data ); + } else { + data = undefined; + } + } + return data; +} + +jQuery.extend( { + hasData: function( elem ) { + return dataUser.hasData( elem ) || dataPriv.hasData( elem ); + }, + + data: function( elem, name, data ) { + return dataUser.access( elem, name, data ); + }, + + removeData: function( elem, name ) { + dataUser.remove( elem, name ); + }, + + // TODO: Now that all calls to _data and _removeData have been replaced + // with direct calls to dataPriv methods, these can be deprecated. 
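+ // User-facing behavior sketch (hypothetical markup: a node carrying
+ // data-count="3" and data-opts='{"x":1}'):
+ //     jQuery( node ).data( "count" );    // 3, a number: getData() converts numeric strings
+ //     jQuery( node ).data( "opts" );     // { x: 1 }: rbrace match, then JSON.parse
+ //     jQuery( node ).data( "count", 9 ); // stored via dataUser; the attribute is untouched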
+ _data: function( elem, name, data ) { + return dataPriv.access( elem, name, data ); + }, + + _removeData: function( elem, name ) { + dataPriv.remove( elem, name ); + } +} ); + +jQuery.fn.extend( { + data: function( key, value ) { + var i, name, data, + elem = this[ 0 ], + attrs = elem && elem.attributes; + + // Gets all values + if ( key === undefined ) { + if ( this.length ) { + data = dataUser.get( elem ); + + if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { + i = attrs.length; + while ( i-- ) { + + // Support: IE 11 only + // The attrs elements can be null (#14894) + if ( attrs[ i ] ) { + name = attrs[ i ].name; + if ( name.indexOf( "data-" ) === 0 ) { + name = camelCase( name.slice( 5 ) ); + dataAttr( elem, name, data[ name ] ); + } + } + } + dataPriv.set( elem, "hasDataAttrs", true ); + } + } + + return data; + } + + // Sets multiple values + if ( typeof key === "object" ) { + return this.each( function() { + dataUser.set( this, key ); + } ); + } + + return access( this, function( value ) { + var data; + + // The calling jQuery object (element matches) is not empty + // (and therefore has an element appears at this[ 0 ]) and the + // `value` parameter was not undefined. An empty jQuery object + // will result in `undefined` for elem = this[ 0 ] which will + // throw an exception if an attempt to read a data cache is made. + if ( elem && value === undefined ) { + + // Attempt to get data from the cache + // The key will always be camelCased in Data + data = dataUser.get( elem, key ); + if ( data !== undefined ) { + return data; + } + + // Attempt to "discover" the data in + // HTML5 custom data-* attrs + data = dataAttr( elem, key ); + if ( data !== undefined ) { + return data; + } + + // We tried really hard, but the data doesn't exist. + return; + } + + // Set the data... 
+ this.each( function() { + + // We always store the camelCased key + dataUser.set( this, key, value ); + } ); + }, null, value, arguments.length > 1, null, true ); + }, + + removeData: function( key ) { + return this.each( function() { + dataUser.remove( this, key ); + } ); + } +} ); + + +jQuery.extend( { + queue: function( elem, type, data ) { + var queue; + + if ( elem ) { + type = ( type || "fx" ) + "queue"; + queue = dataPriv.get( elem, type ); + + // Speed up dequeue by getting out quickly if this is just a lookup + if ( data ) { + if ( !queue || Array.isArray( data ) ) { + queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); + } else { + queue.push( data ); + } + } + return queue || []; + } + }, + + dequeue: function( elem, type ) { + type = type || "fx"; + + var queue = jQuery.queue( elem, type ), + startLength = queue.length, + fn = queue.shift(), + hooks = jQuery._queueHooks( elem, type ), + next = function() { + jQuery.dequeue( elem, type ); + }; + + // If the fx queue is dequeued, always remove the progress sentinel + if ( fn === "inprogress" ) { + fn = queue.shift(); + startLength--; + } + + if ( fn ) { + + // Add a progress sentinel to prevent the fx queue from being + // automatically dequeued + if ( type === "fx" ) { + queue.unshift( "inprogress" ); + } + + // Clear up the last queue stop function + delete hooks.stop; + fn.call( elem, next, hooks ); + } + + if ( !startLength && hooks ) { + hooks.empty.fire(); + } + }, + + // Not public - generate a queueHooks object, or return the current one + _queueHooks: function( elem, type ) { + var key = type + "queueHooks"; + return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { + empty: jQuery.Callbacks( "once memory" ).add( function() { + dataPriv.remove( elem, [ type + "queue", key ] ); + } ) + } ); + } +} ); + +jQuery.fn.extend( { + queue: function( type, data ) { + var setter = 2; + + if ( typeof type !== "string" ) { + data = type; + type = "fx"; + setter--; + } + + if ( arguments.length < setter ) { + return jQuery.queue( this[ 0 ], type ); + } + + return data === undefined ? 
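+ // Queueing sketch with a custom (non-"fx") queue, so nothing auto-dequeues
+ // (node and queue name are illustrative):
+ //     jQuery( node ).queue( "steps", function( next ) { console.log( "one" ); next(); } )
+ //                   .queue( "steps", function( next ) { console.log( "two" ); next(); } )
+ //                   .dequeue( "steps" );  // logs "one" then "two"; each next() advances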
+ this : + this.each( function() { + var queue = jQuery.queue( this, type, data ); + + // Ensure a hooks for this queue + jQuery._queueHooks( this, type ); + + if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { + jQuery.dequeue( this, type ); + } + } ); + }, + dequeue: function( type ) { + return this.each( function() { + jQuery.dequeue( this, type ); + } ); + }, + clearQueue: function( type ) { + return this.queue( type || "fx", [] ); + }, + + // Get a promise resolved when queues of a certain type + // are emptied (fx is the type by default) + promise: function( type, obj ) { + var tmp, + count = 1, + defer = jQuery.Deferred(), + elements = this, + i = this.length, + resolve = function() { + if ( !( --count ) ) { + defer.resolveWith( elements, [ elements ] ); + } + }; + + if ( typeof type !== "string" ) { + obj = type; + type = undefined; + } + type = type || "fx"; + + while ( i-- ) { + tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); + if ( tmp && tmp.empty ) { + count++; + tmp.empty.add( resolve ); + } + } + resolve(); + return defer.promise( obj ); + } +} ); +var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; + +var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); + + +var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; + +var documentElement = document.documentElement; + + + + var isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ); + }, + composed = { composed: true }; + + // Support: IE 9 - 11+, Edge 12 - 18+, iOS 10.0 - 10.2 only + // Check attachment across shadow DOM boundaries when possible (gh-3504) + // Support: iOS 10.0-10.2 only + // Early iOS 10 versions support `attachShadow` but not `getRootNode`, + // leading to errors. We need to check for `getRootNode`. + if ( documentElement.getRootNode ) { + isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ) || + elem.getRootNode( composed ) === elem.ownerDocument; + }; + } +var isHiddenWithinTree = function( elem, el ) { + + // isHiddenWithinTree might be called from jQuery#filter function; + // in that case, element will be second argument + elem = el || elem; + + // Inline style trumps all + return elem.style.display === "none" || + elem.style.display === "" && + + // Otherwise, check computed style + // Support: Firefox <=43 - 45 + // Disconnected elements can have computed display: none, so first confirm that elem is + // in the document. + isAttached( elem ) && + + jQuery.css( elem, "display" ) === "none"; + }; + + + +function adjustCSS( elem, prop, valueParts, tween ) { + var adjusted, scale, + maxIterations = 20, + currentValue = tween ? + function() { + return tween.cur(); + } : + function() { + return jQuery.css( elem, prop, "" ); + }, + initial = currentValue(), + unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? 
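+ // For reference (illustrative): rcssNum splits relative CSS values, e.g.
+ // rcssNum.exec( "+=10.5em" ) yields [ "+=10.5em", "+", "10.5", "em" ], which is
+ // how adjustCSS applies "+=" / "-=" deltas for calls like
+ // jQuery( node ).css( "width", "+=10" ).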
"" : "px" ), + + // Starting value computation is required for potential unit mismatches + initialInUnit = elem.nodeType && + ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && + rcssNum.exec( jQuery.css( elem, prop ) ); + + if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { + + // Support: Firefox <=54 + // Halve the iteration target value to prevent interference from CSS upper bounds (gh-2144) + initial = initial / 2; + + // Trust units reported by jQuery.css + unit = unit || initialInUnit[ 3 ]; + + // Iteratively approximate from a nonzero starting point + initialInUnit = +initial || 1; + + while ( maxIterations-- ) { + + // Evaluate and update our best guess (doubling guesses that zero out). + // Finish if the scale equals or crosses 1 (making the old*new product non-positive). + jQuery.style( elem, prop, initialInUnit + unit ); + if ( ( 1 - scale ) * ( 1 - ( scale = currentValue() / initial || 0.5 ) ) <= 0 ) { + maxIterations = 0; + } + initialInUnit = initialInUnit / scale; + + } + + initialInUnit = initialInUnit * 2; + jQuery.style( elem, prop, initialInUnit + unit ); + + // Make sure we update the tween properties later on + valueParts = valueParts || []; + } + + if ( valueParts ) { + initialInUnit = +initialInUnit || +initial || 0; + + // Apply relative offset (+=/-=) if specified + adjusted = valueParts[ 1 ] ? + initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : + +valueParts[ 2 ]; + if ( tween ) { + tween.unit = unit; + tween.start = initialInUnit; + tween.end = adjusted; + } + } + return adjusted; +} + + +var defaultDisplayMap = {}; + +function getDefaultDisplay( elem ) { + var temp, + doc = elem.ownerDocument, + nodeName = elem.nodeName, + display = defaultDisplayMap[ nodeName ]; + + if ( display ) { + return display; + } + + temp = doc.body.appendChild( doc.createElement( nodeName ) ); + display = jQuery.css( temp, "display" ); + + temp.parentNode.removeChild( temp ); + + if ( display === "none" ) { + display = "block"; + } + defaultDisplayMap[ nodeName ] = display; + + return display; +} + +function showHide( elements, show ) { + var display, elem, + values = [], + index = 0, + length = elements.length; + + // Determine new display value for elements that need to change + for ( ; index < length; index++ ) { + elem = elements[ index ]; + if ( !elem.style ) { + continue; + } + + display = elem.style.display; + if ( show ) { + + // Since we force visibility upon cascade-hidden elements, an immediate (and slow) + // check is required in this first loop unless we have a nonempty display value (either + // inline or about-to-be-restored) + if ( display === "none" ) { + values[ index ] = dataPriv.get( elem, "display" ) || null; + if ( !values[ index ] ) { + elem.style.display = ""; + } + } + if ( elem.style.display === "" && isHiddenWithinTree( elem ) ) { + values[ index ] = getDefaultDisplay( elem ); + } + } else { + if ( display !== "none" ) { + values[ index ] = "none"; + + // Remember what we're overwriting + dataPriv.set( elem, "display", display ); + } + } + } + + // Set the display of the elements in a second loop to avoid constant reflow + for ( index = 0; index < length; index++ ) { + if ( values[ index ] != null ) { + elements[ index ].style.display = values[ index ]; + } + } + + return elements; +} + +jQuery.fn.extend( { + show: function() { + return showHide( this, true ); + }, + hide: function() { + return showHide( this ); + }, + toggle: function( state ) { + if ( typeof state === "boolean" ) { + return state ? 
this.show() : this.hide(); + } + + return this.each( function() { + if ( isHiddenWithinTree( this ) ) { + jQuery( this ).show(); + } else { + jQuery( this ).hide(); + } + } ); + } +} ); +var rcheckableType = ( /^(?:checkbox|radio)$/i ); + +var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]*)/i ); + +var rscriptType = ( /^$|^module$|\/(?:java|ecma)script/i ); + + + +( function() { + var fragment = document.createDocumentFragment(), + div = fragment.appendChild( document.createElement( "div" ) ), + input = document.createElement( "input" ); + + // Support: Android 4.0 - 4.3 only + // Check state lost if the name is set (#11217) + // Support: Windows Web Apps (WWA) + // `name` and `type` must use .setAttribute for WWA (#14901) + input.setAttribute( "type", "radio" ); + input.setAttribute( "checked", "checked" ); + input.setAttribute( "name", "t" ); + + div.appendChild( input ); + + // Support: Android <=4.1 only + // Older WebKit doesn't clone checked state correctly in fragments + support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; + + // Support: IE <=11 only + // Make sure textarea (and checkbox) defaultValue is properly cloned + div.innerHTML = "<textarea>x</textarea>"; + support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; + + // Support: IE <=9 only + // IE <=9 replaces <option> tags with their contents when inserted outside of + // the select element. + div.innerHTML = "<option></option>"; + support.option = !!div.lastChild; +} )(); + + +// We have to close these tags to support XHTML (#13200) +var wrapMap = { + + // XHTML parsers do not magically insert elements in the + // same way that tag soup parsers do. So we cannot shorten + // this by omitting <tbody> or other required elements. + thead: [ 1, "<table>", "</table>" ], + col: [ 2, "<table><colgroup>", "</colgroup></table>" ], + tr: [ 2, "<table><tbody>", "</tbody></table>" ], + td: [ 3, "<table><tbody><tr>", "</tr></tbody></table>
" ], + + _default: [ 0, "", "" ] +}; + +wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; +wrapMap.th = wrapMap.td; + +// Support: IE <=9 only +if ( !support.option ) { + wrapMap.optgroup = wrapMap.option = [ 1, "" ]; +} + + +function getAll( context, tag ) { + + // Support: IE <=9 - 11 only + // Use typeof to avoid zero-argument method invocation on host objects (#15151) + var ret; + + if ( typeof context.getElementsByTagName !== "undefined" ) { + ret = context.getElementsByTagName( tag || "*" ); + + } else if ( typeof context.querySelectorAll !== "undefined" ) { + ret = context.querySelectorAll( tag || "*" ); + + } else { + ret = []; + } + + if ( tag === undefined || tag && nodeName( context, tag ) ) { + return jQuery.merge( [ context ], ret ); + } + + return ret; +} + + +// Mark scripts as having already been evaluated +function setGlobalEval( elems, refElements ) { + var i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + dataPriv.set( + elems[ i ], + "globalEval", + !refElements || dataPriv.get( refElements[ i ], "globalEval" ) + ); + } +} + + +var rhtml = /<|&#?\w+;/; + +function buildFragment( elems, context, scripts, selection, ignored ) { + var elem, tmp, tag, wrap, attached, j, + fragment = context.createDocumentFragment(), + nodes = [], + i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + elem = elems[ i ]; + + if ( elem || elem === 0 ) { + + // Add nodes directly + if ( toType( elem ) === "object" ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, elem.nodeType ? [ elem ] : elem ); + + // Convert non-html into a text node + } else if ( !rhtml.test( elem ) ) { + nodes.push( context.createTextNode( elem ) ); + + // Convert html into DOM nodes + } else { + tmp = tmp || fragment.appendChild( context.createElement( "div" ) ); + + // Deserialize a standard representation + tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase(); + wrap = wrapMap[ tag ] || wrapMap._default; + tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ]; + + // Descend through wrappers to the right content + j = wrap[ 0 ]; + while ( j-- ) { + tmp = tmp.lastChild; + } + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, tmp.childNodes ); + + // Remember the top-level container + tmp = fragment.firstChild; + + // Ensure the created nodes are orphaned (#12392) + tmp.textContent = ""; + } + } + } + + // Remove wrapper from fragment + fragment.textContent = ""; + + i = 0; + while ( ( elem = nodes[ i++ ] ) ) { + + // Skip elements already in the context collection (trac-4087) + if ( selection && jQuery.inArray( elem, selection ) > -1 ) { + if ( ignored ) { + ignored.push( elem ); + } + continue; + } + + attached = isAttached( elem ); + + // Append to fragment + tmp = getAll( fragment.appendChild( elem ), "script" ); + + // Preserve script evaluation history + if ( attached ) { + setGlobalEval( tmp ); + } + + // Capture executables + if ( scripts ) { + j = 0; + while ( ( elem = tmp[ j++ ] ) ) { + if ( rscriptType.test( elem.type || "" ) ) { + scripts.push( elem ); + } + } + } + } + + return fragment; +} + + +var rtypenamespace = /^([^.]*)(?:\.(.+)|)/; + +function returnTrue() { + return true; +} + +function returnFalse() { + return false; +} + +// Support: IE <=9 - 11+ +// focus() and blur() are asynchronous, except when they are no-op. 
+// So expect focus to be synchronous when the element is already active, +// and blur to be synchronous when the element is not already active. +// (focus and blur are always synchronous in other supported browsers, +// this just defines when we can count on it). +function expectSync( elem, type ) { + return ( elem === safeActiveElement() ) === ( type === "focus" ); +} + +// Support: IE <=9 only +// Accessing document.activeElement can throw unexpectedly +// https://bugs.jquery.com/ticket/13393 +function safeActiveElement() { + try { + return document.activeElement; + } catch ( err ) { } +} + +function on( elem, types, selector, data, fn, one ) { + var origFn, type; + + // Types can be a map of types/handlers + if ( typeof types === "object" ) { + + // ( types-Object, selector, data ) + if ( typeof selector !== "string" ) { + + // ( types-Object, data ) + data = data || selector; + selector = undefined; + } + for ( type in types ) { + on( elem, type, selector, data, types[ type ], one ); + } + return elem; + } + + if ( data == null && fn == null ) { + + // ( types, fn ) + fn = selector; + data = selector = undefined; + } else if ( fn == null ) { + if ( typeof selector === "string" ) { + + // ( types, selector, fn ) + fn = data; + data = undefined; + } else { + + // ( types, data, fn ) + fn = data; + data = selector; + selector = undefined; + } + } + if ( fn === false ) { + fn = returnFalse; + } else if ( !fn ) { + return elem; + } + + if ( one === 1 ) { + origFn = fn; + fn = function( event ) { + + // Can use an empty set, since event contains the info + jQuery().off( event ); + return origFn.apply( this, arguments ); + }; + + // Use same guid so caller can remove using origFn + fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); + } + return elem.each( function() { + jQuery.event.add( this, types, fn, data, selector ); + } ); +} + +/* + * Helper functions for managing events -- not part of the public interface. + * Props to Dean Edwards' addEvent library for many of the ideas. + */ +jQuery.event = { + + global: {}, + + add: function( elem, types, handler, data, selector ) { + + var handleObjIn, eventHandle, tmp, + events, t, handleObj, + special, handlers, type, namespaces, origType, + elemData = dataPriv.get( elem ); + + // Only attach events to objects that accept data + if ( !acceptData( elem ) ) { + return; + } + + // Caller can pass in an object of custom data in lieu of the handler + if ( handler.handler ) { + handleObjIn = handler; + handler = handleObjIn.handler; + selector = handleObjIn.selector; + } + + // Ensure that invalid selectors throw exceptions at attach time + // Evaluate against documentElement in case elem is a non-element node (e.g., document) + if ( selector ) { + jQuery.find.matchesSelector( documentElement, selector ); + } + + // Make sure that the handler has a unique ID, used to find/remove it later + if ( !handler.guid ) { + handler.guid = jQuery.guid++; + } + + // Init the element's event structure and main handler, if this is the first + if ( !( events = elemData.events ) ) { + events = elemData.events = Object.create( null ); + } + if ( !( eventHandle = elemData.handle ) ) { + eventHandle = elemData.handle = function( e ) { + + // Discard the second event of a jQuery.event.trigger() and + // when an event is called after a page has unloaded + return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ? 
+ jQuery.event.dispatch.apply( elem, arguments ) : undefined; + }; + } + + // Handle multiple events separated by a space + types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; + t = types.length; + while ( t-- ) { + tmp = rtypenamespace.exec( types[ t ] ) || []; + type = origType = tmp[ 1 ]; + namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); + + // There *must* be a type, no attaching namespace-only handlers + if ( !type ) { + continue; + } + + // If event changes its type, use the special event handlers for the changed type + special = jQuery.event.special[ type ] || {}; + + // If selector defined, determine special event api type, otherwise given type + type = ( selector ? special.delegateType : special.bindType ) || type; + + // Update special based on newly reset type + special = jQuery.event.special[ type ] || {}; + + // handleObj is passed to all event handlers + handleObj = jQuery.extend( { + type: type, + origType: origType, + data: data, + handler: handler, + guid: handler.guid, + selector: selector, + needsContext: selector && jQuery.expr.match.needsContext.test( selector ), + namespace: namespaces.join( "." ) + }, handleObjIn ); + + // Init the event handler queue if we're the first + if ( !( handlers = events[ type ] ) ) { + handlers = events[ type ] = []; + handlers.delegateCount = 0; + + // Only use addEventListener if the special events handler returns false + if ( !special.setup || + special.setup.call( elem, data, namespaces, eventHandle ) === false ) { + + if ( elem.addEventListener ) { + elem.addEventListener( type, eventHandle ); + } + } + } + + if ( special.add ) { + special.add.call( elem, handleObj ); + + if ( !handleObj.handler.guid ) { + handleObj.handler.guid = handler.guid; + } + } + + // Add to the element's handler list, delegates in front + if ( selector ) { + handlers.splice( handlers.delegateCount++, 0, handleObj ); + } else { + handlers.push( handleObj ); + } + + // Keep track of which events have ever been used, for event optimization + jQuery.event.global[ type ] = true; + } + + }, + + // Detach an event or set of events from an element + remove: function( elem, types, handler, selector, mappedTypes ) { + + var j, origCount, tmp, + events, t, handleObj, + special, handlers, type, namespaces, origType, + elemData = dataPriv.hasData( elem ) && dataPriv.get( elem ); + + if ( !elemData || !( events = elemData.events ) ) { + return; + } + + // Once for each type.namespace in types; type may be omitted + types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; + t = types.length; + while ( t-- ) { + tmp = rtypenamespace.exec( types[ t ] ) || []; + type = origType = tmp[ 1 ]; + namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); + + // Unbind all events (on this namespace, if provided) for the element + if ( !type ) { + for ( type in events ) { + jQuery.event.remove( elem, type + types[ t ], handler, selector, true ); + } + continue; + } + + special = jQuery.event.special[ type ] || {}; + type = ( selector ? 
special.delegateType : special.bindType ) || type; + handlers = events[ type ] || []; + tmp = tmp[ 2 ] && + new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ); + + // Remove matching events + origCount = j = handlers.length; + while ( j-- ) { + handleObj = handlers[ j ]; + + if ( ( mappedTypes || origType === handleObj.origType ) && + ( !handler || handler.guid === handleObj.guid ) && + ( !tmp || tmp.test( handleObj.namespace ) ) && + ( !selector || selector === handleObj.selector || + selector === "**" && handleObj.selector ) ) { + handlers.splice( j, 1 ); + + if ( handleObj.selector ) { + handlers.delegateCount--; + } + if ( special.remove ) { + special.remove.call( elem, handleObj ); + } + } + } + + // Remove generic event handler if we removed something and no more handlers exist + // (avoids potential for endless recursion during removal of special event handlers) + if ( origCount && !handlers.length ) { + if ( !special.teardown || + special.teardown.call( elem, namespaces, elemData.handle ) === false ) { + + jQuery.removeEvent( elem, type, elemData.handle ); + } + + delete events[ type ]; + } + } + + // Remove data and the expando if it's no longer used + if ( jQuery.isEmptyObject( events ) ) { + dataPriv.remove( elem, "handle events" ); + } + }, + + dispatch: function( nativeEvent ) { + + var i, j, ret, matched, handleObj, handlerQueue, + args = new Array( arguments.length ), + + // Make a writable jQuery.Event from the native event object + event = jQuery.event.fix( nativeEvent ), + + handlers = ( + dataPriv.get( this, "events" ) || Object.create( null ) + )[ event.type ] || [], + special = jQuery.event.special[ event.type ] || {}; + + // Use the fix-ed jQuery.Event rather than the (read-only) native event + args[ 0 ] = event; + + for ( i = 1; i < arguments.length; i++ ) { + args[ i ] = arguments[ i ]; + } + + event.delegateTarget = this; + + // Call the preDispatch hook for the mapped type, and let it bail if desired + if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) { + return; + } + + // Determine handlers + handlerQueue = jQuery.event.handlers.call( this, event, handlers ); + + // Run delegates first; they may want to stop propagation beneath us + i = 0; + while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) { + event.currentTarget = matched.elem; + + j = 0; + while ( ( handleObj = matched.handlers[ j++ ] ) && + !event.isImmediatePropagationStopped() ) { + + // If the event is namespaced, then each handler is only invoked if it is + // specially universal or its namespaces are a superset of the event's. 
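+
+ // Illustrative sketch (not part of the jQuery source): how the namespace rule above
+ // plays out for a handler bound with a hypothetical namespace, assuming `elem` and `fn` exist.
+ //   jQuery( elem ).on( "click.menu", fn );
+ //   jQuery( elem ).trigger( "click" );            // fn runs (no event namespaces to match)
+ //   jQuery( elem ).trigger( "click.menu" );       // fn runs ("menu" is covered by the handler)
+ //   jQuery( elem ).trigger( "click.menu.extra" ); // fn skipped ("extra" is not in the handler's namespaces)
+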
+ if ( !event.rnamespace || handleObj.namespace === false || + event.rnamespace.test( handleObj.namespace ) ) { + + event.handleObj = handleObj; + event.data = handleObj.data; + + ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle || + handleObj.handler ).apply( matched.elem, args ); + + if ( ret !== undefined ) { + if ( ( event.result = ret ) === false ) { + event.preventDefault(); + event.stopPropagation(); + } + } + } + } + } + + // Call the postDispatch hook for the mapped type + if ( special.postDispatch ) { + special.postDispatch.call( this, event ); + } + + return event.result; + }, + + handlers: function( event, handlers ) { + var i, handleObj, sel, matchedHandlers, matchedSelectors, + handlerQueue = [], + delegateCount = handlers.delegateCount, + cur = event.target; + + // Find delegate handlers + if ( delegateCount && + + // Support: IE <=9 + // Black-hole SVG instance trees (trac-13180) + cur.nodeType && + + // Support: Firefox <=42 + // Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861) + // https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click + // Support: IE 11 only + // ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343) + !( event.type === "click" && event.button >= 1 ) ) { + + for ( ; cur !== this; cur = cur.parentNode || this ) { + + // Don't check non-elements (#13208) + // Don't process clicks on disabled elements (#6911, #8165, #11382, #11764) + if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) { + matchedHandlers = []; + matchedSelectors = {}; + for ( i = 0; i < delegateCount; i++ ) { + handleObj = handlers[ i ]; + + // Don't conflict with Object.prototype properties (#13203) + sel = handleObj.selector + " "; + + if ( matchedSelectors[ sel ] === undefined ) { + matchedSelectors[ sel ] = handleObj.needsContext ? + jQuery( sel, this ).index( cur ) > -1 : + jQuery.find( sel, this, null, [ cur ] ).length; + } + if ( matchedSelectors[ sel ] ) { + matchedHandlers.push( handleObj ); + } + } + if ( matchedHandlers.length ) { + handlerQueue.push( { elem: cur, handlers: matchedHandlers } ); + } + } + } + } + + // Add the remaining (directly-bound) handlers + cur = this; + if ( delegateCount < handlers.length ) { + handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } ); + } + + return handlerQueue; + }, + + addProp: function( name, hook ) { + Object.defineProperty( jQuery.Event.prototype, name, { + enumerable: true, + configurable: true, + + get: isFunction( hook ) ? + function() { + if ( this.originalEvent ) { + return hook( this.originalEvent ); + } + } : + function() { + if ( this.originalEvent ) { + return this.originalEvent[ name ]; + } + }, + + set: function( value ) { + Object.defineProperty( this, name, { + enumerable: true, + configurable: true, + writable: true, + value: value + } ); + } + } ); + }, + + fix: function( originalEvent ) { + return originalEvent[ jQuery.expando ] ? + originalEvent : + new jQuery.Event( originalEvent ); + }, + + special: { + load: { + + // Prevent triggered image.load events from bubbling to window.load + noBubble: true + }, + click: { + + // Utilize native event to ensure correct state for checkable inputs + setup: function( data ) { + + // For mutual compressibility with _default, replace `this` access with a local var. + // `|| data` is dead code meant only to preserve the variable through minification. 
+ var el = this || data; + + // Claim the first handler + if ( rcheckableType.test( el.type ) && + el.click && nodeName( el, "input" ) ) { + + // dataPriv.set( el, "click", ... ) + leverageNative( el, "click", returnTrue ); + } + + // Return false to allow normal processing in the caller + return false; + }, + trigger: function( data ) { + + // For mutual compressibility with _default, replace `this` access with a local var. + // `|| data` is dead code meant only to preserve the variable through minification. + var el = this || data; + + // Force setup before triggering a click + if ( rcheckableType.test( el.type ) && + el.click && nodeName( el, "input" ) ) { + + leverageNative( el, "click" ); + } + + // Return non-false to allow normal event-path propagation + return true; + }, + + // For cross-browser consistency, suppress native .click() on links + // Also prevent it if we're currently inside a leveraged native-event stack + _default: function( event ) { + var target = event.target; + return rcheckableType.test( target.type ) && + target.click && nodeName( target, "input" ) && + dataPriv.get( target, "click" ) || + nodeName( target, "a" ); + } + }, + + beforeunload: { + postDispatch: function( event ) { + + // Support: Firefox 20+ + // Firefox doesn't alert if the returnValue field is not set. + if ( event.result !== undefined && event.originalEvent ) { + event.originalEvent.returnValue = event.result; + } + } + } + } +}; + +// Ensure the presence of an event listener that handles manually-triggered +// synthetic events by interrupting progress until reinvoked in response to +// *native* events that it fires directly, ensuring that state changes have +// already occurred before other listeners are invoked. +function leverageNative( el, type, expectSync ) { + + // Missing expectSync indicates a trigger call, which must force setup through jQuery.event.add + if ( !expectSync ) { + if ( dataPriv.get( el, type ) === undefined ) { + jQuery.event.add( el, type, returnTrue ); + } + return; + } + + // Register the controller as a special universal handler for all event namespaces + dataPriv.set( el, type, false ); + jQuery.event.add( el, type, { + namespace: false, + handler: function( event ) { + var notAsync, result, + saved = dataPriv.get( this, type ); + + if ( ( event.isTrigger & 1 ) && this[ type ] ) { + + // Interrupt processing of the outer synthetic .trigger()ed event + // Saved data should be false in such cases, but might be a leftover capture object + // from an async native handler (gh-4350) + if ( !saved.length ) { + + // Store arguments for use when handling the inner native event + // There will always be at least one argument (an event object), so this array + // will not be confused with a leftover capture object. + saved = slice.call( arguments ); + dataPriv.set( this, type, saved ); + + // Trigger the native event and capture its result + // Support: IE <=9 - 11+ + // focus() and blur() are asynchronous + notAsync = expectSync( this, type ); + this[ type ](); + result = dataPriv.get( this, type ); + if ( saved !== result || notAsync ) { + dataPriv.set( this, type, false ); + } else { + result = {}; + } + if ( saved !== result ) { + + // Cancel the outer synthetic event + event.stopImmediatePropagation(); + event.preventDefault(); + + // Support: Chrome 86+ + // In Chrome, if an element having a focusout handler is blurred by + // clicking outside of it, it invokes the handler synchronously. 
If + // that handler calls `.remove()` on the element, the data is cleared, + // leaving `result` undefined. We need to guard against this. + return result && result.value; + } + + // If this is an inner synthetic event for an event with a bubbling surrogate + // (focus or blur), assume that the surrogate already propagated from triggering the + // native event and prevent that from happening again here. + // This technically gets the ordering wrong w.r.t. to `.trigger()` (in which the + // bubbling surrogate propagates *after* the non-bubbling base), but that seems + // less bad than duplication. + } else if ( ( jQuery.event.special[ type ] || {} ).delegateType ) { + event.stopPropagation(); + } + + // If this is a native event triggered above, everything is now in order + // Fire an inner synthetic event with the original arguments + } else if ( saved.length ) { + + // ...and capture the result + dataPriv.set( this, type, { + value: jQuery.event.trigger( + + // Support: IE <=9 - 11+ + // Extend with the prototype to reset the above stopImmediatePropagation() + jQuery.extend( saved[ 0 ], jQuery.Event.prototype ), + saved.slice( 1 ), + this + ) + } ); + + // Abort handling of the native event + event.stopImmediatePropagation(); + } + } + } ); +} + +jQuery.removeEvent = function( elem, type, handle ) { + + // This "if" is needed for plain objects + if ( elem.removeEventListener ) { + elem.removeEventListener( type, handle ); + } +}; + +jQuery.Event = function( src, props ) { + + // Allow instantiation without the 'new' keyword + if ( !( this instanceof jQuery.Event ) ) { + return new jQuery.Event( src, props ); + } + + // Event object + if ( src && src.type ) { + this.originalEvent = src; + this.type = src.type; + + // Events bubbling up the document may have been marked as prevented + // by a handler lower down the tree; reflect the correct value. + this.isDefaultPrevented = src.defaultPrevented || + src.defaultPrevented === undefined && + + // Support: Android <=2.3 only + src.returnValue === false ? + returnTrue : + returnFalse; + + // Create target properties + // Support: Safari <=6 - 7 only + // Target should not be a text node (#504, #13143) + this.target = ( src.target && src.target.nodeType === 3 ) ? 
+ src.target.parentNode : + src.target; + + this.currentTarget = src.currentTarget; + this.relatedTarget = src.relatedTarget; + + // Event type + } else { + this.type = src; + } + + // Put explicitly provided properties onto the event object + if ( props ) { + jQuery.extend( this, props ); + } + + // Create a timestamp if incoming event doesn't have one + this.timeStamp = src && src.timeStamp || Date.now(); + + // Mark it as fixed + this[ jQuery.expando ] = true; +}; + +// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding +// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html +jQuery.Event.prototype = { + constructor: jQuery.Event, + isDefaultPrevented: returnFalse, + isPropagationStopped: returnFalse, + isImmediatePropagationStopped: returnFalse, + isSimulated: false, + + preventDefault: function() { + var e = this.originalEvent; + + this.isDefaultPrevented = returnTrue; + + if ( e && !this.isSimulated ) { + e.preventDefault(); + } + }, + stopPropagation: function() { + var e = this.originalEvent; + + this.isPropagationStopped = returnTrue; + + if ( e && !this.isSimulated ) { + e.stopPropagation(); + } + }, + stopImmediatePropagation: function() { + var e = this.originalEvent; + + this.isImmediatePropagationStopped = returnTrue; + + if ( e && !this.isSimulated ) { + e.stopImmediatePropagation(); + } + + this.stopPropagation(); + } +}; + +// Includes all common event props including KeyEvent and MouseEvent specific props +jQuery.each( { + altKey: true, + bubbles: true, + cancelable: true, + changedTouches: true, + ctrlKey: true, + detail: true, + eventPhase: true, + metaKey: true, + pageX: true, + pageY: true, + shiftKey: true, + view: true, + "char": true, + code: true, + charCode: true, + key: true, + keyCode: true, + button: true, + buttons: true, + clientX: true, + clientY: true, + offsetX: true, + offsetY: true, + pointerId: true, + pointerType: true, + screenX: true, + screenY: true, + targetTouches: true, + toElement: true, + touches: true, + which: true +}, jQuery.event.addProp ); + +jQuery.each( { focus: "focusin", blur: "focusout" }, function( type, delegateType ) { + jQuery.event.special[ type ] = { + + // Utilize native event if possible so blur/focus sequence is correct + setup: function() { + + // Claim the first handler + // dataPriv.set( this, "focus", ... ) + // dataPriv.set( this, "blur", ... ) + leverageNative( this, type, expectSync ); + + // Return false to allow normal processing in the caller + return false; + }, + trigger: function() { + + // Force setup before trigger + leverageNative( this, type ); + + // Return non-false to allow normal event-path propagation + return true; + }, + + // Suppress native focus or blur as it's already being fired + // in leverageNative. + _default: function() { + return true; + }, + + delegateType: delegateType + }; +} ); + +// Create mouseenter/leave events using mouseover/out and event-time checks +// so that event delegation works in jQuery. +// Do the same for pointerenter/pointerleave and pointerover/pointerout +// +// Support: Safari 7 only +// Safari sends mouseenter too often; see: +// https://bugs.chromium.org/p/chromium/issues/detail?id=470258 +// for the description of the bug (it existed in older Chrome versions as well). 
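+
+// Illustrative sketch (not part of the jQuery source): with the special events defined
+// below, delegated handlers work even though mouseenter/mouseleave do not bubble natively.
+// Assuming a hypothetical "#list" element containing <li> items:
+//   jQuery( "#list" ).on( "mouseenter", "li", function() {
+//       // Bound as "mouseover" under the hood; handle() below only invokes this when the
+//       // pointer actually enters the <li>, not when it moves between the <li>'s descendants.
+//   } );
+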
+jQuery.each( { + mouseenter: "mouseover", + mouseleave: "mouseout", + pointerenter: "pointerover", + pointerleave: "pointerout" +}, function( orig, fix ) { + jQuery.event.special[ orig ] = { + delegateType: fix, + bindType: fix, + + handle: function( event ) { + var ret, + target = this, + related = event.relatedTarget, + handleObj = event.handleObj; + + // For mouseenter/leave call the handler if related is outside the target. + // NB: No relatedTarget if the mouse left/entered the browser window + if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) { + event.type = handleObj.origType; + ret = handleObj.handler.apply( this, arguments ); + event.type = fix; + } + return ret; + } + }; +} ); +
+jQuery.fn.extend( { + + on: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn ); + }, + one: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn, 1 ); + }, + off: function( types, selector, fn ) { + var handleObj, type; + if ( types && types.preventDefault && types.handleObj ) { + + // ( event ) dispatched jQuery.Event + handleObj = types.handleObj; + jQuery( types.delegateTarget ).off( + handleObj.namespace ? + handleObj.origType + "." + handleObj.namespace : + handleObj.origType, + handleObj.selector, + handleObj.handler + ); + return this; + } + if ( typeof types === "object" ) { + + // ( types-object [, selector] ) + for ( type in types ) { + this.off( type, selector, types[ type ] ); + } + return this; + } + if ( selector === false || typeof selector === "function" ) { + + // ( types [, fn] ) + fn = selector; + selector = undefined; + } + if ( fn === false ) { + fn = returnFalse; + } + return this.each( function() { + jQuery.event.remove( this, types, fn, selector ); + } ); + } +} ); + +
+var + + // Support: IE <=10 - 11, Edge 12 - 13 only + // In IE/Edge using regex groups here causes severe slowdowns. + // See https://connect.microsoft.com/IE/feedback/details/1736512/ + rnoInnerhtml = /<script|<style|<link/i, + + // checked="checked" or checked + rchecked = /checked\s*(?:[^=]|=\s*.checked.)/i, + + rcleanScript = /^\s*<!(?:\[CDATA\[|--)|(?:\]\]|--)>\s*$/g; +
+// Prefer a tbody over its parent table for containing new rows +function manipulationTarget( elem, content ) { + if ( nodeName( elem, "table" ) && + nodeName( content.nodeType !== 11 ? content : content.firstChild, "tr" ) ) { + + return jQuery( elem ).children( "tbody" )[ 0 ] || elem; + } + + return elem; +} + +// Replace/restore the type attribute of script elements for safe DOM manipulation +function disableScript( elem ) { + elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type; + return elem; +} +function restoreScript( elem ) { + if ( ( elem.type || "" ).slice( 0, 5 ) === "true/" ) { + elem.type = elem.type.slice( 5 ); + } else { + elem.removeAttribute( "type" ); + } + + return elem; +} +
+function cloneCopyEvent( src, dest ) { + var i, l, type, pdataOld, udataOld, udataCur, events; + + if ( dest.nodeType !== 1 ) { + return; + } + + // 1. Copy private data: events, handlers, etc. + if ( dataPriv.hasData( src ) ) { + pdataOld = dataPriv.get( src ); + events = pdataOld.events; + + if ( events ) { + dataPriv.remove( dest, "handle events" ); + + for ( type in events ) { + for ( i = 0, l = events[ type ].length; i < l; i++ ) { + jQuery.event.add( dest, type, events[ type ][ i ] ); + } + } + } + } + + // 2.
Copy user data + if ( dataUser.hasData( src ) ) { + udataOld = dataUser.access( src ); + udataCur = jQuery.extend( {}, udataOld ); + + dataUser.set( dest, udataCur ); + } +} + +// Fix IE bugs, see support tests +function fixInput( src, dest ) { + var nodeName = dest.nodeName.toLowerCase(); + + // Fails to persist the checked state of a cloned checkbox or radio button. + if ( nodeName === "input" && rcheckableType.test( src.type ) ) { + dest.checked = src.checked; + + // Fails to return the selected option to the default selected state when cloning options + } else if ( nodeName === "input" || nodeName === "textarea" ) { + dest.defaultValue = src.defaultValue; + } +} + +function domManip( collection, args, callback, ignored ) { + + // Flatten any nested arrays + args = flat( args ); + + var fragment, first, scripts, hasScripts, node, doc, + i = 0, + l = collection.length, + iNoClone = l - 1, + value = args[ 0 ], + valueIsFunction = isFunction( value ); + + // We can't cloneNode fragments that contain checked, in WebKit + if ( valueIsFunction || + ( l > 1 && typeof value === "string" && + !support.checkClone && rchecked.test( value ) ) ) { + return collection.each( function( index ) { + var self = collection.eq( index ); + if ( valueIsFunction ) { + args[ 0 ] = value.call( this, index, self.html() ); + } + domManip( self, args, callback, ignored ); + } ); + } + + if ( l ) { + fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); + first = fragment.firstChild; + + if ( fragment.childNodes.length === 1 ) { + fragment = first; + } + + // Require either new content or an interest in ignored elements to invoke the callback + if ( first || ignored ) { + scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); + hasScripts = scripts.length; + + // Use the original fragment for the last item + // instead of the first because it can end up + // being emptied incorrectly in certain situations (#8070). + for ( ; i < l; i++ ) { + node = fragment; + + if ( i !== iNoClone ) { + node = jQuery.clone( node, true, true ); + + // Keep references to cloned scripts for later restoration + if ( hasScripts ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( scripts, getAll( node, "script" ) ); + } + } + + callback.call( collection[ i ], node, i ); + } + + if ( hasScripts ) { + doc = scripts[ scripts.length - 1 ].ownerDocument; + + // Reenable scripts + jQuery.map( scripts, restoreScript ); + + // Evaluate executable scripts on first document insertion + for ( i = 0; i < hasScripts; i++ ) { + node = scripts[ i ]; + if ( rscriptType.test( node.type || "" ) && + !dataPriv.access( node, "globalEval" ) && + jQuery.contains( doc, node ) ) { + + if ( node.src && ( node.type || "" ).toLowerCase() !== "module" ) { + + // Optional AJAX dependency, but won't run scripts if not present + if ( jQuery._evalUrl && !node.noModule ) { + jQuery._evalUrl( node.src, { + nonce: node.nonce || node.getAttribute( "nonce" ) + }, doc ); + } + } else { + DOMEval( node.textContent.replace( rcleanScript, "" ), node, doc ); + } + } + } + } + } + } + + return collection; +} + +function remove( elem, selector, keepData ) { + var node, + nodes = selector ? 
jQuery.filter( selector, elem ) : elem, + i = 0; + + for ( ; ( node = nodes[ i ] ) != null; i++ ) { + if ( !keepData && node.nodeType === 1 ) { + jQuery.cleanData( getAll( node ) ); + } + + if ( node.parentNode ) { + if ( keepData && isAttached( node ) ) { + setGlobalEval( getAll( node, "script" ) ); + } + node.parentNode.removeChild( node ); + } + } + + return elem; +} + +jQuery.extend( { + htmlPrefilter: function( html ) { + return html; + }, + + clone: function( elem, dataAndEvents, deepDataAndEvents ) { + var i, l, srcElements, destElements, + clone = elem.cloneNode( true ), + inPage = isAttached( elem ); + + // Fix IE cloning issues + if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && + !jQuery.isXMLDoc( elem ) ) { + + // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 + destElements = getAll( clone ); + srcElements = getAll( elem ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + fixInput( srcElements[ i ], destElements[ i ] ); + } + } + + // Copy the events from the original to the clone + if ( dataAndEvents ) { + if ( deepDataAndEvents ) { + srcElements = srcElements || getAll( elem ); + destElements = destElements || getAll( clone ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + cloneCopyEvent( srcElements[ i ], destElements[ i ] ); + } + } else { + cloneCopyEvent( elem, clone ); + } + } + + // Preserve script evaluation history + destElements = getAll( clone, "script" ); + if ( destElements.length > 0 ) { + setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); + } + + // Return the cloned set + return clone; + }, + + cleanData: function( elems ) { + var data, elem, type, + special = jQuery.event.special, + i = 0; + + for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { + if ( acceptData( elem ) ) { + if ( ( data = elem[ dataPriv.expando ] ) ) { + if ( data.events ) { + for ( type in data.events ) { + if ( special[ type ] ) { + jQuery.event.remove( elem, type ); + + // This is a shortcut to avoid jQuery.event.remove's overhead + } else { + jQuery.removeEvent( elem, type, data.handle ); + } + } + } + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataPriv.expando ] = undefined; + } + if ( elem[ dataUser.expando ] ) { + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataUser.expando ] = undefined; + } + } + } + } +} ); + +jQuery.fn.extend( { + detach: function( selector ) { + return remove( this, selector, true ); + }, + + remove: function( selector ) { + return remove( this, selector ); + }, + + text: function( value ) { + return access( this, function( value ) { + return value === undefined ? 
+ jQuery.text( this ) : + this.empty().each( function() { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + this.textContent = value; + } + } ); + }, null, value, arguments.length ); + }, + + append: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.appendChild( elem ); + } + } ); + }, + + prepend: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.insertBefore( elem, target.firstChild ); + } + } ); + }, + + before: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this ); + } + } ); + }, + + after: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this.nextSibling ); + } + } ); + }, + + empty: function() { + var elem, + i = 0; + + for ( ; ( elem = this[ i ] ) != null; i++ ) { + if ( elem.nodeType === 1 ) { + + // Prevent memory leaks + jQuery.cleanData( getAll( elem, false ) ); + + // Remove any remaining nodes + elem.textContent = ""; + } + } + + return this; + }, + + clone: function( dataAndEvents, deepDataAndEvents ) { + dataAndEvents = dataAndEvents == null ? false : dataAndEvents; + deepDataAndEvents = deepDataAndEvents == null ? dataAndEvents : deepDataAndEvents; + + return this.map( function() { + return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); + } ); + }, + + html: function( value ) { + return access( this, function( value ) { + var elem = this[ 0 ] || {}, + i = 0, + l = this.length; + + if ( value === undefined && elem.nodeType === 1 ) { + return elem.innerHTML; + } + + // See if we can take a shortcut and just use innerHTML + if ( typeof value === "string" && !rnoInnerhtml.test( value ) && + !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { + + value = jQuery.htmlPrefilter( value ); + + try { + for ( ; i < l; i++ ) { + elem = this[ i ] || {}; + + // Remove element nodes and prevent memory leaks + if ( elem.nodeType === 1 ) { + jQuery.cleanData( getAll( elem, false ) ); + elem.innerHTML = value; + } + } + + elem = 0; + + // If using innerHTML throws an exception, use the fallback method + } catch ( e ) {} + } + + if ( elem ) { + this.empty().append( value ); + } + }, null, value, arguments.length ); + }, + + replaceWith: function() { + var ignored = []; + + // Make the changes, replacing each non-ignored context element with the new content + return domManip( this, arguments, function( elem ) { + var parent = this.parentNode; + + if ( jQuery.inArray( this, ignored ) < 0 ) { + jQuery.cleanData( getAll( this ) ); + if ( parent ) { + parent.replaceChild( elem, this ); + } + } + + // Force callback invocation + }, ignored ); + } +} ); + +jQuery.each( { + appendTo: "append", + prependTo: "prepend", + insertBefore: "before", + insertAfter: "after", + replaceAll: "replaceWith" +}, function( name, original ) { + jQuery.fn[ name ] = function( selector ) { + var elems, + ret = [], + insert = jQuery( selector ), + last = insert.length - 1, + i = 0; + + for ( ; i <= last; i++ ) { + elems = i === last ? 
this : this.clone( true ); + jQuery( insert[ i ] )[ original ]( elems ); + + // Support: Android <=4.0 only, PhantomJS 1 only + // .get() because push.apply(_, arraylike) throws on ancient WebKit + push.apply( ret, elems.get() ); + } + + return this.pushStack( ret ); + }; +} ); +var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); + +var getStyles = function( elem ) { + + // Support: IE <=11 only, Firefox <=30 (#15098, #14150) + // IE throws on elements created in popups + // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" + var view = elem.ownerDocument.defaultView; + + if ( !view || !view.opener ) { + view = window; + } + + return view.getComputedStyle( elem ); + }; + +var swap = function( elem, options, callback ) { + var ret, name, + old = {}; + + // Remember the old values, and insert the new ones + for ( name in options ) { + old[ name ] = elem.style[ name ]; + elem.style[ name ] = options[ name ]; + } + + ret = callback.call( elem ); + + // Revert the old values + for ( name in options ) { + elem.style[ name ] = old[ name ]; + } + + return ret; +}; + + +var rboxStyle = new RegExp( cssExpand.join( "|" ), "i" ); + + + +( function() { + + // Executing both pixelPosition & boxSizingReliable tests require only one layout + // so they're executed at the same time to save the second computation. + function computeStyleTests() { + + // This is a singleton, we need to execute it only once + if ( !div ) { + return; + } + + container.style.cssText = "position:absolute;left:-11111px;width:60px;" + + "margin-top:1px;padding:0;border:0"; + div.style.cssText = + "position:relative;display:block;box-sizing:border-box;overflow:scroll;" + + "margin:auto;border:1px;padding:1px;" + + "width:60%;top:1%"; + documentElement.appendChild( container ).appendChild( div ); + + var divStyle = window.getComputedStyle( div ); + pixelPositionVal = divStyle.top !== "1%"; + + // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 + reliableMarginLeftVal = roundPixelMeasures( divStyle.marginLeft ) === 12; + + // Support: Android 4.0 - 4.3 only, Safari <=9.1 - 10.1, iOS <=7.0 - 9.3 + // Some styles come back with percentage values, even though they shouldn't + div.style.right = "60%"; + pixelBoxStylesVal = roundPixelMeasures( divStyle.right ) === 36; + + // Support: IE 9 - 11 only + // Detect misreporting of content dimensions for box-sizing:border-box elements + boxSizingReliableVal = roundPixelMeasures( divStyle.width ) === 36; + + // Support: IE 9 only + // Detect overflow:scroll screwiness (gh-3699) + // Support: Chrome <=64 + // Don't get tricked when zoom affects offsetWidth (gh-4029) + div.style.position = "absolute"; + scrollboxSizeVal = roundPixelMeasures( div.offsetWidth / 3 ) === 12; + + documentElement.removeChild( container ); + + // Nullify the div so it wouldn't be stored in the memory and + // it will also be a sign that checks already performed + div = null; + } + + function roundPixelMeasures( measure ) { + return Math.round( parseFloat( measure ) ); + } + + var pixelPositionVal, boxSizingReliableVal, scrollboxSizeVal, pixelBoxStylesVal, + reliableTrDimensionsVal, reliableMarginLeftVal, + container = document.createElement( "div" ), + div = document.createElement( "div" ); + + // Finish early in limited (non-browser) environments + if ( !div.style ) { + return; + } + + // Support: IE <=9 - 11 only + // Style of cloned element affects source element cloned (#8908) + div.style.backgroundClip = "content-box"; + div.cloneNode( true ).style.backgroundClip = ""; + 
support.clearCloneStyle = div.style.backgroundClip === "content-box"; + + jQuery.extend( support, { + boxSizingReliable: function() { + computeStyleTests(); + return boxSizingReliableVal; + }, + pixelBoxStyles: function() { + computeStyleTests(); + return pixelBoxStylesVal; + }, + pixelPosition: function() { + computeStyleTests(); + return pixelPositionVal; + }, + reliableMarginLeft: function() { + computeStyleTests(); + return reliableMarginLeftVal; + }, + scrollboxSize: function() { + computeStyleTests(); + return scrollboxSizeVal; + }, + + // Support: IE 9 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Behavior in IE 9 is more subtle than in newer versions & it passes + // some versions of this test; make sure not to make it pass there! + // + // Support: Firefox 70+ + // Only Firefox includes border widths + // in computed dimensions. (gh-4529) + reliableTrDimensions: function() { + var table, tr, trChild, trStyle; + if ( reliableTrDimensionsVal == null ) { + table = document.createElement( "table" ); + tr = document.createElement( "tr" ); + trChild = document.createElement( "div" ); + + table.style.cssText = "position:absolute;left:-11111px;border-collapse:separate"; + tr.style.cssText = "border:1px solid"; + + // Support: Chrome 86+ + // Height set through cssText does not get applied. + // Computed height then comes back as 0. + tr.style.height = "1px"; + trChild.style.height = "9px"; + + // Support: Android 8 Chrome 86+ + // In our bodyBackground.html iframe, + // display for all div elements is set to "inline", + // which causes a problem only in Android 8 Chrome 86. + // Ensuring the div is display: block + // gets around this issue. + trChild.style.display = "block"; + + documentElement + .appendChild( table ) + .appendChild( tr ) + .appendChild( trChild ); + + trStyle = window.getComputedStyle( tr ); + reliableTrDimensionsVal = ( parseInt( trStyle.height, 10 ) + + parseInt( trStyle.borderTopWidth, 10 ) + + parseInt( trStyle.borderBottomWidth, 10 ) ) === tr.offsetHeight; + + documentElement.removeChild( table ); + } + return reliableTrDimensionsVal; + } + } ); +} )(); + + +function curCSS( elem, name, computed ) { + var width, minWidth, maxWidth, ret, + + // Support: Firefox 51+ + // Retrieving style before computed somehow + // fixes an issue with getting wrong values + // on detached elements + style = elem.style; + + computed = computed || getStyles( elem ); + + // getPropertyValue is needed for: + // .css('filter') (IE 9 only, #12537) + // .css('--customProperty) (#3144) + if ( computed ) { + ret = computed.getPropertyValue( name ) || computed[ name ]; + + if ( ret === "" && !isAttached( elem ) ) { + ret = jQuery.style( elem, name ); + } + + // A tribute to the "awesome hack by Dean Edwards" + // Android Browser returns percentage for some values, + // but width seems to be reliably pixels. 
+ // This is against the CSSOM draft spec: + // https://drafts.csswg.org/cssom/#resolved-values + if ( !support.pixelBoxStyles() && rnumnonpx.test( ret ) && rboxStyle.test( name ) ) { + + // Remember the original values + width = style.width; + minWidth = style.minWidth; + maxWidth = style.maxWidth; + + // Put in the new values to get a computed value out + style.minWidth = style.maxWidth = style.width = ret; + ret = computed.width; + + // Revert the changed values + style.width = width; + style.minWidth = minWidth; + style.maxWidth = maxWidth; + } + } + + return ret !== undefined ? + + // Support: IE <=9 - 11 only + // IE returns zIndex value as an integer. + ret + "" : + ret; +} + + +function addGetHookIf( conditionFn, hookFn ) { + + // Define the hook, we'll check on the first run if it's really needed. + return { + get: function() { + if ( conditionFn() ) { + + // Hook not needed (or it's not possible to use it due + // to missing dependency), remove it. + delete this.get; + return; + } + + // Hook needed; redefine it so that the support test is not executed again. + return ( this.get = hookFn ).apply( this, arguments ); + } + }; +} + + +var cssPrefixes = [ "Webkit", "Moz", "ms" ], + emptyStyle = document.createElement( "div" ).style, + vendorProps = {}; + +// Return a vendor-prefixed property or undefined +function vendorPropName( name ) { + + // Check for vendor prefixed names + var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), + i = cssPrefixes.length; + + while ( i-- ) { + name = cssPrefixes[ i ] + capName; + if ( name in emptyStyle ) { + return name; + } + } +} + +// Return a potentially-mapped jQuery.cssProps or vendor prefixed property +function finalPropName( name ) { + var final = jQuery.cssProps[ name ] || vendorProps[ name ]; + + if ( final ) { + return final; + } + if ( name in emptyStyle ) { + return name; + } + return vendorProps[ name ] = vendorPropName( name ) || name; +} + + +var + + // Swappable if display is none or starts with table + // except "table", "table-cell", or "table-caption" + // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display + rdisplayswap = /^(none|table(?!-c[ea]).+)/, + rcustomProp = /^--/, + cssShow = { position: "absolute", visibility: "hidden", display: "block" }, + cssNormalTransform = { + letterSpacing: "0", + fontWeight: "400" + }; + +function setPositiveNumber( _elem, value, subtract ) { + + // Any relative (+/-) values have already been + // normalized at this point + var matches = rcssNum.exec( value ); + return matches ? + + // Guard against undefined "subtract", e.g., when used as in cssHooks + Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) : + value; +} + +function boxModelAdjustment( elem, dimension, box, isBorderBox, styles, computedVal ) { + var i = dimension === "width" ? 1 : 0, + extra = 0, + delta = 0; + + // Adjustment may not be necessary + if ( box === ( isBorderBox ? 
"border" : "content" ) ) { + return 0; + } + + for ( ; i < 4; i += 2 ) { + + // Both box models exclude margin + if ( box === "margin" ) { + delta += jQuery.css( elem, box + cssExpand[ i ], true, styles ); + } + + // If we get here with a content-box, we're seeking "padding" or "border" or "margin" + if ( !isBorderBox ) { + + // Add padding + delta += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + + // For "border" or "margin", add border + if ( box !== "padding" ) { + delta += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + + // But still keep track of it otherwise + } else { + extra += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + + // If we get here with a border-box (content + padding + border), we're seeking "content" or + // "padding" or "margin" + } else { + + // For "content", subtract padding + if ( box === "content" ) { + delta -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + } + + // For "content" or "padding", subtract border + if ( box !== "margin" ) { + delta -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + } + } + + // Account for positive content-box scroll gutter when requested by providing computedVal + if ( !isBorderBox && computedVal >= 0 ) { + + // offsetWidth/offsetHeight is a rounded sum of content, padding, scroll gutter, and border + // Assuming integer scroll gutter, subtract the rest and round down + delta += Math.max( 0, Math.ceil( + elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - + computedVal - + delta - + extra - + 0.5 + + // If offsetWidth/offsetHeight is unknown, then we can't determine content-box scroll gutter + // Use an explicit zero to avoid NaN (gh-3964) + ) ) || 0; + } + + return delta; +} + +function getWidthOrHeight( elem, dimension, extra ) { + + // Start with computed style + var styles = getStyles( elem ), + + // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-4322). + // Fake content-box until we know it's needed to know the true value. + boxSizingNeeded = !support.boxSizingReliable() || extra, + isBorderBox = boxSizingNeeded && + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + valueIsBorderBox = isBorderBox, + + val = curCSS( elem, dimension, styles ), + offsetProp = "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ); + + // Support: Firefox <=54 + // Return a confounding non-pixel value or feign ignorance, as appropriate. + if ( rnumnonpx.test( val ) ) { + if ( !extra ) { + return val; + } + val = "auto"; + } + + + // Support: IE 9 - 11 only + // Use offsetWidth/offsetHeight for when box sizing is unreliable. + // In those cases, the computed value can be trusted to be border-box. + if ( ( !support.boxSizingReliable() && isBorderBox || + + // Support: IE 10 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Interestingly, in some cases IE 9 doesn't suffer from this issue. 
+ !support.reliableTrDimensions() && nodeName( elem, "tr" ) || + + // Fall back to offsetWidth/offsetHeight when value is "auto" + // This happens for inline elements with no explicit setting (gh-3571) + val === "auto" || + + // Support: Android <=4.1 - 4.3 only + // Also use offsetWidth/offsetHeight for misreported inline dimensions (gh-3602) + !parseFloat( val ) && jQuery.css( elem, "display", false, styles ) === "inline" ) && + + // Make sure the element is visible & connected + elem.getClientRects().length ) { + + isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box"; + + // Where available, offsetWidth/offsetHeight approximate border box dimensions. + // Where not available (e.g., SVG), assume unreliable box-sizing and interpret the + // retrieved value as a content box dimension. + valueIsBorderBox = offsetProp in elem; + if ( valueIsBorderBox ) { + val = elem[ offsetProp ]; + } + } + + // Normalize "" and auto + val = parseFloat( val ) || 0; + + // Adjust for the element's box model + return ( val + + boxModelAdjustment( + elem, + dimension, + extra || ( isBorderBox ? "border" : "content" ), + valueIsBorderBox, + styles, + + // Provide the current computed size to request scroll gutter calculation (gh-3589) + val + ) + ) + "px"; +} + +jQuery.extend( { + + // Add in style property hooks for overriding the default + // behavior of getting and setting a style property + cssHooks: { + opacity: { + get: function( elem, computed ) { + if ( computed ) { + + // We should always get a number back from opacity + var ret = curCSS( elem, "opacity" ); + return ret === "" ? "1" : ret; + } + } + } + }, + + // Don't automatically add "px" to these possibly-unitless properties + cssNumber: { + "animationIterationCount": true, + "columnCount": true, + "fillOpacity": true, + "flexGrow": true, + "flexShrink": true, + "fontWeight": true, + "gridArea": true, + "gridColumn": true, + "gridColumnEnd": true, + "gridColumnStart": true, + "gridRow": true, + "gridRowEnd": true, + "gridRowStart": true, + "lineHeight": true, + "opacity": true, + "order": true, + "orphans": true, + "widows": true, + "zIndex": true, + "zoom": true + }, + + // Add in properties whose names you wish to fix before + // setting or getting the value + cssProps: {}, + + // Get and set the style property on a DOM Node + style: function( elem, name, value, extra ) { + + // Don't set styles on text and comment nodes + if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { + return; + } + + // Make sure that we're working with the right name + var ret, type, hooks, + origName = camelCase( name ), + isCustomProp = rcustomProp.test( name ), + style = elem.style; + + // Make sure that we're working with the right name. We don't + // want to query the value if it is a CSS custom property + // since they are user-defined. 
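+
+ // Illustrative sketch (not part of the jQuery source): how the branches below treat a
+ // hypothetical element `el`.
+ //   jQuery( el ).css( "--brand-color", "#336699" ); // custom property: set via style.setProperty()
+ //   jQuery( el ).css( "fontSize", 14 );             // number, not in cssNumber: becomes "14px"
+ //   jQuery( el ).css( "lineHeight", 1.4 );          // listed in cssNumber: stays unitless
+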
+ if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Gets hook for the prefixed version, then unprefixed version + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // Check if we're setting a value + if ( value !== undefined ) { + type = typeof value; + + // Convert "+=" or "-=" to relative numbers (#7345) + if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) { + value = adjustCSS( elem, name, ret ); + + // Fixes bug #9237 + type = "number"; + } + + // Make sure that null and NaN values aren't set (#7116) + if ( value == null || value !== value ) { + return; + } + + // If a number was passed in, add the unit (except for certain CSS properties) + // The isCustomProp check can be removed in jQuery 4.0 when we only auto-append + // "px" to a few hardcoded values. + if ( type === "number" && !isCustomProp ) { + value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? "" : "px" ); + } + + // background-* props affect original clone's values + if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) { + style[ name ] = "inherit"; + } + + // If a hook was provided, use that value, otherwise just set the specified value + if ( !hooks || !( "set" in hooks ) || + ( value = hooks.set( elem, value, extra ) ) !== undefined ) { + + if ( isCustomProp ) { + style.setProperty( name, value ); + } else { + style[ name ] = value; + } + } + + } else { + + // If a hook was provided get the non-computed value from there + if ( hooks && "get" in hooks && + ( ret = hooks.get( elem, false, extra ) ) !== undefined ) { + + return ret; + } + + // Otherwise just get the value from the style object + return style[ name ]; + } + }, + + css: function( elem, name, extra, styles ) { + var val, num, hooks, + origName = camelCase( name ), + isCustomProp = rcustomProp.test( name ); + + // Make sure that we're working with the right name. We don't + // want to modify the value if it is a CSS custom property + // since they are user-defined. + if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Try prefixed name followed by the unprefixed name + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // If a hook was provided get the computed value from there + if ( hooks && "get" in hooks ) { + val = hooks.get( elem, true, extra ); + } + + // Otherwise, if a way to get the computed value exists, use that + if ( val === undefined ) { + val = curCSS( elem, name, styles ); + } + + // Convert "normal" to computed value + if ( val === "normal" && name in cssNormalTransform ) { + val = cssNormalTransform[ name ]; + } + + // Make numeric if forced or a qualifier was provided and val looks numeric + if ( extra === "" || extra ) { + num = parseFloat( val ); + return extra === true || isFinite( num ) ? num || 0 : val; + } + + return val; + } +} ); + +jQuery.each( [ "height", "width" ], function( _i, dimension ) { + jQuery.cssHooks[ dimension ] = { + get: function( elem, computed, extra ) { + if ( computed ) { + + // Certain elements can have dimension info if we invisibly show them + // but it must have a current display style that would benefit + return rdisplayswap.test( jQuery.css( elem, "display" ) ) && + + // Support: Safari 8+ + // Table columns in Safari have non-zero offsetWidth & zero + // getBoundingClientRect().width unless display is changed. + // Support: IE <=11 only + // Running getBoundingClientRect on a disconnected node + // in IE throws an error. 
+ ( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ? + swap( elem, cssShow, function() { + return getWidthOrHeight( elem, dimension, extra ); + } ) : + getWidthOrHeight( elem, dimension, extra ); + } + }, + + set: function( elem, value, extra ) { + var matches, + styles = getStyles( elem ), + + // Only read styles.position if the test has a chance to fail + // to avoid forcing a reflow. + scrollboxSizeBuggy = !support.scrollboxSize() && + styles.position === "absolute", + + // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-3991) + boxSizingNeeded = scrollboxSizeBuggy || extra, + isBorderBox = boxSizingNeeded && + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + subtract = extra ? + boxModelAdjustment( + elem, + dimension, + extra, + isBorderBox, + styles + ) : + 0; + + // Account for unreliable border-box dimensions by comparing offset* to computed and + // faking a content-box to get border and padding (gh-3699) + if ( isBorderBox && scrollboxSizeBuggy ) { + subtract -= Math.ceil( + elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - + parseFloat( styles[ dimension ] ) - + boxModelAdjustment( elem, dimension, "border", false, styles ) - + 0.5 + ); + } + + // Convert to pixels if value adjustment is needed + if ( subtract && ( matches = rcssNum.exec( value ) ) && + ( matches[ 3 ] || "px" ) !== "px" ) { + + elem.style[ dimension ] = value; + value = jQuery.css( elem, dimension ); + } + + return setPositiveNumber( elem, value, subtract ); + } + }; +} ); + +jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft, + function( elem, computed ) { + if ( computed ) { + return ( parseFloat( curCSS( elem, "marginLeft" ) ) || + elem.getBoundingClientRect().left - + swap( elem, { marginLeft: 0 }, function() { + return elem.getBoundingClientRect().left; + } ) + ) + "px"; + } + } +); + +// These hooks are used by animate to expand properties +jQuery.each( { + margin: "", + padding: "", + border: "Width" +}, function( prefix, suffix ) { + jQuery.cssHooks[ prefix + suffix ] = { + expand: function( value ) { + var i = 0, + expanded = {}, + + // Assumes a single number if not a string + parts = typeof value === "string" ? value.split( " " ) : [ value ]; + + for ( ; i < 4; i++ ) { + expanded[ prefix + cssExpand[ i ] + suffix ] = + parts[ i ] || parts[ i - 2 ] || parts[ 0 ]; + } + + return expanded; + } + }; + + if ( prefix !== "margin" ) { + jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber; + } +} ); + +jQuery.fn.extend( { + css: function( name, value ) { + return access( this, function( elem, name, value ) { + var styles, len, + map = {}, + i = 0; + + if ( Array.isArray( name ) ) { + styles = getStyles( elem ); + len = name.length; + + for ( ; i < len; i++ ) { + map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); + } + + return map; + } + + return value !== undefined ? + jQuery.style( elem, name, value ) : + jQuery.css( elem, name ); + }, name, value, arguments.length > 1 ); + } +} ); + + +function Tween( elem, options, prop, end, easing ) { + return new Tween.prototype.init( elem, options, prop, end, easing ); +} +jQuery.Tween = Tween; + +Tween.prototype = { + constructor: Tween, + init: function( elem, options, prop, end, easing, unit ) { + this.elem = elem; + this.prop = prop; + this.easing = easing || jQuery.easing._default; + this.options = options; + this.start = this.now = this.cur(); + this.end = end; + this.unit = unit || ( jQuery.cssNumber[ prop ] ? 
"" : "px" ); + }, + cur: function() { + var hooks = Tween.propHooks[ this.prop ]; + + return hooks && hooks.get ? + hooks.get( this ) : + Tween.propHooks._default.get( this ); + }, + run: function( percent ) { + var eased, + hooks = Tween.propHooks[ this.prop ]; + + if ( this.options.duration ) { + this.pos = eased = jQuery.easing[ this.easing ]( + percent, this.options.duration * percent, 0, 1, this.options.duration + ); + } else { + this.pos = eased = percent; + } + this.now = ( this.end - this.start ) * eased + this.start; + + if ( this.options.step ) { + this.options.step.call( this.elem, this.now, this ); + } + + if ( hooks && hooks.set ) { + hooks.set( this ); + } else { + Tween.propHooks._default.set( this ); + } + return this; + } +}; + +Tween.prototype.init.prototype = Tween.prototype; + +Tween.propHooks = { + _default: { + get: function( tween ) { + var result; + + // Use a property on the element directly when it is not a DOM element, + // or when there is no matching style property that exists. + if ( tween.elem.nodeType !== 1 || + tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) { + return tween.elem[ tween.prop ]; + } + + // Passing an empty string as a 3rd parameter to .css will automatically + // attempt a parseFloat and fallback to a string if the parse fails. + // Simple values such as "10px" are parsed to Float; + // complex values such as "rotate(1rad)" are returned as-is. + result = jQuery.css( tween.elem, tween.prop, "" ); + + // Empty strings, null, undefined and "auto" are converted to 0. + return !result || result === "auto" ? 0 : result; + }, + set: function( tween ) { + + // Use step hook for back compat. + // Use cssHook if its there. + // Use .style if available and use plain properties where available. + if ( jQuery.fx.step[ tween.prop ] ) { + jQuery.fx.step[ tween.prop ]( tween ); + } else if ( tween.elem.nodeType === 1 && ( + jQuery.cssHooks[ tween.prop ] || + tween.elem.style[ finalPropName( tween.prop ) ] != null ) ) { + jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); + } else { + tween.elem[ tween.prop ] = tween.now; + } + } + } +}; + +// Support: IE <=9 only +// Panic based approach to setting things on disconnected nodes +Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { + set: function( tween ) { + if ( tween.elem.nodeType && tween.elem.parentNode ) { + tween.elem[ tween.prop ] = tween.now; + } + } +}; + +jQuery.easing = { + linear: function( p ) { + return p; + }, + swing: function( p ) { + return 0.5 - Math.cos( p * Math.PI ) / 2; + }, + _default: "swing" +}; + +jQuery.fx = Tween.prototype.init; + +// Back compat <1.8 extension point +jQuery.fx.step = {}; + + + + +var + fxNow, inProgress, + rfxtypes = /^(?:toggle|show|hide)$/, + rrun = /queueHooks$/; + +function schedule() { + if ( inProgress ) { + if ( document.hidden === false && window.requestAnimationFrame ) { + window.requestAnimationFrame( schedule ); + } else { + window.setTimeout( schedule, jQuery.fx.interval ); + } + + jQuery.fx.tick(); + } +} + +// Animations created synchronously will run synchronously +function createFxNow() { + window.setTimeout( function() { + fxNow = undefined; + } ); + return ( fxNow = Date.now() ); +} + +// Generate parameters to create a standard animation +function genFx( type, includeWidth ) { + var which, + i = 0, + attrs = { height: type }; + + // If we include width, step value is 1 to do all cssExpand values, + // otherwise step value is 2 to skip over Left and Right + includeWidth = includeWidth ? 
1 : 0; + for ( ; i < 4; i += 2 - includeWidth ) { + which = cssExpand[ i ]; + attrs[ "margin" + which ] = attrs[ "padding" + which ] = type; + } + + if ( includeWidth ) { + attrs.opacity = attrs.width = type; + } + + return attrs; +} + +function createTween( value, prop, animation ) { + var tween, + collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ), + index = 0, + length = collection.length; + for ( ; index < length; index++ ) { + if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) { + + // We're done with this property + return tween; + } + } +} + +function defaultPrefilter( elem, props, opts ) { + var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display, + isBox = "width" in props || "height" in props, + anim = this, + orig = {}, + style = elem.style, + hidden = elem.nodeType && isHiddenWithinTree( elem ), + dataShow = dataPriv.get( elem, "fxshow" ); + + // Queue-skipping animations hijack the fx hooks + if ( !opts.queue ) { + hooks = jQuery._queueHooks( elem, "fx" ); + if ( hooks.unqueued == null ) { + hooks.unqueued = 0; + oldfire = hooks.empty.fire; + hooks.empty.fire = function() { + if ( !hooks.unqueued ) { + oldfire(); + } + }; + } + hooks.unqueued++; + + anim.always( function() { + + // Ensure the complete handler is called before this completes + anim.always( function() { + hooks.unqueued--; + if ( !jQuery.queue( elem, "fx" ).length ) { + hooks.empty.fire(); + } + } ); + } ); + } + + // Detect show/hide animations + for ( prop in props ) { + value = props[ prop ]; + if ( rfxtypes.test( value ) ) { + delete props[ prop ]; + toggle = toggle || value === "toggle"; + if ( value === ( hidden ? "hide" : "show" ) ) { + + // Pretend to be hidden if this is a "show" and + // there is still data from a stopped show/hide + if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) { + hidden = true; + + // Ignore all other no-op show/hide data + } else { + continue; + } + } + orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop ); + } + } + + // Bail out if this is a no-op like .hide().hide() + propTween = !jQuery.isEmptyObject( props ); + if ( !propTween && jQuery.isEmptyObject( orig ) ) { + return; + } + + // Restrict "overflow" and "display" styles during box animations + if ( isBox && elem.nodeType === 1 ) { + + // Support: IE <=9 - 11, Edge 12 - 15 + // Record all 3 overflow attributes because IE does not infer the shorthand + // from identically-valued overflowX and overflowY and Edge just mirrors + // the overflowX value there. 
+ opts.overflow = [ style.overflow, style.overflowX, style.overflowY ]; + + // Identify a display type, preferring old show/hide data over the CSS cascade + restoreDisplay = dataShow && dataShow.display; + if ( restoreDisplay == null ) { + restoreDisplay = dataPriv.get( elem, "display" ); + } + display = jQuery.css( elem, "display" ); + if ( display === "none" ) { + if ( restoreDisplay ) { + display = restoreDisplay; + } else { + + // Get nonempty value(s) by temporarily forcing visibility + showHide( [ elem ], true ); + restoreDisplay = elem.style.display || restoreDisplay; + display = jQuery.css( elem, "display" ); + showHide( [ elem ] ); + } + } + + // Animate inline elements as inline-block + if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) { + if ( jQuery.css( elem, "float" ) === "none" ) { + + // Restore the original display value at the end of pure show/hide animations + if ( !propTween ) { + anim.done( function() { + style.display = restoreDisplay; + } ); + if ( restoreDisplay == null ) { + display = style.display; + restoreDisplay = display === "none" ? "" : display; + } + } + style.display = "inline-block"; + } + } + } + + if ( opts.overflow ) { + style.overflow = "hidden"; + anim.always( function() { + style.overflow = opts.overflow[ 0 ]; + style.overflowX = opts.overflow[ 1 ]; + style.overflowY = opts.overflow[ 2 ]; + } ); + } + + // Implement show/hide animations + propTween = false; + for ( prop in orig ) { + + // General show/hide setup for this element animation + if ( !propTween ) { + if ( dataShow ) { + if ( "hidden" in dataShow ) { + hidden = dataShow.hidden; + } + } else { + dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } ); + } + + // Store hidden/visible for toggle so `.stop().toggle()` "reverses" + if ( toggle ) { + dataShow.hidden = !hidden; + } + + // Show elements before animating them + if ( hidden ) { + showHide( [ elem ], true ); + } + + /* eslint-disable no-loop-func */ + + anim.done( function() { + + /* eslint-enable no-loop-func */ + + // The final step of a "hide" animation is actually hiding the element + if ( !hidden ) { + showHide( [ elem ] ); + } + dataPriv.remove( elem, "fxshow" ); + for ( prop in orig ) { + jQuery.style( elem, prop, orig[ prop ] ); + } + } ); + } + + // Per-property setup + propTween = createTween( hidden ? dataShow[ prop ] : 0, prop, anim ); + if ( !( prop in dataShow ) ) { + dataShow[ prop ] = propTween.start; + if ( hidden ) { + propTween.end = propTween.start; + propTween.start = 0; + } + } + } +} + +function propFilter( props, specialEasing ) { + var index, name, easing, value, hooks; + + // camelCase, specialEasing and expand cssHook pass + for ( index in props ) { + name = camelCase( index ); + easing = specialEasing[ name ]; + value = props[ index ]; + if ( Array.isArray( value ) ) { + easing = value[ 1 ]; + value = props[ index ] = value[ 0 ]; + } + + if ( index !== name ) { + props[ name ] = value; + delete props[ index ]; + } + + hooks = jQuery.cssHooks[ name ]; + if ( hooks && "expand" in hooks ) { + value = hooks.expand( value ); + delete props[ name ]; + + // Not quite $.extend, this won't overwrite existing keys. 
+ // Reusing 'index' because we have the correct "name" + for ( index in value ) { + if ( !( index in props ) ) { + props[ index ] = value[ index ]; + specialEasing[ index ] = easing; + } + } + } else { + specialEasing[ name ] = easing; + } + } +} + +function Animation( elem, properties, options ) { + var result, + stopped, + index = 0, + length = Animation.prefilters.length, + deferred = jQuery.Deferred().always( function() { + + // Don't match elem in the :animated selector + delete tick.elem; + } ), + tick = function() { + if ( stopped ) { + return false; + } + var currentTime = fxNow || createFxNow(), + remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), + + // Support: Android 2.3 only + // Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497) + temp = remaining / animation.duration || 0, + percent = 1 - temp, + index = 0, + length = animation.tweens.length; + + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( percent ); + } + + deferred.notifyWith( elem, [ animation, percent, remaining ] ); + + // If there's more to do, yield + if ( percent < 1 && length ) { + return remaining; + } + + // If this was an empty animation, synthesize a final progress notification + if ( !length ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + } + + // Resolve the animation and report its conclusion + deferred.resolveWith( elem, [ animation ] ); + return false; + }, + animation = deferred.promise( { + elem: elem, + props: jQuery.extend( {}, properties ), + opts: jQuery.extend( true, { + specialEasing: {}, + easing: jQuery.easing._default + }, options ), + originalProperties: properties, + originalOptions: options, + startTime: fxNow || createFxNow(), + duration: options.duration, + tweens: [], + createTween: function( prop, end ) { + var tween = jQuery.Tween( elem, animation.opts, prop, end, + animation.opts.specialEasing[ prop ] || animation.opts.easing ); + animation.tweens.push( tween ); + return tween; + }, + stop: function( gotoEnd ) { + var index = 0, + + // If we are going to the end, we want to run all the tweens + // otherwise we skip this part + length = gotoEnd ? 
animation.tweens.length : 0; + if ( stopped ) { + return this; + } + stopped = true; + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( 1 ); + } + + // Resolve when we played the last frame; otherwise, reject + if ( gotoEnd ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + deferred.resolveWith( elem, [ animation, gotoEnd ] ); + } else { + deferred.rejectWith( elem, [ animation, gotoEnd ] ); + } + return this; + } + } ), + props = animation.props; + + propFilter( props, animation.opts.specialEasing ); + + for ( ; index < length; index++ ) { + result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts ); + if ( result ) { + if ( isFunction( result.stop ) ) { + jQuery._queueHooks( animation.elem, animation.opts.queue ).stop = + result.stop.bind( result ); + } + return result; + } + } + + jQuery.map( props, createTween, animation ); + + if ( isFunction( animation.opts.start ) ) { + animation.opts.start.call( elem, animation ); + } + + // Attach callbacks from options + animation + .progress( animation.opts.progress ) + .done( animation.opts.done, animation.opts.complete ) + .fail( animation.opts.fail ) + .always( animation.opts.always ); + + jQuery.fx.timer( + jQuery.extend( tick, { + elem: elem, + anim: animation, + queue: animation.opts.queue + } ) + ); + + return animation; +} + +jQuery.Animation = jQuery.extend( Animation, { + + tweeners: { + "*": [ function( prop, value ) { + var tween = this.createTween( prop, value ); + adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween ); + return tween; + } ] + }, + + tweener: function( props, callback ) { + if ( isFunction( props ) ) { + callback = props; + props = [ "*" ]; + } else { + props = props.match( rnothtmlwhite ); + } + + var prop, + index = 0, + length = props.length; + + for ( ; index < length; index++ ) { + prop = props[ index ]; + Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || []; + Animation.tweeners[ prop ].unshift( callback ); + } + }, + + prefilters: [ defaultPrefilter ], + + prefilter: function( callback, prepend ) { + if ( prepend ) { + Animation.prefilters.unshift( callback ); + } else { + Animation.prefilters.push( callback ); + } + } +} ); + +jQuery.speed = function( speed, easing, fn ) { + var opt = speed && typeof speed === "object" ? 
jQuery.extend( {}, speed ) : { + complete: fn || !fn && easing || + isFunction( speed ) && speed, + duration: speed, + easing: fn && easing || easing && !isFunction( easing ) && easing + }; + + // Go to the end state if fx are off + if ( jQuery.fx.off ) { + opt.duration = 0; + + } else { + if ( typeof opt.duration !== "number" ) { + if ( opt.duration in jQuery.fx.speeds ) { + opt.duration = jQuery.fx.speeds[ opt.duration ]; + + } else { + opt.duration = jQuery.fx.speeds._default; + } + } + } + + // Normalize opt.queue - true/undefined/null -> "fx" + if ( opt.queue == null || opt.queue === true ) { + opt.queue = "fx"; + } + + // Queueing + opt.old = opt.complete; + + opt.complete = function() { + if ( isFunction( opt.old ) ) { + opt.old.call( this ); + } + + if ( opt.queue ) { + jQuery.dequeue( this, opt.queue ); + } + }; + + return opt; +}; + +jQuery.fn.extend( { + fadeTo: function( speed, to, easing, callback ) { + + // Show any hidden elements after setting opacity to 0 + return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show() + + // Animate to the value specified + .end().animate( { opacity: to }, speed, easing, callback ); + }, + animate: function( prop, speed, easing, callback ) { + var empty = jQuery.isEmptyObject( prop ), + optall = jQuery.speed( speed, easing, callback ), + doAnimation = function() { + + // Operate on a copy of prop so per-property easing won't be lost + var anim = Animation( this, jQuery.extend( {}, prop ), optall ); + + // Empty animations, or finishing resolves immediately + if ( empty || dataPriv.get( this, "finish" ) ) { + anim.stop( true ); + } + }; + + doAnimation.finish = doAnimation; + + return empty || optall.queue === false ? + this.each( doAnimation ) : + this.queue( optall.queue, doAnimation ); + }, + stop: function( type, clearQueue, gotoEnd ) { + var stopQueue = function( hooks ) { + var stop = hooks.stop; + delete hooks.stop; + stop( gotoEnd ); + }; + + if ( typeof type !== "string" ) { + gotoEnd = clearQueue; + clearQueue = type; + type = undefined; + } + if ( clearQueue ) { + this.queue( type || "fx", [] ); + } + + return this.each( function() { + var dequeue = true, + index = type != null && type + "queueHooks", + timers = jQuery.timers, + data = dataPriv.get( this ); + + if ( index ) { + if ( data[ index ] && data[ index ].stop ) { + stopQueue( data[ index ] ); + } + } else { + for ( index in data ) { + if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { + stopQueue( data[ index ] ); + } + } + } + + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && + ( type == null || timers[ index ].queue === type ) ) { + + timers[ index ].anim.stop( gotoEnd ); + dequeue = false; + timers.splice( index, 1 ); + } + } + + // Start the next in the queue if the last step wasn't forced. + // Timers currently will call their complete callbacks, which + // will dequeue but only if they were gotoEnd. + if ( dequeue || !gotoEnd ) { + jQuery.dequeue( this, type ); + } + } ); + }, + finish: function( type ) { + if ( type !== false ) { + type = type || "fx"; + } + return this.each( function() { + var index, + data = dataPriv.get( this ), + queue = data[ type + "queue" ], + hooks = data[ type + "queueHooks" ], + timers = jQuery.timers, + length = queue ? 
queue.length : 0; + + // Enable finishing flag on private data + data.finish = true; + + // Empty the queue first + jQuery.queue( this, type, [] ); + + if ( hooks && hooks.stop ) { + hooks.stop.call( this, true ); + } + + // Look for any active animations, and finish them + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && timers[ index ].queue === type ) { + timers[ index ].anim.stop( true ); + timers.splice( index, 1 ); + } + } + + // Look for any animations in the old queue and finish them + for ( index = 0; index < length; index++ ) { + if ( queue[ index ] && queue[ index ].finish ) { + queue[ index ].finish.call( this ); + } + } + + // Turn off finishing flag + delete data.finish; + } ); + } +} ); + +jQuery.each( [ "toggle", "show", "hide" ], function( _i, name ) { + var cssFn = jQuery.fn[ name ]; + jQuery.fn[ name ] = function( speed, easing, callback ) { + return speed == null || typeof speed === "boolean" ? + cssFn.apply( this, arguments ) : + this.animate( genFx( name, true ), speed, easing, callback ); + }; +} ); + +// Generate shortcuts for custom animations +jQuery.each( { + slideDown: genFx( "show" ), + slideUp: genFx( "hide" ), + slideToggle: genFx( "toggle" ), + fadeIn: { opacity: "show" }, + fadeOut: { opacity: "hide" }, + fadeToggle: { opacity: "toggle" } +}, function( name, props ) { + jQuery.fn[ name ] = function( speed, easing, callback ) { + return this.animate( props, speed, easing, callback ); + }; +} ); + +jQuery.timers = []; +jQuery.fx.tick = function() { + var timer, + i = 0, + timers = jQuery.timers; + + fxNow = Date.now(); + + for ( ; i < timers.length; i++ ) { + timer = timers[ i ]; + + // Run the timer and safely remove it when done (allowing for external removal) + if ( !timer() && timers[ i ] === timer ) { + timers.splice( i--, 1 ); + } + } + + if ( !timers.length ) { + jQuery.fx.stop(); + } + fxNow = undefined; +}; + +jQuery.fx.timer = function( timer ) { + jQuery.timers.push( timer ); + jQuery.fx.start(); +}; + +jQuery.fx.interval = 13; +jQuery.fx.start = function() { + if ( inProgress ) { + return; + } + + inProgress = true; + schedule(); +}; + +jQuery.fx.stop = function() { + inProgress = null; +}; + +jQuery.fx.speeds = { + slow: 600, + fast: 200, + + // Default speed + _default: 400 +}; + + +// Based off of the plugin by Clint Helfers, with permission. +// https://web.archive.org/web/20100324014747/http://blindsignals.com/index.php/2009/07/jquery-delay/ +jQuery.fn.delay = function( time, type ) { + time = jQuery.fx ? 
jQuery.fx.speeds[ time ] || time : time; + type = type || "fx"; + + return this.queue( type, function( next, hooks ) { + var timeout = window.setTimeout( next, time ); + hooks.stop = function() { + window.clearTimeout( timeout ); + }; + } ); +}; + + +( function() { + var input = document.createElement( "input" ), + select = document.createElement( "select" ), + opt = select.appendChild( document.createElement( "option" ) ); + + input.type = "checkbox"; + + // Support: Android <=4.3 only + // Default value for a checkbox should be "on" + support.checkOn = input.value !== ""; + + // Support: IE <=11 only + // Must access selectedIndex to make default options select + support.optSelected = opt.selected; + + // Support: IE <=11 only + // An input loses its value after becoming a radio + input = document.createElement( "input" ); + input.value = "t"; + input.type = "radio"; + support.radioValue = input.value === "t"; +} )(); + + +var boolHook, + attrHandle = jQuery.expr.attrHandle; + +jQuery.fn.extend( { + attr: function( name, value ) { + return access( this, jQuery.attr, name, value, arguments.length > 1 ); + }, + + removeAttr: function( name ) { + return this.each( function() { + jQuery.removeAttr( this, name ); + } ); + } +} ); + +jQuery.extend( { + attr: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set attributes on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || nType === 2 ) { + return; + } + + // Fallback to prop when attributes are not supported + if ( typeof elem.getAttribute === "undefined" ) { + return jQuery.prop( elem, name, value ); + } + + // Attribute hooks are determined by the lowercase version + // Grab necessary hook if one is defined + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + hooks = jQuery.attrHooks[ name.toLowerCase() ] || + ( jQuery.expr.match.bool.test( name ) ? boolHook : undefined ); + } + + if ( value !== undefined ) { + if ( value === null ) { + jQuery.removeAttr( elem, name ); + return; + } + + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + elem.setAttribute( name, value + "" ); + return value; + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + ret = jQuery.find.attr( elem, name ); + + // Non-existent attributes return null, we normalize to undefined + return ret == null ? 
undefined : ret; + }, + + attrHooks: { + type: { + set: function( elem, value ) { + if ( !support.radioValue && value === "radio" && + nodeName( elem, "input" ) ) { + var val = elem.value; + elem.setAttribute( "type", value ); + if ( val ) { + elem.value = val; + } + return value; + } + } + } + }, + + removeAttr: function( elem, value ) { + var name, + i = 0, + + // Attribute names can contain non-HTML whitespace characters + // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 + attrNames = value && value.match( rnothtmlwhite ); + + if ( attrNames && elem.nodeType === 1 ) { + while ( ( name = attrNames[ i++ ] ) ) { + elem.removeAttribute( name ); + } + } + } +} ); + +// Hooks for boolean attributes +boolHook = { + set: function( elem, value, name ) { + if ( value === false ) { + + // Remove boolean attributes when set to false + jQuery.removeAttr( elem, name ); + } else { + elem.setAttribute( name, name ); + } + return name; + } +}; + +jQuery.each( jQuery.expr.match.bool.source.match( /\w+/g ), function( _i, name ) { + var getter = attrHandle[ name ] || jQuery.find.attr; + + attrHandle[ name ] = function( elem, name, isXML ) { + var ret, handle, + lowercaseName = name.toLowerCase(); + + if ( !isXML ) { + + // Avoid an infinite loop by temporarily removing this function from the getter + handle = attrHandle[ lowercaseName ]; + attrHandle[ lowercaseName ] = ret; + ret = getter( elem, name, isXML ) != null ? + lowercaseName : + null; + attrHandle[ lowercaseName ] = handle; + } + return ret; + }; +} ); + + + + +var rfocusable = /^(?:input|select|textarea|button)$/i, + rclickable = /^(?:a|area)$/i; + +jQuery.fn.extend( { + prop: function( name, value ) { + return access( this, jQuery.prop, name, value, arguments.length > 1 ); + }, + + removeProp: function( name ) { + return this.each( function() { + delete this[ jQuery.propFix[ name ] || name ]; + } ); + } +} ); + +jQuery.extend( { + prop: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set properties on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || nType === 2 ) { + return; + } + + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + + // Fix name and attach hooks + name = jQuery.propFix[ name ] || name; + hooks = jQuery.propHooks[ name ]; + } + + if ( value !== undefined ) { + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + return ( elem[ name ] = value ); + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + return elem[ name ]; + }, + + propHooks: { + tabIndex: { + get: function( elem ) { + + // Support: IE <=9 - 11 only + // elem.tabIndex doesn't always return the + // correct value when it hasn't been explicitly set + // https://web.archive.org/web/20141116233347/http://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/ + // Use proper attribute retrieval(#12072) + var tabindex = jQuery.find.attr( elem, "tabindex" ); + + if ( tabindex ) { + return parseInt( tabindex, 10 ); + } + + if ( + rfocusable.test( elem.nodeName ) || + rclickable.test( elem.nodeName ) && + elem.href + ) { + return 0; + } + + return -1; + } + } + }, + + propFix: { + "for": "htmlFor", + "class": "className" + } +} ); + +// Support: IE <=11 only +// Accessing the selectedIndex property +// forces the browser to respect setting selected +// on the option +// The getter ensures a default option is selected +// when in an 
optgroup +// eslint rule "no-unused-expressions" is disabled for this code +// since it considers such accessions noop +if ( !support.optSelected ) { + jQuery.propHooks.selected = { + get: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent && parent.parentNode ) { + parent.parentNode.selectedIndex; + } + return null; + }, + set: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent ) { + parent.selectedIndex; + + if ( parent.parentNode ) { + parent.parentNode.selectedIndex; + } + } + } + }; +} + +jQuery.each( [ + "tabIndex", + "readOnly", + "maxLength", + "cellSpacing", + "cellPadding", + "rowSpan", + "colSpan", + "useMap", + "frameBorder", + "contentEditable" +], function() { + jQuery.propFix[ this.toLowerCase() ] = this; +} ); + + + + + // Strip and collapse whitespace according to HTML spec + // https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace + function stripAndCollapse( value ) { + var tokens = value.match( rnothtmlwhite ) || []; + return tokens.join( " " ); + } + + +function getClass( elem ) { + return elem.getAttribute && elem.getAttribute( "class" ) || ""; +} + +function classesToArray( value ) { + if ( Array.isArray( value ) ) { + return value; + } + if ( typeof value === "string" ) { + return value.match( rnothtmlwhite ) || []; + } + return []; +} + +jQuery.fn.extend( { + addClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).addClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + classes = classesToArray( value ); + + if ( classes.length ) { + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + if ( cur.indexOf( " " + clazz + " " ) < 0 ) { + cur += clazz + " "; + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + removeClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + if ( !arguments.length ) { + return this.attr( "class", "" ); + } + + classes = classesToArray( value ); + + if ( classes.length ) { + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + + // This expression is here for better compressibility (see addClass) + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + + // Remove *all* instances + while ( cur.indexOf( " " + clazz + " " ) > -1 ) { + cur = cur.replace( " " + clazz + " ", " " ); + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + toggleClass: function( value, stateVal ) { + var type = typeof value, + isValidValue = type === "string" || Array.isArray( value ); + + if ( typeof stateVal === "boolean" && isValidValue ) { + return stateVal ? 
this.addClass( value ) : this.removeClass( value ); + } + + if ( isFunction( value ) ) { + return this.each( function( i ) { + jQuery( this ).toggleClass( + value.call( this, i, getClass( this ), stateVal ), + stateVal + ); + } ); + } + + return this.each( function() { + var className, i, self, classNames; + + if ( isValidValue ) { + + // Toggle individual class names + i = 0; + self = jQuery( this ); + classNames = classesToArray( value ); + + while ( ( className = classNames[ i++ ] ) ) { + + // Check each className given, space separated list + if ( self.hasClass( className ) ) { + self.removeClass( className ); + } else { + self.addClass( className ); + } + } + + // Toggle whole class name + } else if ( value === undefined || type === "boolean" ) { + className = getClass( this ); + if ( className ) { + + // Store className if set + dataPriv.set( this, "__className__", className ); + } + + // If the element has a class name or if we're passed `false`, + // then remove the whole classname (if there was one, the above saved it). + // Otherwise bring back whatever was previously saved (if anything), + // falling back to the empty string if nothing was stored. + if ( this.setAttribute ) { + this.setAttribute( "class", + className || value === false ? + "" : + dataPriv.get( this, "__className__" ) || "" + ); + } + } + } ); + }, + + hasClass: function( selector ) { + var className, elem, + i = 0; + + className = " " + selector + " "; + while ( ( elem = this[ i++ ] ) ) { + if ( elem.nodeType === 1 && + ( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) { + return true; + } + } + + return false; + } +} ); + + + + +var rreturn = /\r/g; + +jQuery.fn.extend( { + val: function( value ) { + var hooks, ret, valueIsFunction, + elem = this[ 0 ]; + + if ( !arguments.length ) { + if ( elem ) { + hooks = jQuery.valHooks[ elem.type ] || + jQuery.valHooks[ elem.nodeName.toLowerCase() ]; + + if ( hooks && + "get" in hooks && + ( ret = hooks.get( elem, "value" ) ) !== undefined + ) { + return ret; + } + + ret = elem.value; + + // Handle most common string cases + if ( typeof ret === "string" ) { + return ret.replace( rreturn, "" ); + } + + // Handle cases where value is null/undef or number + return ret == null ? "" : ret; + } + + return; + } + + valueIsFunction = isFunction( value ); + + return this.each( function( i ) { + var val; + + if ( this.nodeType !== 1 ) { + return; + } + + if ( valueIsFunction ) { + val = value.call( this, i, jQuery( this ).val() ); + } else { + val = value; + } + + // Treat null/undefined as ""; convert numbers to string + if ( val == null ) { + val = ""; + + } else if ( typeof val === "number" ) { + val += ""; + + } else if ( Array.isArray( val ) ) { + val = jQuery.map( val, function( value ) { + return value == null ? "" : value + ""; + } ); + } + + hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ]; + + // If set returns undefined, fall back to normal setting + if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) { + this.value = val; + } + } ); + } +} ); + +jQuery.extend( { + valHooks: { + option: { + get: function( elem ) { + + var val = jQuery.find.attr( elem, "value" ); + return val != null ? 
+ val : + + // Support: IE <=10 - 11 only + // option.text throws exceptions (#14686, #14858) + // Strip and collapse whitespace + // https://html.spec.whatwg.org/#strip-and-collapse-whitespace + stripAndCollapse( jQuery.text( elem ) ); + } + }, + select: { + get: function( elem ) { + var value, option, i, + options = elem.options, + index = elem.selectedIndex, + one = elem.type === "select-one", + values = one ? null : [], + max = one ? index + 1 : options.length; + + if ( index < 0 ) { + i = max; + + } else { + i = one ? index : 0; + } + + // Loop through all the selected options + for ( ; i < max; i++ ) { + option = options[ i ]; + + // Support: IE <=9 only + // IE8-9 doesn't update selected after form reset (#2551) + if ( ( option.selected || i === index ) && + + // Don't return options that are disabled or in a disabled optgroup + !option.disabled && + ( !option.parentNode.disabled || + !nodeName( option.parentNode, "optgroup" ) ) ) { + + // Get the specific value for the option + value = jQuery( option ).val(); + + // We don't need an array for one selects + if ( one ) { + return value; + } + + // Multi-Selects return an array + values.push( value ); + } + } + + return values; + }, + + set: function( elem, value ) { + var optionSet, option, + options = elem.options, + values = jQuery.makeArray( value ), + i = options.length; + + while ( i-- ) { + option = options[ i ]; + + /* eslint-disable no-cond-assign */ + + if ( option.selected = + jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1 + ) { + optionSet = true; + } + + /* eslint-enable no-cond-assign */ + } + + // Force browsers to behave consistently when non-matching value is set + if ( !optionSet ) { + elem.selectedIndex = -1; + } + return values; + } + } + } +} ); + +// Radios and checkboxes getter/setter +jQuery.each( [ "radio", "checkbox" ], function() { + jQuery.valHooks[ this ] = { + set: function( elem, value ) { + if ( Array.isArray( value ) ) { + return ( elem.checked = jQuery.inArray( jQuery( elem ).val(), value ) > -1 ); + } + } + }; + if ( !support.checkOn ) { + jQuery.valHooks[ this ].get = function( elem ) { + return elem.getAttribute( "value" ) === null ? "on" : elem.value; + }; + } +} ); + + + + +// Return jQuery for attributes-only inclusion + + +support.focusin = "onfocusin" in window; + + +var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/, + stopPropagationCallback = function( e ) { + e.stopPropagation(); + }; + +jQuery.extend( jQuery.event, { + + trigger: function( event, data, elem, onlyHandlers ) { + + var i, cur, tmp, bubbleType, ontype, handle, special, lastElement, + eventPath = [ elem || document ], + type = hasOwn.call( event, "type" ) ? event.type : event, + namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : []; + + cur = lastElement = tmp = elem = elem || document; + + // Don't do events on text and comment nodes + if ( elem.nodeType === 3 || elem.nodeType === 8 ) { + return; + } + + // focus/blur morphs to focusin/out; ensure we're not firing them right now + if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { + return; + } + + if ( type.indexOf( "." ) > -1 ) { + + // Namespaced trigger; create a regexp to match event type in handle() + namespaces = type.split( "." ); + type = namespaces.shift(); + namespaces.sort(); + } + ontype = type.indexOf( ":" ) < 0 && "on" + type; + + // Caller can pass in a jQuery.Event object, Object, or just an event type string + event = event[ jQuery.expando ] ? 
+ event : + new jQuery.Event( type, typeof event === "object" && event ); + + // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) + event.isTrigger = onlyHandlers ? 2 : 3; + event.namespace = namespaces.join( "." ); + event.rnamespace = event.namespace ? + new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) : + null; + + // Clean up the event in case it is being reused + event.result = undefined; + if ( !event.target ) { + event.target = elem; + } + + // Clone any incoming data and prepend the event, creating the handler arg list + data = data == null ? + [ event ] : + jQuery.makeArray( data, [ event ] ); + + // Allow special events to draw outside the lines + special = jQuery.event.special[ type ] || {}; + if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { + return; + } + + // Determine event propagation path in advance, per W3C events spec (#9951) + // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) + if ( !onlyHandlers && !special.noBubble && !isWindow( elem ) ) { + + bubbleType = special.delegateType || type; + if ( !rfocusMorph.test( bubbleType + type ) ) { + cur = cur.parentNode; + } + for ( ; cur; cur = cur.parentNode ) { + eventPath.push( cur ); + tmp = cur; + } + + // Only add window if we got to document (e.g., not plain obj or detached DOM) + if ( tmp === ( elem.ownerDocument || document ) ) { + eventPath.push( tmp.defaultView || tmp.parentWindow || window ); + } + } + + // Fire handlers on the event path + i = 0; + while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) { + lastElement = cur; + event.type = i > 1 ? + bubbleType : + special.bindType || type; + + // jQuery handler + handle = ( dataPriv.get( cur, "events" ) || Object.create( null ) )[ event.type ] && + dataPriv.get( cur, "handle" ); + if ( handle ) { + handle.apply( cur, data ); + } + + // Native handler + handle = ontype && cur[ ontype ]; + if ( handle && handle.apply && acceptData( cur ) ) { + event.result = handle.apply( cur, data ); + if ( event.result === false ) { + event.preventDefault(); + } + } + } + event.type = type; + + // If nobody prevented the default action, do it now + if ( !onlyHandlers && !event.isDefaultPrevented() ) { + + if ( ( !special._default || + special._default.apply( eventPath.pop(), data ) === false ) && + acceptData( elem ) ) { + + // Call a native DOM method on the target with the same name as the event. 
+ // Don't do default actions on window, that's where global variables be (#6170) + if ( ontype && isFunction( elem[ type ] ) && !isWindow( elem ) ) { + + // Don't re-trigger an onFOO event when we call its FOO() method + tmp = elem[ ontype ]; + + if ( tmp ) { + elem[ ontype ] = null; + } + + // Prevent re-triggering of the same event, since we already bubbled it above + jQuery.event.triggered = type; + + if ( event.isPropagationStopped() ) { + lastElement.addEventListener( type, stopPropagationCallback ); + } + + elem[ type ](); + + if ( event.isPropagationStopped() ) { + lastElement.removeEventListener( type, stopPropagationCallback ); + } + + jQuery.event.triggered = undefined; + + if ( tmp ) { + elem[ ontype ] = tmp; + } + } + } + } + + return event.result; + }, + + // Piggyback on a donor event to simulate a different one + // Used only for `focus(in | out)` events + simulate: function( type, elem, event ) { + var e = jQuery.extend( + new jQuery.Event(), + event, + { + type: type, + isSimulated: true + } + ); + + jQuery.event.trigger( e, null, elem ); + } + +} ); + +jQuery.fn.extend( { + + trigger: function( type, data ) { + return this.each( function() { + jQuery.event.trigger( type, data, this ); + } ); + }, + triggerHandler: function( type, data ) { + var elem = this[ 0 ]; + if ( elem ) { + return jQuery.event.trigger( type, data, elem, true ); + } + } +} ); + + +// Support: Firefox <=44 +// Firefox doesn't have focus(in | out) events +// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787 +// +// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1 +// focus(in | out) events fire after focus & blur events, +// which is spec violation - http://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order +// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857 +if ( !support.focusin ) { + jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) { + + // Attach a single capturing handler on the document while someone wants focusin/focusout + var handler = function( event ) { + jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) ); + }; + + jQuery.event.special[ fix ] = { + setup: function() { + + // Handle: regular nodes (via `this.ownerDocument`), window + // (via `this.document`) & document (via `this`). + var doc = this.ownerDocument || this.document || this, + attaches = dataPriv.access( doc, fix ); + + if ( !attaches ) { + doc.addEventListener( orig, handler, true ); + } + dataPriv.access( doc, fix, ( attaches || 0 ) + 1 ); + }, + teardown: function() { + var doc = this.ownerDocument || this.document || this, + attaches = dataPriv.access( doc, fix ) - 1; + + if ( !attaches ) { + doc.removeEventListener( orig, handler, true ); + dataPriv.remove( doc, fix ); + + } else { + dataPriv.access( doc, fix, attaches ); + } + } + }; + } ); +} +var location = window.location; + +var nonce = { guid: Date.now() }; + +var rquery = ( /\?/ ); + + + +// Cross-browser xml parsing +jQuery.parseXML = function( data ) { + var xml, parserErrorElem; + if ( !data || typeof data !== "string" ) { + return null; + } + + // Support: IE 9 - 11 only + // IE throws on parseFromString with invalid input. + try { + xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" ); + } catch ( e ) {} + + parserErrorElem = xml && xml.getElementsByTagName( "parsererror" )[ 0 ]; + if ( !xml || parserErrorElem ) { + jQuery.error( "Invalid XML: " + ( + parserErrorElem ? 
+ jQuery.map( parserErrorElem.childNodes, function( el ) { + return el.textContent; + } ).join( "\n" ) : + data + ) ); + } + return xml; +}; + + +var + rbracket = /\[\]$/, + rCRLF = /\r?\n/g, + rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, + rsubmittable = /^(?:input|select|textarea|keygen)/i; + +function buildParams( prefix, obj, traditional, add ) { + var name; + + if ( Array.isArray( obj ) ) { + + // Serialize array item. + jQuery.each( obj, function( i, v ) { + if ( traditional || rbracket.test( prefix ) ) { + + // Treat each array item as a scalar. + add( prefix, v ); + + } else { + + // Item is non-scalar (array or object), encode its numeric index. + buildParams( + prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]", + v, + traditional, + add + ); + } + } ); + + } else if ( !traditional && toType( obj ) === "object" ) { + + // Serialize object item. + for ( name in obj ) { + buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); + } + + } else { + + // Serialize scalar item. + add( prefix, obj ); + } +} + +// Serialize an array of form elements or a set of +// key/values into a query string +jQuery.param = function( a, traditional ) { + var prefix, + s = [], + add = function( key, valueOrFunction ) { + + // If value is a function, invoke it and use its return value + var value = isFunction( valueOrFunction ) ? + valueOrFunction() : + valueOrFunction; + + s[ s.length ] = encodeURIComponent( key ) + "=" + + encodeURIComponent( value == null ? "" : value ); + }; + + if ( a == null ) { + return ""; + } + + // If an array was passed in, assume that it is an array of form elements. + if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { + + // Serialize the form elements + jQuery.each( a, function() { + add( this.name, this.value ); + } ); + + } else { + + // If traditional, encode the "old" way (the way 1.3.2 or older + // did it), otherwise encode params recursively. + for ( prefix in a ) { + buildParams( prefix, a[ prefix ], traditional, add ); + } + } + + // Return the resulting serialization + return s.join( "&" ); +}; + +jQuery.fn.extend( { + serialize: function() { + return jQuery.param( this.serializeArray() ); + }, + serializeArray: function() { + return this.map( function() { + + // Can add propHook for "elements" to filter or add form elements + var elements = jQuery.prop( this, "elements" ); + return elements ? 
jQuery.makeArray( elements ) : this; + } ).filter( function() { + var type = this.type; + + // Use .is( ":disabled" ) so that fieldset[disabled] works + return this.name && !jQuery( this ).is( ":disabled" ) && + rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && + ( this.checked || !rcheckableType.test( type ) ); + } ).map( function( _i, elem ) { + var val = jQuery( this ).val(); + + if ( val == null ) { + return null; + } + + if ( Array.isArray( val ) ) { + return jQuery.map( val, function( val ) { + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ); + } + + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ).get(); + } +} ); + + +var + r20 = /%20/g, + rhash = /#.*$/, + rantiCache = /([?&])_=[^&]*/, + rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg, + + // #7653, #8125, #8152: local protocol detection + rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/, + rnoContent = /^(?:GET|HEAD)$/, + rprotocol = /^\/\//, + + /* Prefilters + * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example) + * 2) These are called: + * - BEFORE asking for a transport + * - AFTER param serialization (s.data is a string if s.processData is true) + * 3) key is the dataType + * 4) the catchall symbol "*" can be used + * 5) execution will start with transport dataType and THEN continue down to "*" if needed + */ + prefilters = {}, + + /* Transports bindings + * 1) key is the dataType + * 2) the catchall symbol "*" can be used + * 3) selection will start with transport dataType and THEN go to "*" if needed + */ + transports = {}, + + // Avoid comment-prolog char sequence (#10098); must appease lint and evade compression + allTypes = "*/".concat( "*" ), + + // Anchor tag for parsing the document origin + originAnchor = document.createElement( "a" ); + +originAnchor.href = location.href; + +// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport +function addToPrefiltersOrTransports( structure ) { + + // dataTypeExpression is optional and defaults to "*" + return function( dataTypeExpression, func ) { + + if ( typeof dataTypeExpression !== "string" ) { + func = dataTypeExpression; + dataTypeExpression = "*"; + } + + var dataType, + i = 0, + dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || []; + + if ( isFunction( func ) ) { + + // For each dataType in the dataTypeExpression + while ( ( dataType = dataTypes[ i++ ] ) ) { + + // Prepend if requested + if ( dataType[ 0 ] === "+" ) { + dataType = dataType.slice( 1 ) || "*"; + ( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func ); + + // Otherwise append + } else { + ( structure[ dataType ] = structure[ dataType ] || [] ).push( func ); + } + } + } + }; +} + +// Base inspection function for prefilters and transports +function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) { + + var inspected = {}, + seekingTransport = ( structure === transports ); + + function inspect( dataType ) { + var selected; + inspected[ dataType ] = true; + jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) { + var dataTypeOrTransport = prefilterOrFactory( options, originalOptions, jqXHR ); + if ( typeof dataTypeOrTransport === "string" && + !seekingTransport && !inspected[ dataTypeOrTransport ] ) { + + options.dataTypes.unshift( dataTypeOrTransport ); + inspect( dataTypeOrTransport ); + return false; + } else if ( seekingTransport ) { + return !( selected = dataTypeOrTransport ); + } + } 
); + return selected; + } + + return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" ); +} + +// A special extend for ajax options +// that takes "flat" options (not to be deep extended) +// Fixes #9887 +function ajaxExtend( target, src ) { + var key, deep, + flatOptions = jQuery.ajaxSettings.flatOptions || {}; + + for ( key in src ) { + if ( src[ key ] !== undefined ) { + ( flatOptions[ key ] ? target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; + } + } + if ( deep ) { + jQuery.extend( true, target, deep ); + } + + return target; +} + +/* Handles responses to an ajax request: + * - finds the right dataType (mediates between content-type and expected dataType) + * - returns the corresponding response + */ +function ajaxHandleResponses( s, jqXHR, responses ) { + + var ct, type, finalDataType, firstDataType, + contents = s.contents, + dataTypes = s.dataTypes; + + // Remove auto dataType and get content-type in the process + while ( dataTypes[ 0 ] === "*" ) { + dataTypes.shift(); + if ( ct === undefined ) { + ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); + } + } + + // Check if we're dealing with a known content-type + if ( ct ) { + for ( type in contents ) { + if ( contents[ type ] && contents[ type ].test( ct ) ) { + dataTypes.unshift( type ); + break; + } + } + } + + // Check to see if we have a response for the expected dataType + if ( dataTypes[ 0 ] in responses ) { + finalDataType = dataTypes[ 0 ]; + } else { + + // Try convertible dataTypes + for ( type in responses ) { + if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { + finalDataType = type; + break; + } + if ( !firstDataType ) { + firstDataType = type; + } + } + + // Or just use first one + finalDataType = finalDataType || firstDataType; + } + + // If we found a dataType + // We add the dataType to the list if needed + // and return the corresponding response + if ( finalDataType ) { + if ( finalDataType !== dataTypes[ 0 ] ) { + dataTypes.unshift( finalDataType ); + } + return responses[ finalDataType ]; + } +} + +/* Chain conversions given the request and the original response + * Also sets the responseXXX fields on the jqXHR instance + */ +function ajaxConvert( s, response, jqXHR, isSuccess ) { + var conv2, current, conv, tmp, prev, + converters = {}, + + // Work with a copy of dataTypes in case we need to modify it for conversion + dataTypes = s.dataTypes.slice(); + + // Create converters map with lowercased keys + if ( dataTypes[ 1 ] ) { + for ( conv in s.converters ) { + converters[ conv.toLowerCase() ] = s.converters[ conv ]; + } + } + + current = dataTypes.shift(); + + // Convert to each sequential dataType + while ( current ) { + + if ( s.responseFields[ current ] ) { + jqXHR[ s.responseFields[ current ] ] = response; + } + + // Apply the dataFilter if provided + if ( !prev && isSuccess && s.dataFilter ) { + response = s.dataFilter( response, s.dataType ); + } + + prev = current; + current = dataTypes.shift(); + + if ( current ) { + + // There's only work to do if current dataType is non-auto + if ( current === "*" ) { + + current = prev; + + // Convert response if prev dataType is non-auto and differs from current + } else if ( prev !== "*" && prev !== current ) { + + // Seek a direct converter + conv = converters[ prev + " " + current ] || converters[ "* " + current ]; + + // If none found, seek a pair + if ( !conv ) { + for ( conv2 in converters ) { + + // If conv2 outputs current + tmp = conv2.split( " " ); + if ( tmp[ 1 ] === current ) { + + // If prev 
can be converted to accepted input + conv = converters[ prev + " " + tmp[ 0 ] ] || + converters[ "* " + tmp[ 0 ] ]; + if ( conv ) { + + // Condense equivalence converters + if ( conv === true ) { + conv = converters[ conv2 ]; + + // Otherwise, insert the intermediate dataType + } else if ( converters[ conv2 ] !== true ) { + current = tmp[ 0 ]; + dataTypes.unshift( tmp[ 1 ] ); + } + break; + } + } + } + } + + // Apply converter (if not an equivalence) + if ( conv !== true ) { + + // Unless errors are allowed to bubble, catch and return them + if ( conv && s.throws ) { + response = conv( response ); + } else { + try { + response = conv( response ); + } catch ( e ) { + return { + state: "parsererror", + error: conv ? e : "No conversion from " + prev + " to " + current + }; + } + } + } + } + } + } + + return { state: "success", data: response }; +} + +jQuery.extend( { + + // Counter for holding the number of active queries + active: 0, + + // Last-Modified header cache for next request + lastModified: {}, + etag: {}, + + ajaxSettings: { + url: location.href, + type: "GET", + isLocal: rlocalProtocol.test( location.protocol ), + global: true, + processData: true, + async: true, + contentType: "application/x-www-form-urlencoded; charset=UTF-8", + + /* + timeout: 0, + data: null, + dataType: null, + username: null, + password: null, + cache: null, + throws: false, + traditional: false, + headers: {}, + */ + + accepts: { + "*": allTypes, + text: "text/plain", + html: "text/html", + xml: "application/xml, text/xml", + json: "application/json, text/javascript" + }, + + contents: { + xml: /\bxml\b/, + html: /\bhtml/, + json: /\bjson\b/ + }, + + responseFields: { + xml: "responseXML", + text: "responseText", + json: "responseJSON" + }, + + // Data converters + // Keys separate source (or catchall "*") and destination types with a single space + converters: { + + // Convert anything to text + "* text": String, + + // Text to html (true = no transformation) + "text html": true, + + // Evaluate text as a json expression + "text json": JSON.parse, + + // Parse text as xml + "text xml": jQuery.parseXML + }, + + // For options that shouldn't be deep extended: + // you can add your own custom options here if + // and when you create one that shouldn't be + // deep extended (see ajaxExtend) + flatOptions: { + url: true, + context: true + } + }, + + // Creates a full fledged settings object into target + // with both ajaxSettings and settings fields. + // If target is omitted, writes into ajaxSettings. + ajaxSetup: function( target, settings ) { + return settings ? 
diff --git a/docs/0.12.0/html/add_new_model_to_data_labeler.ipynb
b/docs/0.12.0/html/add_new_model_to_data_labeler.ipynb new file mode 100644 index 000000000..1495e6a85 --- /dev/null +++ b/docs/0.12.0/html/add_new_model_to_data_labeler.ipynb @@ -0,0 +1,488 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "228bb2a6", + "metadata": {}, + "source": [ + "# Adding new model to the existing DataLabeler pipeline" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cab7a569", + "metadata": {}, + "source": [ + "Consider the case when we would like to explore different character-level neural network models and evaluate their performance on different datasets. The existing DataLabeler in the DataProfiler library already contains a preprocessor, a postprocessor, and a character-level CNN (Convolutional Neural Network) model that are combined to work on such data. All we need is to build additional model classes that inherit the main functionalities from the CNN model and also adapt the model construction to the desired architectures. In this example, we define such a new model to be used with the Data Labeler component of the Data Profiler. In particular, a character-level LSTM (Long Short-Term Memory) model is implemented, then integrated into the DataLabeler pipeline to be trained with a tabular dataset. The process includes the following steps:\n", + "\n", + " - Build a new character-level LSTM model that inherits the CNN model\n", + " - Load the DataLabeler from the DataProfiler\n", + " - Swap the existing CNN model with the new LSTM model\n", + " - Train the data labeler pipeline on a given dataset\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16624c48", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e90728ab", + "metadata": {}, + "source": [ + "## Dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3d61981c", + "metadata": {}, + "source": [ + "In this example, we use a structured dataset, the aws honeypot dataset, given in the test folder of the library. This dataset is first read by the Data Reader class of the Data Profiler, then split into training and test data to be used in the next sections." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f031fe06", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data\n", + "data = dp.Data(\"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\")\n", + "df_data = data.data\n", + "\n", + "# split data to training and test set\n", + "split_ratio = 0.2\n", + "df_data = df_data.sample(frac=1).reset_index(drop=True)\n", + "data_train = df_data[:int((1 - split_ratio) * len(df_data))]\n", + "data_test = df_data[int((1 - split_ratio) * len(df_data)):]\n", + "\n", + "df_data.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "745ed0d4", + "metadata": {}, + "source": [ + "## Implement a new character-level LSTM model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7375b0c0", + "metadata": {}, + "source": [ + "This new model is inherited from `CharacterLevelCnnModel` class, with some modifications on the following functions\n", + "\n", + "`__init__`: to add new parameters for the LSTM model. 
The new parameters, `size_lstm`, `rec_dropout`, `activation`, `recurrent_activation`, specify number of LSTM layers, activation function, and recurrent dropout ratio.\n", + "\n", + "`_validate_parameters`: to add additional checks on the new parameters for the LSTM model\n", + "\n", + "`_construct_model`: to construct the new LSTM model with the desired architecture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8568fb49", + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "from dataprofiler.labelers.character_level_cnn_model import (\n", + " CharacterLevelCnnModel,\n", + " create_glove_char,\n", + " build_embd_dictionary,\n", + ")\n", + "from dataprofiler.labelers.base_model import BaseModel\n", + "from dataprofiler.labelers.labeler_utils import F1Score\n", + "\n", + "\n", + "# CharacterLevelLstmModel derives from CharacterLevelCnnModel\n", + "#########################################################\n", + "#########################################################\n", + "class CharacterLevelLstmModel(CharacterLevelCnnModel):\n", + " # boolean if the label mapping requires the mapping for index 0 reserved\n", + " requires_zero_mapping = True\n", + "\n", + " def __init__(self, label_mapping=None, parameters=None):\n", + " \"\"\"\n", + " LSTM Model Initializer\n", + " \"\"\"\n", + "\n", + " # parameter initialization\n", + " if not parameters:\n", + " parameters = {}\n", + " parameters.setdefault(\"max_length\", 3400)\n", + " parameters.setdefault(\"max_char_encoding_id\", 127)\n", + " parameters.setdefault(\"dim_embed\", 64)\n", + " parameters.setdefault(\"size_fc\", [32, 32])\n", + " parameters.setdefault(\"dropout\", 0.1)\n", + " # new parameters for LSTM model\n", + " #########################################################\n", + " #########################################################\n", + " parameters.setdefault(\"size_lstm\", [64])\n", + " parameters.setdefault(\"rec_dropout\", 0.1)\n", + " parameters.setdefault(\"activation\", \"tanh\")\n", + " parameters.setdefault(\"recurrent_activation\", \"sigmoid\")\n", + " #########################################################\n", + " #########################################################\n", + " parameters.setdefault(\"default_label\", \"UNKNOWN\")\n", + " parameters[\"pad_label\"] = \"PAD\"\n", + " self._epoch_id = 0\n", + "\n", + " # reconstruct flags for model\n", + " self._model_num_labels = 0\n", + " self._model_default_ind = -1\n", + "\n", + " BaseModel.__init__(self, label_mapping, parameters)\n", + "\n", + " def _validate_parameters(self, parameters):\n", + " \"\"\"\n", + " Validate the parameters sent in. 
Raise error if invalid parameters are\n", + " present.\n", + " \"\"\"\n", + " errors = []\n", + " list_of_necessary_params = [\n", + " \"max_length\",\n", + " \"max_char_encoding_id\",\n", + " \"dim_embed\",\n", + " \"size_fc\",\n", + " \"dropout\",\n", + " \"size_lstm\",\n", + " \"rec_dropout\",\n", + " \"activation\",\n", + " \"recurrent_activation\",\n", + " \"default_label\",\n", + " \"pad_label\",\n", + " ]\n", + " # Make sure the necessary parameters are present and valid.\n", + " for param in parameters:\n", + " if param in [\n", + " \"max_length\",\n", + " \"max_char_encoding_id\",\n", + " \"dim_embed\",\n", + " \"size_conv\",\n", + " ]:\n", + " if (\n", + " not isinstance(parameters[param], (int, float))\n", + " or parameters[param] < 0\n", + " ):\n", + " errors.append(\n", + " param + \" must be a valid integer or float \" \"greater than 0.\"\n", + " )\n", + " elif param in [\n", + " \"dropout\",\n", + " \"rec_dropout\",\n", + " ]: # additional check for rec_dropout\n", + " if (\n", + " not isinstance(parameters[param], (int, float))\n", + " or parameters[param] < 0\n", + " or parameters[param] > 1\n", + " ):\n", + " errors.append(\n", + " param + \" must be a valid integer or float \" \"from 0 to 1.\"\n", + " )\n", + " elif (\n", + " param == \"size_fc\" or param == \"size_lstm\"\n", + " ): # additional check for size_lstm\n", + " if (\n", + " not isinstance(parameters[param], list)\n", + " or len(parameters[param]) == 0\n", + " ):\n", + " errors.append(param + \" must be a non-empty list of \" \"integers.\")\n", + " else:\n", + " for item in parameters[param]:\n", + " if not isinstance(item, int):\n", + " errors.append(\n", + " param + \" must be a non-empty \" \"list of integers.\"\n", + " )\n", + " break\n", + " elif param in [\n", + " \"default_label\",\n", + " \"activation\",\n", + " \"recurrent_activation\",\n", + " ]: # additional check for activation and recurrent_activation\n", + " if not isinstance(parameters[param], str):\n", + " error = str(param) + \" must be a string.\"\n", + " errors.append(error)\n", + "\n", + " # Error if there are extra parameters thrown in\n", + " for param in parameters:\n", + " if param not in list_of_necessary_params:\n", + " errors.append(param + \" is not an accepted parameter.\")\n", + " if errors:\n", + " raise ValueError(\"\\n\".join(errors))\n", + "\n", + " def _construct_model(self):\n", + " \"\"\"\n", + " Model constructor for the data labeler. 
This also serves as a weight\n", + " reset.\n", + "\n", + " :return: None\n", + " \"\"\"\n", + " num_labels = self.num_labels\n", + " default_ind = self.label_mapping[self._parameters[\"default_label\"]]\n", + "\n", + " # Reset model\n", + " tf.keras.backend.clear_session()\n", + "\n", + " # generate glove embedding\n", + " create_glove_char(self._parameters[\"dim_embed\"])\n", + "\n", + " # generate model\n", + " self._model = tf.keras.models.Sequential()\n", + "\n", + " # default parameters\n", + " max_length = self._parameters[\"max_length\"]\n", + " max_char_encoding_id = self._parameters[\"max_char_encoding_id\"]\n", + "\n", + " # Encoding layer\n", + " def encoding_function(input_str):\n", + " char_in_vector = CharacterLevelLstmModel._char_encoding_layer(\n", + " input_str, max_char_encoding_id, max_length\n", + " )\n", + " return char_in_vector\n", + "\n", + " self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string))\n", + "\n", + " self._model.add(\n", + " tf.keras.layers.Lambda(encoding_function, output_shape=tuple([max_length]))\n", + " )\n", + "\n", + " # Create a pre-trained weight matrix\n", + " # character encoding indices range from 0 to max_char_encoding_id,\n", + " # we add one extra index for out-of-vocabulary character\n", + " embed_file = os.path.join(\n", + " \"../dataprofiler/labelers\",\n", + " \"embeddings/glove-reduced-{}D.txt\".format(self._parameters[\"dim_embed\"]),\n", + " )\n", + " embedding_matrix = np.zeros(\n", + " (max_char_encoding_id + 2, self._parameters[\"dim_embed\"])\n", + " )\n", + " embedding_dict = build_embd_dictionary(embed_file)\n", + "\n", + " input_shape = tuple([max_length])\n", + " # Fill in the weight matrix: let pad and space be 0s\n", + " for ascii_num in range(max_char_encoding_id):\n", + " if chr(ascii_num) in embedding_dict:\n", + " embedding_matrix[ascii_num + 1] = embedding_dict[chr(ascii_num)]\n", + "\n", + " self._model.add(\n", + " tf.keras.layers.Embedding(\n", + " max_char_encoding_id + 2,\n", + " self._parameters[\"dim_embed\"],\n", + " weights=[embedding_matrix],\n", + " input_length=input_shape[0],\n", + " trainable=True,\n", + " )\n", + " )\n", + "\n", + " # Add the lstm layers\n", + " #########################################################\n", + " #########################################################\n", + " for size in self._parameters[\"size_lstm\"]:\n", + " self._model.add(\n", + " tf.keras.layers.LSTM(\n", + " units=size,\n", + " recurrent_dropout=self._parameters[\"rec_dropout\"],\n", + " activation=self._parameters[\"activation\"],\n", + " recurrent_activation=self._parameters[\"recurrent_activation\"],\n", + " return_sequences=True,\n", + " )\n", + " )\n", + " if self._parameters[\"dropout\"]:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n", + " #########################################################\n", + " #########################################################\n", + "\n", + " # Add the fully connected layers\n", + " for size in self._parameters[\"size_fc\"]:\n", + " self._model.add(tf.keras.layers.Dense(units=size, activation=\"relu\"))\n", + " if self._parameters[\"dropout\"]:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n", + "\n", + " # Add the final Softmax layer\n", + " self._model.add(tf.keras.layers.Dense(num_labels, activation=\"softmax\"))\n", + "\n", + " # Output the model into a .pb file for TensorFlow\n", + " argmax_layer = tf.keras.backend.argmax(self._model.output)\n", + "\n", + " # Create confidence 
layers\n", + " final_predicted_layer = CharacterLevelLstmModel._argmax_threshold_layer(\n", + " num_labels, threshold=0.0, default_ind=default_ind\n", + " )\n", + "\n", + " argmax_outputs = self._model.outputs + [\n", + " argmax_layer,\n", + " final_predicted_layer(argmax_layer, self._model.output),\n", + " ]\n", + " self._model = tf.keras.Model(self._model.inputs, argmax_outputs)\n", + "\n", + " # Compile the model\n", + " softmax_output_layer_name = self._model.outputs[0].name.split(\"/\")[0]\n", + " losses = {softmax_output_layer_name: \"categorical_crossentropy\"}\n", + "\n", + " # use f1 score metric\n", + " f1_score_training = F1Score(num_classes=num_labels, average=\"micro\")\n", + " metrics = {softmax_output_layer_name: [\"acc\", f1_score_training]}\n", + "\n", + " self._model.compile(loss=losses, optimizer=\"adam\", metrics=metrics)\n", + "\n", + " self._epoch_id = 0\n", + " self._model_num_labels = num_labels\n", + " self._model_default_ind = default_ind" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d66bd25c", + "metadata": {}, + "source": [ + "## Integrate the new LSTM model to the DataLabeler" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "479f407a", + "metadata": {}, + "source": [ + "Once the LSTM model is built, it replaces the existing model in the DataLabeler pipeline, which is then trained on the given dataset. Note that, as the DataLabeler is trained on the above tabular dataset, its label mapping is updated by the list of column names in that dataset while training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb482ffe", + "metadata": {}, + "outputs": [], + "source": [ + "# get labels from the given dataset\n", + "value_label_df = data_train.reset_index(drop=True).melt()\n", + "value_label_df.columns = [1, 0] # labels=1, values=0 in that order\n", + "value_label_df = value_label_df.astype(str)\n", + "labels = value_label_df[1].unique().tolist()\n", + "\n", + "# create a new LSTM model\n", + "# set default label (one of the column names) to the model\n", + "model = CharacterLevelLstmModel(label_mapping=labels, parameters={'default_label': 'comment'})\n", + "\n", + "# add the new LSTM model to the data labeler\n", + "data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True)\n", + "data_labeler.set_model(model)\n", + "\n", + "# set default label (one of the column names) to the preprocessor and postprocessor\n", + "processor_params = {'default_label': 'comment'}\n", + "data_labeler._preprocessor.set_params(**processor_params)\n", + "data_labeler._postprocessor.set_params(**processor_params)\n", + "\n", + "# train the data labeler\n", + "save_dirpath=\"data_labeler_saved\"\n", + "if not os.path.exists(save_dirpath):\n", + " os.makedirs(save_dirpath)\n", + "\n", + "epochs=2\n", + "data_labeler.fit(\n", + " x=value_label_df[0], y=value_label_df[1], labels=labels, epochs=epochs)\n", + "if save_dirpath:\n", + " data_labeler.save_to_disk(save_dirpath)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "14b78c69", + "metadata": {}, + "source": [ + "The trained Data Labeler is then used by the Data Profiler to provide the prediction on the new dataset. In this example, all options except data labeler are disabled for the sake of presenting data labeler functionality. The results are given in the columnar format where true column types are given in the first column, and the predicted column labels are given in the second column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdfcf1d2", + "metadata": {}, + "outputs": [], + "source": [ + "# predict with the data labeler object\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"structured_options.text.is_enabled\": False, \n", + " \"int.is_enabled\": False, \n", + " \"float.is_enabled\": False, \n", + " \"order.is_enabled\": False, \n", + " \"category.is_enabled\": False, \n", + " \"datetime.is_enabled\": False,})\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': data_labeler})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "def get_structured_results(results):\n", + " columns = []\n", + " predictions = []\n", + " for col_report in results['data_stats']:\n", + " columns.append(col_report['column_name'])\n", + " predictions.append(col_report['data_label'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions})\n", + " return df_results\n", + "\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cc60ff8a", + "metadata": {}, + "source": [ + "In summary, users can define their own model, plug it in the DataLabeler pipeline, and train the labeler with the new dataset. Above, we show one example of adding the LSTM model to the pipeline. Interested users can implement other neural network models as desired with the same process." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/column_name_labeler_example.html b/docs/0.12.0/html/column_name_labeler_example.html new file mode 100644 index 000000000..bc9a57787 --- /dev/null +++ b/docs/0.12.0/html/column_name_labeler_example.html @@ -0,0 +1,822 @@ + + + + + + + + + ColumnName Labeler Tutorial - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +


+
+

ColumnName Labeler Tutorial

+

This notebook teaches how to use the existing ColumnNameModel:

+
  1. Loading and utilizing the pre-existing ColumnNameModel
  2. Running the labeler

First, let’s import the libraries needed for this example.

+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+from pprint import pprint
+
+import pandas as pd
+
+try:
+    import dataprofiler as dp
+except ImportError:
+    sys.path.insert(0, '../..')
+    import dataprofiler as dp
+
+
+
+
+

Loading and predicting using a pre-existing model using load_from_library

+

The easiest option for users is to use load_from_library, specifying the name of the labeler in the resources/ folder. This lets you quickly import and start predicting with any model available in the Data Profiler's library.

+
+
[ ]:
+
+
+
+labeler_from_library = dp.DataLabeler.load_from_library('column_name_labeler')
+
+
+
+
+
[ ]:
+
+
+
+labeler_from_library.predict(data=["ssn"])
+
+
+
+
+
+

Loading and using the pre-existing column name labeler using load_with_components

+

For example purposes here, we will import the existing ColumnName labeler via the load_with_components command from the dp.DataLabeler. This shows a bit more of the details of the data labeler's flow.

+
+
[ ]:
+
+
+
+parameters = {
+            "true_positive_dict": [
+                {"attribute": "ssn", "label": "ssn"},
+                {"attribute": "suffix", "label": "name"},
+                {"attribute": "my_home_address", "label": "address"},
+            ],
+            "false_positive_dict": [
+                {
+                    "attribute": "contract_number",
+                    "label": "ssn",
+                },
+                {
+                    "attribute": "role",
+                    "label": "name",
+                },
+                {
+                    "attribute": "send_address",
+                    "label": "address",
+                },
+            ],
+            "negative_threshold_config": 50,
+            "positive_threshold_config": 85,
+            "include_label": True,
+        }
+
+label_mapping = {"ssn": 1, "name": 2, "address": 3}
+
+
+
+
+
[ ]:
+
+
+
+# pre processor
+preprocessor = dp.labelers.data_processing.DirectPassPreprocessor()
+
+# model
+from dataprofiler.labelers.column_name_model import ColumnNameModel
+model = ColumnNameModel(
+    parameters=parameters,
+    label_mapping=label_mapping,
+)
+
+
+# post processor
+postprocessor = dp.labelers.data_processing.ColumnNameModelPostprocessor()
+
+
+
+
+
[ ]:
+
+
+
+data_labeler = dp.DataLabeler.load_with_components(
+    preprocessor=preprocessor,
+    model=model,
+    postprocessor=postprocessor,
+)
+data_labeler.model.help()
+
+
+
+
+
[ ]:
+
+
+
+pprint(data_labeler.label_mapping)
+
+
+
+
+
[ ]:
+
+
+
+pprint(data_labeler.model._parameters)
+
+
+
+
+

Predicting with the ColumnName labeler

+

In the prediction below, the data is passed through two stages in the background:

  1. compare_negative: the idea behind compare_negative is to first filter out any possibility of flagging a false positive in the model prediction. In this step, the confidence value is checked, and if the similarity is too close to a known false positive, that particular string in the data is removed and not passed on to compare_positive.
  2. compare_positive: finally, the remaining data is passed to the compare_positive step and checked for similarity with the true_positive_dict values. During this stage, the positive_threshold_config is used to filter the results to only those data values whose similarity is greater than or equal to the positive_threshold_config provided by the user.

+
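To make those two stages concrete, here is a minimal conceptual sketch of the filtering logic. It is not the library's actual implementation; similarity and predict_sketch are hypothetical helper names used only for illustration, with the standard-library difflib standing in for whatever string-similarity measure the model uses.

# Conceptual sketch only; similarity and predict_sketch are hypothetical helpers.
from difflib import SequenceMatcher

def similarity(a, b):
    # crude 0-100 similarity score between two strings
    return 100 * SequenceMatcher(None, a.lower(), b.lower()).ratio()

def predict_sketch(data, true_positive_dict, false_positive_dict,
                   negative_threshold_config=50, positive_threshold_config=85):
    results = {}
    for value in data:
        # 1) compare_negative: drop values that look too much like a known false positive
        if any(similarity(value, fp["attribute"]) >= negative_threshold_config
               for fp in false_positive_dict):
            continue
        # 2) compare_positive: keep only sufficiently strong matches to a true positive
        best = max(true_positive_dict, key=lambda tp: similarity(value, tp["attribute"]))
        score = similarity(value, best["attribute"])
        if score >= positive_threshold_config:
            results[value] = {"label": best["label"], "confidence": score}
    return results

Under this sketch, with the parameters defined earlier, a value such as "contract_number" would be dropped in the first stage, while "ssn" would pass through and be matched to the ssn label in the second stage.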
+
[ ]:
+
+
+
+# evaluate a prediction using the default parameters
+data_labeler.predict(data=["ssn", "name", "address"])
+
+
+
+
+
+
+

Replacing the parameters in the existing labeler

+

We can achieve this by:

  1. Setting the label mapping to the new labels
  2. Setting the model parameters which include: true_positive_dict, false_positive_dict, negative_threshold_config, positive_threshold_config, and include_label

+

where true_positive_dict and false_positive_dict are lists of dicts, negative_threshold_config and positive_threshold_config are integer values between 0 and 100, and include_label is a boolean value that determines if the output should include the prediction labels or only the confidence values.

+

Below, we create 4 labels, where other is the default_label.

+
+
[ ]:
+
+
+
+data_labeler.set_labels({'other': 0, "funky_one": 1, "funky_two": 2, "funky_three": 3})
+data_labeler.model.set_params(
+    true_positive_dict= [
+                {"attribute": "ssn", "label": "funky_one"},
+                {"attribute": "suffix", "label": "funky_two"},
+                {"attribute": "my_home_address", "label": "funky_three"},
+            ],
+    false_positive_dict=[
+                {
+                    "attribute": "contract_number",
+                    "label": "ssn",
+                },
+                {
+                    "attribute": "role",
+                    "label": "name",
+                },
+                {
+                    "attribute": "not_my_address",
+                    "label": "address",
+                },
+            ],
+    negative_threshold_config=50,
+    positive_threshold_config=85,
+    include_label=True,
+)
+data_labeler.label_mapping
+
+
+
+
+

Predicting with the new labels

+

Here we are testing the predict() method with brand new labels for label_mapping. As we can see, the new labels flow through to the output of the data labeler.

+
+
[ ]:
+
+
+
+data_labeler.predict(data=["ssn", "suffix"], predict_options=dict(show_confidences=True))
+
+
+
+
+
+
+

Saving the Data Labeler for future use

+
+
[ ]:
+
+
+
+if not os.path.isdir('new_column_name_labeler'):
+    os.mkdir('new_column_name_labeler')
+data_labeler.save_to_disk('new_column_name_labeler')
+
+
+
+
+
+

Loading the saved Data Labeler

+
+
[ ]:
+
+
+
+saved_labeler = dp.DataLabeler.load_from_disk('new_column_name_labeler')
+
+
+
+
+
[ ]:
+
+
+
+# ensuring the parameters are what we saved.
+print("label_mapping:")
+pprint(saved_labeler.label_mapping)
+print("\nmodel parameters:")
+pprint(saved_labeler.model._parameters)
+print()
+print("postprocessor: " + saved_labeler.postprocessor.__class__.__name__)
+
+
+
+
+
[ ]:
+
+
+
+# predicting with the loaded labeler.
+saved_labeler.predict(["ssn", "name", "address"])
+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/column_name_labeler_example.ipynb b/docs/0.12.0/html/column_name_labeler_example.ipynb new file mode 100644 index 000000000..6d3369698 --- /dev/null +++ b/docs/0.12.0/html/column_name_labeler_example.ipynb @@ -0,0 +1,364 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e04c382a-7c49-452b-b9bf-e448951c64fe", + "metadata": {}, + "source": [ + "# ColumnName Labeler Tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "6fb3ecb9-bc51-4c18-93d5-7991bbee5165", + "metadata": {}, + "source": [ + "This notebook teaches how to use the existing `ColumnNameModel`:\n", + "\n", + "1. Loading and utilizing the pre-existing `ColumnNameModel`\n", + "2. Run the labeler\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67c197b-d3ee-4896-a96f-cc3d043601d3", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "from pprint import pprint\n", + "\n", + "import pandas as pd\n", + "\n", + "try:\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " sys.path.insert(0, '../..')\n", + " import dataprofiler as dp" + ] + }, + { + "cell_type": "markdown", + "id": "35841215", + "metadata": {}, + "source": [ + "## Loading and predicting using a pre-existing model using `load_from_library`\n", + "\n", + "The easiest option for users is to `load_from_library` by specifying the name for the labeler in the `resources/` folder. Quickly import and start predicting with any model from the Data Profiler's library of models available." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46e36dd6", + "metadata": {}, + "outputs": [], + "source": [ + "labeler_from_library = dp.DataLabeler.load_from_library('column_name_labeler')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfa94868", + "metadata": {}, + "outputs": [], + "source": [ + "labeler_from_library.predict(data=[\"ssn\"])" + ] + }, + { + "cell_type": "markdown", + "id": "c71356f4-9020-4862-a1e1-816effbb5443", + "metadata": {}, + "source": [ + "## Loading and using the pre-existing column name labeler using `load_with_components`\n", + "\n", + "For example purposes here, we will import the exsting `ColumnName` labeler via the `load_with_components` command from the `dp.DataLabeler`. This shows a bit more of the details of the data labeler's flow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "818c5b88", + "metadata": {}, + "outputs": [], + "source": [ + "parameters = {\n", + " \"true_positive_dict\": [\n", + " {\"attribute\": \"ssn\", \"label\": \"ssn\"},\n", + " {\"attribute\": \"suffix\", \"label\": \"name\"},\n", + " {\"attribute\": \"my_home_address\", \"label\": \"address\"},\n", + " ],\n", + " \"false_positive_dict\": [\n", + " {\n", + " \"attribute\": \"contract_number\",\n", + " \"label\": \"ssn\",\n", + " },\n", + " {\n", + " \"attribute\": \"role\",\n", + " \"label\": \"name\",\n", + " },\n", + " {\n", + " \"attribute\": \"send_address\",\n", + " \"label\": \"address\",\n", + " },\n", + " ],\n", + " \"negative_threshold_config\": 50,\n", + " \"positive_threshold_config\": 85,\n", + " \"include_label\": True,\n", + " }\n", + "\n", + "label_mapping = {\"ssn\": 1, \"name\": 2, \"address\": 3}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9098329e", + "metadata": {}, + "outputs": [], + "source": [ + "# pre processor \n", + "preprocessor = dp.labelers.data_processing.DirectPassPreprocessor()\n", + "\n", + "# model\n", + "from dataprofiler.labelers.column_name_model import ColumnNameModel\n", + "model = ColumnNameModel(\n", + " parameters=parameters,\n", + " label_mapping=label_mapping,\n", + ")\n", + "\n", + "\n", + "# post processor\n", + "postprocessor = dp.labelers.data_processing.ColumnNameModelPostprocessor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "113d6655-4bca-4d8e-9e6f-b972e29d5684", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler = dp.DataLabeler.load_with_components(\n", + " preprocessor=preprocessor,\n", + " model=model,\n", + " postprocessor=postprocessor,\n", + ")\n", + "data_labeler.model.help()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b405887-2b92-44ca-b8d7-29c384f6dd9c", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(data_labeler.label_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11916a48-098c-4056-ac6c-b9542d85fa86", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(data_labeler.model._parameters)" + ] + }, + { + "cell_type": "markdown", + "id": "da0e97ee-8d6d-4631-9b55-78ed904d5f41", + "metadata": {}, + "source": [ + "### Predicting with the ColumnName labeler\n", + "\n", + "In the prediction below, the data will be passed into to stages in the background\n", + "- 1) `compare_negative`: The idea behind the `compare_negative` is to first filter out any possibility of flagging a false positive in the model prediction. In this step, the confidence value is checked and if the similarity is too close to being a false positive, that particular string in the `data` is removed and not returned to the `compare_positive`.\n", + "- 2) `compare_positive`: Finally the `data` is passed to the `compare_positive` step and checked for similarity with the the `true_positive_dict` values. Again, during this stage the `positive_threshold_config` is used to filter the results to only those `data` values that are greater than or equal to the `positive_threshold_config` provided by the user." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe519e65-36a7-4f42-8314-5369de8635c7", + "metadata": {}, + "outputs": [], + "source": [ + "# evaluate a prediction using the default parameters\n", + "data_labeler.predict(data=[\"ssn\", \"name\", \"address\"])" + ] + }, + { + "cell_type": "markdown", + "id": "b41d834d-e47b-45a6-8970-d2d2033e2ade", + "metadata": {}, + "source": [ + "## Replacing the parameters in the existing labeler\n", + "\n", + "We can achieve this by:\n", + "1. Setting the label mapping to the new labels\n", + "2. Setting the model parameters which include: `true_positive_dict`, `false_positive_dict`, `negative_threshold_config`, `positive_threshold_config`, and `include_label`\n", + "\n", + "where `true_positive_dict` and `false_positive_dict` are `lists` of `dicts`, `negative_threshold_config` and `positive_threshold_config` are integer values between `0` and `100`, and `include_label` is a `boolean` value that determines if the output should include the prediction labels or only the confidence values." + ] + }, + { + "cell_type": "markdown", + "id": "c6bb010a-406f-4fd8-abd0-3355a5ad0ded", + "metadata": {}, + "source": [ + "Below, we created 4 labels where `other` is the `default_label`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f86584cf-a7af-4bae-bf44-d87caa68833a", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.set_labels({'other': 0, \"funky_one\": 1, \"funky_two\": 2, \"funky_three\": 3})\n", + "data_labeler.model.set_params(\n", + " true_positive_dict= [\n", + " {\"attribute\": \"ssn\", \"label\": \"funky_one\"},\n", + " {\"attribute\": \"suffix\", \"label\": \"funky_two\"},\n", + " {\"attribute\": \"my_home_address\", \"label\": \"funky_three\"},\n", + " ],\n", + " false_positive_dict=[\n", + " {\n", + " \"attribute\": \"contract_number\",\n", + " \"label\": \"ssn\",\n", + " },\n", + " {\n", + " \"attribute\": \"role\",\n", + " \"label\": \"name\",\n", + " },\n", + " {\n", + " \"attribute\": \"not_my_address\",\n", + " \"label\": \"address\",\n", + " },\n", + " ],\n", + " negative_threshold_config=50,\n", + " positive_threshold_config=85,\n", + " include_label=True,\n", + ")\n", + "data_labeler.label_mapping" + ] + }, + { + "cell_type": "markdown", + "id": "1ece1c8c-18a5-46fc-b563-6458e6e71e53", + "metadata": {}, + "source": [ + "### Predicting with the new labels\n", + "\n", + "Here we are testing the `predict()` method with brand new labels for label_mapping. As we can see the new labels flow throught to the output of the data labeler." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92842e14-2ea6-4879-b58c-c52b607dc94c", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.predict(data=[\"ssn\", \"suffix\"], predict_options=dict(show_confidences=True))" + ] + }, + { + "cell_type": "markdown", + "id": "261b903f-8f4c-403f-839b-ab8813f850e9", + "metadata": {}, + "source": [ + "## Saving the Data Labeler for future use" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6ffbaf2-9400-486a-ba83-5fc9ba9334d7", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir('new_column_name_labeler'):\n", + " os.mkdir('new_column_name_labeler')\n", + "data_labeler.save_to_disk('new_column_name_labeler')" + ] + }, + { + "cell_type": "markdown", + "id": "09e40cb6-9d89-41c4-ae28-3dca498f8c68", + "metadata": {}, + "source": [ + "## Loading the saved Data Labeler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52615b25-70a6-4ebb-8a32-14aaf1e747d9", + "metadata": {}, + "outputs": [], + "source": [ + "saved_labeler = dp.DataLabeler.load_from_disk('new_column_name_labeler')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1ccc0b3-1dc2-4847-95c2-d6b8769b1590", + "metadata": {}, + "outputs": [], + "source": [ + "# ensuring the parametesr are what we saved.\n", + "print(\"label_mapping:\")\n", + "pprint(saved_labeler.label_mapping)\n", + "print(\"\\nmodel parameters:\")\n", + "pprint(saved_labeler.model._parameters)\n", + "print()\n", + "print(\"postprocessor: \" + saved_labeler.postprocessor.__class__.__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c827f2ae-4af6-4f3f-9651-9ee9ebea9fa0", + "metadata": {}, + "outputs": [], + "source": [ + "# predicting with the loaded labeler.\n", + "saved_labeler.predict([\"ssn\", \"name\", \"address\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/data_labeling.html b/docs/0.12.0/html/data_labeling.html new file mode 100644 index 000000000..8ee9f93a9 --- /dev/null +++ b/docs/0.12.0/html/data_labeling.html @@ -0,0 +1,617 @@ + + + + + + + + + Labeler (Sensitive Data) - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Labeler (Sensitive Data)

+

In this library, the term data labeling refers to entity recognition.

+

Built into the data profiler is a classifier which evaluates the complex data types of the dataset. For structured data, it determines the complex data type of each column. When running the data profile, it uses the default data labeling model built into the library. However, users can also train their own data labeler.

+
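For instance, when the profiler runs on structured data, the label predicted for each column is included in the profile report. A minimal sketch (your_data.csv is a placeholder path):

import dataprofiler as dp

# profile the dataset with the default, built-in data labeler
data = dp.Data("your_data.csv")
profile = dp.Profiler(data)
report = profile.report()

# each column's predicted label is reported under data_stats
for col_report in report["data_stats"]:
    print(col_report["column_name"], "->", col_report["data_label"])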

Data Labels are determined per cell for structured data (column/row when the profiler is used) or at the character level for unstructured data. This is a list of the default labels.

+
  • UNKNOWN
  • ADDRESS
  • BAN (bank account number, 10-18 digits)
  • CREDIT_CARD
  • EMAIL_ADDRESS
  • UUID
  • HASH_OR_KEY (md5, sha1, sha256, random hash, etc.)
  • IPV4
  • IPV6
  • MAC_ADDRESS
  • PERSON
  • PHONE_NUMBER
  • SSN
  • URL
  • US_STATE
  • DRIVERS_LICENSE
  • DATE
  • TIME
  • DATETIME
  • INTEGER
  • FLOAT
  • QUANTITY
  • ORDINAL
+
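To see exactly which of these labels a given labeler will emit, you can inspect its label mapping (a short sketch using the default structured labeler):

import dataprofiler as dp

# load the default structured labeler and list the labels it can produce
data_labeler = dp.DataLabeler(labeler_type='structured')
print(data_labeler.label_mapping)  # maps each label to its index in the model output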

Identify Entities in Structured Data

+

Make predictions and identify labels:

+
import dataprofiler as dp
+
+# load data and data labeler
+data = dp.Data("your_data.csv")
+data_labeler = dp.DataLabeler(labeler_type='structured')
+
+# make predictions and get labels per cell
+predictions = data_labeler.predict(data)
+
+
+
+
+

Identify Entities in Unstructured Data

+

Predict which class characters belong to in unstructured text:

+
import dataprofiler as dp
+
+data_labeler = dp.DataLabeler(labeler_type='unstructured')
+
+# Example sample string, must be in an array (multiple arrays can be passed)
+sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
+          "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"]
+
+# Predict what class each character belongs to
+model_predictions = data_labeler.predict(
+    sample, predict_options=dict(show_confidences=True))
+
+# Predictions / confidences are at the character level
+final_results = model_predictions["pred"]
+final_confidences = model_predictions["conf"]
+
+
+

It's also possible to change the output format, for example to one similar to the SpaCy format:

+
import dataprofiler as dp
+
+data_labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True)
+
+# Example sample string, must be in an array (multiple arrays can be passed)
+sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
+          "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"]
+
+# Set the output to the NER format (start position, end position, label)
+data_labeler.set_params(
+    { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } }
+)
+
+results = data_labeler.predict(sample)
+
+print(results)
+
+
+
+
+

Train a New Data Labeler

+

Mechanism for training your own data labeler on your own set of structured (tabular) data:

+
import dataprofiler as dp
+
+# Will need one column with a default label of UNKNOWN
+data = dp.Data("your_file.csv")
+
+data_labeler = dp.train_structured_labeler(
+    data=data,
+    save_dirpath="/path/to/save/labeler",
+    epochs=2
+)
+
+data_labeler.save_to_disk("my/save/path") # Saves the data labeler for reuse
+
+
+
+
+

Load an Existing Data Labeler

+

Mechanism for loading an existing data_labeler:

+
import dataprofiler as dp
+
+data_labeler = dp.DataLabeler(
+    labeler_type='structured', dirpath="/path/to/my/labeler")
+
+# get information about the parameters/inputs/output formats for the DataLabeler
+data_labeler.help()
+
+
+
+
+

Extending a Data Labeler with Transfer Learning

+

Extending or changing the labels of a data labeler with transfer learning. Note: by default, a loaded labeler will not be trainable. In order to load a trainable DataLabeler, the user must set trainable=True or load a labeler using the TrainableDataLabeler class.

+

The following illustrates how to change the labels:

+
import dataprofiler as dp
+
+labels = ['label1', 'label2', ...]  # new label set can also be an encoding dict
+data = dp.Data("your_file.csv")  # contains data with new labels
+
+# load default structured Data Labeler w/ trainable set to True
+data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True)
+
+# this will use transfer learning to retrain the data labeler on your new
+# dataset and labels.
+# NOTE: data must be in an acceptable format for the preprocessor to interpret.
+#       please refer to the preprocessor/model for the expected data format.
+#       Currently, the DataLabeler cannot take in Tabular data, but requires
+#       data to be ingested with two columns [X, y] where X is the samples and
+#       y is the labels.
+model_results = data_labeler.fit(x=data['samples'], y=data['labels'],
+                                 validation_split=0.2, epochs=2, labels=labels)
+
+# final_results, final_confidences are a list of results for each epoch
+epoch_id = 0
+final_results = model_results[epoch_id]["pred"]
+final_confidences = model_results[epoch_id]["conf"]
+
+
+

The following illustrates how to extend the labels:

+
import dataprofiler as dp
+
+new_labels = ['label1', 'label2', ...]
+data = dp.Data("your_file.csv")  # contains data with new labels
+
+# load default structured Data Labeler w/ trainable set to True
+data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True)
+
+# this will maintain current labels and model weights, but extend the model's
+# labels
+for label in new_labels:
+    data_labeler.add_label(label)
+
+# NOTE: a user can also add a label which maps to the same index as an existing
+# label
+# data_labeler.add_label(label, same_as='<label_name>')
+
+# For a trainable model, the user must then train the model to be able to
+# continue using the labeler since the model's graph has likely changed
+# NOTE: data must be in an acceptable format for the preprocessor to interpret.
+#       please refer to the preprocessor/model for the expected data format.
+#       Currently, the DataLabeler cannot take in Tabular data, but requires
+#       data to be ingested with two columns [X, y] where X is the samples and
+#       y is the labels.
+model_results = data_labeler.fit(x=data['samples'], y=data['labels'],
+                                 validation_split=0.2, epochs=2)
+
+# final_results, final_confidences are a list of results for each epoch
+epoch_id = 0
+final_results = model_results[epoch_id]["pred"]
+final_confidences = model_results[epoch_id]["conf"]
+
+
+

Changing pipeline parameters:

+
import dataprofiler as dp
+
+# load default Data Labeler
+data_labeler = dp.DataLabeler(labeler_type='structured')
+
+# change parameters of specific component
+data_labeler.preprocessor.set_params({'param1': 'value1'})
+
+# change multiple simultaneously.
+data_labeler.set_params({
+    'preprocessor':  {'param1': 'value1'},
+    'model':         {'param2': 'value2'},
+    'postprocessor': {'param3': 'value3'}
+})
+
+
+
+

Build Your Own Data Labeler

+

The DataLabeler has 3 main components: preprocessor, model, and postprocessor. To create your own DataLabeler, each component must either be created from scratch or reused from an existing labeler.

+

Given a set of the 3 components, you can construct your own DataLabeler:

+

Option for swapping out specific components of an existing labeler.

+
import dataprofiler as dp
+from dataprofiler.labelers.character_level_cnn_model import \
+    CharacterLevelCnnModel
+from dataprofiler.labelers.data_processing import \
+    StructCharPreprocessor, StructCharPostprocessor
+
+model = CharacterLevelCnnModel(...)
+preprocessor = StructCharPreprocessor(...)
+postprocessor = StructCharPostprocessor(...)
+
+data_labeler = dp.DataLabeler(labeler_type='structured')
+data_labeler.set_preprocessor(preprocessor)
+data_labeler.set_model(model)
+data_labeler.set_postprocessor(postprocessor)
+
+# check for basic compatibility between the processors and the model
+data_labeler.check_pipeline()
+
+
+
+
+
+

Model Component

+

In order to create your own model component for data labeling, you can utilize the BaseModel class from dataprofiler.labelers.base_model and override the abstract class methods.

+

Reviewing CharacterLevelCnnModel from dataprofiler.labelers.character_level_cnn_model illustrates the functions which need an override; a minimal skeleton is sketched after the list below.

+
  1. __init__: specifying default parameters and calling base __init__
  2. _validate_parameters: validating parameters given by user during setting
  3. _need_to_reconstruct_model: flag for when to reconstruct a model (i.e. parameters change or labels change require a model reconstruction)
  4. _construct_model: initial construction of the model given the parameters
  5. _reconstruct_model: updates model architecture for new label set while maintaining current model weights
  6. fit: mechanism for the model to learn given training data
  7. predict: mechanism for model to make predictions on data
  8. details: prints a summary of the model construction
  9. save_to_disk: saves model and model parameters to disk
  10. load_from_disk: loads model given a path on disk
+
+
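As a rough, hedged sketch (not the library’s actual implementation), a custom model component might look like the skeleton below. The method signatures are assumptions for illustration only; consult BaseModel and CharacterLevelCnnModel for the exact interface.

from dataprofiler.labelers.base_model import BaseModel


class MyCustomModel(BaseModel):
    """Illustrative skeleton of a custom model component."""

    def __init__(self, label_mapping=None, parameters=None):
        # specify default parameters, then hand off to the base __init__
        # (assumed signature; verify against BaseModel)
        super().__init__(label_mapping, parameters or {})

    def _validate_parameters(self, parameters):
        # raise an error if any user-supplied parameter is invalid
        pass

    def _need_to_reconstruct_model(self):
        # return True when parameter or label changes require rebuilding the model
        return False

    def _construct_model(self):
        # initial construction of the underlying model given the parameters
        pass

    def _reconstruct_model(self):
        # update the architecture for a new label set while keeping current weights
        pass

    def fit(self, train_data, val_data=None, **kwargs):
        # mechanism for the model to learn given training data
        pass

    def predict(self, data, **kwargs):
        # return predictions (and optionally confidences) for the input data
        pass

    def details(self):
        # print a summary of the model construction
        pass

    def save_to_disk(self, dirpath):
        # save the model and its parameters to disk
        pass

    @classmethod
    def load_from_disk(cls, dirpath):
        # load a model previously saved with save_to_disk
        pass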
+

Preprocessor Component

+

In order to create your own preprocessor component for data labeling, you can utilize the BaseDataPreprocessor class from dataprofiler.labelers.data_processing and override the abstract class methods.

+

Reviewing StructCharPreprocessor from dataprofiler.labelers.data_processing illustrates the functions which need an override; a minimal sketch follows the list below.

+
  1. __init__: passing parameters to the base class and executing any extraneous calculations to be saved as parameters
  2. _validate_parameters: validating parameters given by user during setting
  3. process: takes in the user data and converts it into a digestible, iterable format for the model
  4. set_params (optional): if a parameter requires processing before setting, a user can override this function to assist with setting the parameter
  5. _save_processor (optional): if a parameter is not JSON serializable, a user can override this function to assist in saving the processor and its parameters
  6. load_from_disk (optional): if a parameter(s) is not JSON serializable, a user can override this function to assist in loading the processor
+
+
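As a rough, hedged sketch (the process signature here is an assumption based on StructCharPreprocessor, so verify it against dataprofiler.labelers.data_processing):

from dataprofiler.labelers.data_processing import BaseDataPreprocessor


class MyPreprocessor(BaseDataPreprocessor):
    """Illustrative skeleton of a custom preprocessor component."""

    def __init__(self, **parameters):
        # pass parameters to the base class and save any extra calculations
        super().__init__(**parameters)

    def _validate_parameters(self, parameters):
        # raise an error if any user-supplied parameter is invalid
        pass

    def process(self, data, labels=None, label_mapping=None, batch_size=32):
        # convert the user data into a digestible, iterable format for the model
        # (assumed signature; verify against StructCharPreprocessor)
        yield data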
+

Postprocessor Component

+

The postprocessor is nearly identical to the preprocessor except it handles the output of the model for processing. In order to create your own postprocessor component for data labeling, you can utilize the BaseDataPostprocessor class from dataprofiler.labelers.data_processing and override the abstract class methods.

+

Reviewing StructCharPostprocessor from dataprofiler.labelers.data_processing illustrates the functions which need an override; a minimal sketch follows the list below.

+
  1. __init__: passing parameters to the base class and executing any extraneous calculations to be saved as parameters
  2. _validate_parameters: validating parameters given by user during setting
  3. process: takes in the output of the model and processes it for output to the user
  4. set_params (optional): if a parameter requires processing before setting, a user can override this function to assist with setting the parameter
  5. _save_processor (optional): if a parameter is not JSON serializable, a user can override this function to assist in saving the processor and its parameters
  6. load_from_disk (optional): if a parameter(s) is not JSON serializable, a user can override this function to assist in loading the processor
+
+
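Again as a rough, hedged sketch (the process signature is an assumption; verify it against StructCharPostprocessor):

from dataprofiler.labelers.data_processing import BaseDataPostprocessor


class MyPostprocessor(BaseDataPostprocessor):
    """Illustrative skeleton of a custom postprocessor component."""

    def __init__(self, **parameters):
        # pass parameters to the base class and save any extra calculations
        super().__init__(**parameters)

    def _validate_parameters(self, parameters):
        # raise an error if any user-supplied parameter is invalid
        pass

    def process(self, data, results, label_mapping):
        # convert the raw model output into the format returned to the user
        # (assumed signature; verify against StructCharPostprocessor)
        return results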
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/data_reader.html b/docs/0.12.0/html/data_reader.html new file mode 100644 index 000000000..db55b4ff0 --- /dev/null +++ b/docs/0.12.0/html/data_reader.html @@ -0,0 +1,1103 @@ + + + + + + + + + Intro to Data Readers - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Intro to Data Readers

+

Within the Data Profiler, there are six data reader classes:

+
  • CSVData (delimited data: CSV, TSV, etc.)
  • JSONData
  • ParquetData
  • AVROData
  • GraphData
  • TextData
+

Each of these classes can be used to read data individually; however, the Data Profiler provides the unique capability of auto-detecting what data you have and reading it automatically by using the Data class.

+
import dataprofiler as dp
+data = dp.Data('/path/to/mydata.abc')  # auto detects and reads your data
+
+
+
+

Automatically reading and detecting data

+

Below is a demonstration of utilizing the Data class which automatically detects the type of data for a given file and reads it automatically.

+
+
[ ]:
+
+
+
+import os
+import sys
+
+try:
+    sys.path.insert(0, '..')
+    import dataprofiler as dp
+except ImportError:
+    import dataprofiler as dp
+
+
+
+
+
[ ]:
+
+
+
+# use data reader to read input data with different file types
+data_folder = "../dataprofiler/tests/data"
+csv_files = [
+    "csv/aws_honeypot_marx_geo.csv",
+    "csv/all-strings-skip-header-author.csv", # csv files with the author/description on the first line
+    "csv/sparse-first-and-last-column-empty-first-row.txt", # csv file with the .txt extension
+]
+json_files = [
+    "json/complex_nested.json",
+    "json/honeypot_intentially_mislabeled_file.csv", # json file with the .csv extension
+]
+parquet_files = [
+    "parquet/nation.dict.parquet",
+    "parquet/nation.plain.intentionally_mislabled_file.csv", # parquet file with the .csv extension
+]
+avro_files = [
+    "avro/userdata1.avro",
+    "avro/userdata1_intentionally_mislabled_file.json", # avro file with the .json extension
+]
+graph_files = [
+    "csv/graph_data_csv_identify.csv", # csv file with graph column names
+]
+text_files = [
+    "txt/discussion_reddit.txt",
+]
+all_files = csv_files + json_files + parquet_files + avro_files + graph_files + text_files
+print('filepath' + ' ' * 58 + 'data type')
+print('='*80)
+for file in all_files:
+    filepath = os.path.join(data_folder, file)
+    data = dp.Data(filepath)
+    print("{:<65} {:<15}".format(file, data.data_type))
+print("\n")
+
+
+
+
+
[ ]:
+
+
+
+# importing from a url
+data = dp.Data('https://raw.githubusercontent.com/capitalone/DataProfiler/main/dataprofiler/tests/data/csv/diamonds.csv')
+data.head()
+
+
+
+
+
+

Specifying detection options of Data and loading pandas.DataFrame

+

The Data class also gives the user the ability to set options, e.g. if they want to load their data with specific requirements. Options for each data reader are specified in the docs: https://capitalone.github.io/DataProfiler/docs/0.4.4/html/dataprofiler.data_readers.html

+
import dataprofiler as dp
+
+options = {...}  # allowed options are specified for each data reader.
+data = dp.Data(data, options=options)
+
+
+

Later in this tutorial, the options for the CSVData class will be discussed.

+

Additionally, a user can directly load a pandas.DataFrame as any data reader they choose.

+
+
[ ]:
+
+
+
+import pandas as pd
+from dataprofiler.data_readers.csv_data import CSVData
+
+
+df = pd.DataFrame(['my', 'random', 'data'])
+
+# specify via the `Data` class
+data = dp.Data(data=df, data_type='csv')
+print('Data Type: ', data.data_type)
+
+# specifically use the CSVData class
+data = CSVData(data=df)
+print('Data Type: ', data.data_type)
+
+
+
+
+
+

Accessing data and attributes

+

Once loaded, the data can be accessed via the data property of the object. Additional information about the data loaded may differ between data readers.

+

For this example we will focus on CSVData.

+
+
[ ]:
+
+
+
+filepath = "../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv"
+data = dp.Data(filepath)
+print('Data Type: ', data.data_type)
+print('Data Filepath: ', data.input_file_path)
+print('File Encoding: ', data.file_encoding)
+print('Data Length (two techniques): ', len(data), data.length)
+print("Data Access:")
+data.data
+
+
+
+
+
+

Checking data file types with is_match

+

Each data reader has a class method is_match which determines whether or not a dataset is of a given data type.

+
CSVData.is_match
+JSONData.is_match
+ParquetData.is_match
+AVROData.is_match
+GraphData.is_match
+TextData.is_match
+
+
+
+
[ ]:
+
+
+
+# supplemental function
+def add_true_false_color(value):
+    """Converts True to green and False to red in printed text."""
+    if value:
+        return "\x1b[92m  " + str(value) + "\x1b[0m"
+    return "\x1b[31m " + str(value) + "\x1b[0m"
+
+
+
+
+
[ ]:
+
+
+
+from dataprofiler.data_readers.csv_data import CSVData
+
+
+non_csv_files = [
+    'json/iris-utf-8.json',
+    'json/honeypot_intentially_mislabeled_file.csv',
+    'parquet/titanic.parq',
+    'parquet/nation.plain.intentionally_mislabled_file.csv',
+    'txt/code.txt',
+    'txt/sentence.txt',
+    'avro/users.avro',
+    'avro/snappy_compressed_intentionally_mislabeled_file.csv',
+]
+
+print("Is the file a CSV?")
+print('=' * 80)
+for file in csv_files:
+    filepath = os.path.join(data_folder, file)
+    is_match = CSVData.is_match(filepath)
+    print(add_true_false_color(is_match), ':', file)
+    print('=' * 80)
+
+for file in non_csv_files:
+    filepath = os.path.join(data_folder, file)
+    is_match = CSVData.is_match(filepath)
+    print(add_true_false_color(is_match), ':', file)
+    print('=' * 80)
+
+
+
+
+
+

Reloading data after altering options with reload

+

There are two cases for using the reload function, both of which require the data type to have been interpreted correctly:

+
1. The options were not correctly determined
+2. The options were loaded correctly but a change is desired.
+
+
+

In the example below, the data_format for reading the data is changed and the data is then reloaded.

+
+
[ ]:
+
+
+
+filepath = "../dataprofiler/tests/data/csv/diamonds.csv"
+
+data = dp.Data(filepath)
+print('original data:')
+print('=' * 80)
+print(data.data[:5])
+
+print()
+data.reload(options={'data_format': 'records', 'record_samples_per_line': 1})
+print('reloaded data:')
+print('=' * 80)
+data.data[:5]
+
+
+
+
+
+

A deeper dive into CSVData

+

This next section will focus on how to use the data reader class: CSVData. The CSVData class is used for reading delimited data. Delimited data are datasets which have their columns specified by a specific character, commonly the ,. E.g. from the diamonds.csv dataset:

+
carat,cut,color,clarity,depth,table,price,x,y,z
+0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
+0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31
+0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
+0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63
+0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
+
+
+

However, the delimiter can be any character. Additionally, a quotechar, commonly ", can be specified which allows a delimiter to be contained within a column value. E.g. from the blogposts.csv dataset:

+
Blog Post,Date,Subject,Field
+"Monty Hall, meet Game Theory",4/13/2014,Statistics,Mathematics
+Gaussian Quadrature,4/13/2014,Algorithms,Mathematics
+
+
+

Notice how "Monty Hall, meet Game Theory" is contained by the quotechar because it contains the delimiter value ,.

+

These delimiter dataset parameters (and more) can be automatically determined by the CSVData data reader, however they can also be set via the options as demonstrated later in this tutorial.

+
+
+

Intro to the CSVData data reader

+

Previously, it was shown that CSVData may automatically be detected using Data or can be manually specified by the user:

+
import dataprofiler as dp
+from dataprofiler.data_readers.csv_data import CSVData
+
+data = dp.Data(filepath)
+data = CSVData(filepath)
+
+
+
+
[ ]:
+
+
+
+# use data reader to read delimited data
+data_folder = "../dataprofiler/tests/data"
+csv_files = [
+    "csv/diamonds.csv",
+    "csv/all-strings-skip-header-author.csv", # csv files with the author/description on the first line
+    "csv/sparse-first-and-last-column-empty-first-row.txt", # csv file with the .txt extension
+]
+
+for file in csv_files:
+    data = CSVData(os.path.join(data_folder, file))
+    print(data.data.head())
+    print('=' * 80)
+
+
+
+
+
+

CSVData Options

+

As mentioned previously, CSVData has options that can be set to fine-tune its detection or to ensure the data is being read in a specific manner. The options for CSVData are detailed below:

+
  • delimiter - delimiter used to decipher the csv input file
  • quotechar - quote character used in the delimited file
  • header - location of the header in the file
  • data_format - user selected format in which to return data; can only be of specified types
  • selected_columns - columns being selected from the entire dataset
+
+
[ ]:
+
+
+
+# options are set via a dictionary object in which the parameters are specified.
+# these are the default values for each option
+options = {
+    "delimiter": ",",
+    "quotechar": '"',
+    "header": 'auto',
+    "data_format": "dataframe",  # type: str, choices: "dataframe", "records"
+    "selected_columns": list(),
+}
+
+
+
+
+
+

Options: delimiter and quotechar

+

Below, both the auto detection and use of options will be illustrated for delimiter and quotechar.

+
+
[ ]:
+
+
+
+# display the data we are reading
+filepath = "../dataprofiler/tests/data/csv/daily-activity-sheet-@-singlequote.csv"
+num_lines = 10
+with open(filepath) as fp:
+    print(''.join(fp.readlines()[:num_lines]))
+
+
+
+
+
[ ]:
+
+
+
+data = dp.Data(filepath)  # or use CSVData
+print('Auto detected')
+print('=' * 80)
+print('delimiter: ', data.delimiter)
+print('quotechar: ', data.quotechar)
+data.data.head()
+
+
+
+
+
[ ]:
+
+
+
+options = {'delimiter': '@', 'quotechar': "'"}
+data = dp.Data(filepath, options=options)  # or use CSVData
+print('manually set')
+print('=' * 80)
+print('delimiter: ', data.delimiter)
+print('quotechar: ', data.quotechar)
+data.data.head()
+
+
+
+
+
[ ]:
+
+
+
+# intentional failure with incorrect options
+options = {'delimiter': ',', 'quotechar': '"'}
+
+# will be interpreted as TextData because the delimiter and quotechar were incorrect
+data = dp.Data(filepath, options=options)
+print('intentional failure set')
+print('=' * 80)
+try:
+    print('delimiter: ', data.delimiter)  # attribute error raised here, bc TextData, not CSVData
+    print('quotechar: ', data.quotechar)
+
+    # should not reach this or something went wrong
+    raise Exception('Should have failed because this is detected as TextData.')
+except AttributeError:
+    print('When data_type is not set or the CSVData is not set, it will fail over to the\n'
+          'next best reader. In this case it is "TextData"\n')
+data.data
+
+
+
+
+
+

Options: header

+

Below, both the auto detection and use of options will be illustrated for header.

+

Notice how in the manually set mechanism, we are intentionally setting the header incorrectly to illustrate what happens.

+
+
[ ]:
+
+
+
+# display the data we are reading
+filepath = "../dataprofiler/tests/data/csv/sparse-first-and-last-column-header-and-author-description.txt"
+num_lines = 10
+with open(filepath) as fp:
+    print(''.join(fp.readlines()[:num_lines]))
+
+
+
+
+
[ ]:
+
+
+
+options = {'header': 'auto'}  # auto detected (default value)
+data = dp.Data(filepath, options=options)  # or use CSVData
+print('Data Header:', data.header)
+print('=' * 80)
+data.data.head()
+
+
+
+
+
[ ]:
+
+
+
+options = {'header': 2}  # intentionally set incorrectly at value 2
+data = dp.Data(filepath, options=options)  # or use CSVData
+print('Data Header:', data.header)
+print('=' * 80)
+data.data.head()
+
+
+
+
+
+

Options: data_format

+

For CSVData, the data_format option can have the following values:

+
  • dataframe - (default) loads the dataset as a pandas.DataFrame
  • records - loads the data as rows of text values; the extra parameter record_samples_per_line determines how many rows are combined into a single line
+

dataframe is used for conducting structured profiling of the dataset while records is for unstructured profiling.

+

Below, both the auto detection and use of options will be illustrated for data_format.

+
+
[ ]:
+
+
+
+# display the data we are reading
+filepath = "../dataprofiler/tests/data/csv/diamonds.csv"
+num_lines = 10
+with open(filepath) as fp:
+    print(''.join(fp.readlines()[:num_lines]))
+
+
+
+
+
[ ]:
+
+
+
+options = {'data_format': 'dataframe'}  # default
+data = dp.Data(filepath, options=options)  # or use CSVData
+data.data[:5]
+
+
+
+
+
[ ]:
+
+
+
+options = {'data_format': 'records', 'record_samples_per_line': 1}
+data = dp.Data(filepath, options=options)
+data.data[:5]
+
+
+
+
+
+

Options: selected columns

+

By default, all columns of a dataset will be read and loaded into the data reader. However, selected_columns can be set to only load columns which the user requests.

+
+
[ ]:
+
+
+
+# display the data we are reading
+filepath = "../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv"
+num_lines = 10
+with open(filepath) as fp:
+    print(''.join(fp.readlines()[:num_lines]))
+
+
+
+
+
[ ]:
+
+
+
+options = {'selected_columns': ['datetime', 'host', 'src', 'proto']}
+data = dp.Data(filepath, options=options)
+data.data.head()
+
+
+
+
+
+

Intro to GraphData data reader

+

This tutorial will focus on how to use the data reader class: GraphData. The GraphData class is used for reading delimited data from a CSV file into a NetworkX Graph object. This is all in an effort to automatically prepare the data for the GraphProfiler class to then profile graph data.

+

The DataProfiler keys off of common graph naming conventions in the column header row, e.g. from dataprofiler/tests/csv/graph_data_csv_identify.csv

+
node_id_dst, node_id_src, continuous_weight, categorical_status
+108,289,7.4448069,9
+81,180,3.65064207,0
+458,83,5.9959787,10
+55,116,4.63359209,79
+454,177,5.76715529,11
+429,225,4.79556889,3
+
+
+

Options for the GraphData are exactly the same as CSVData.

+

Example implementation of GraphData:

+
import dataprofiler as dp
+from dataprofiler.data_readers.graph_data import GraphData
+
+data = dp.Data(graph_file)
+data = GraphData(graph_file)
+
+
+
+
[ ]:
+
+
+
+from dataprofiler.data_readers.graph_data import GraphData
+
+# use data reader to read delimited data
+data_folder = "../dataprofiler/tests/data"
+graph_file = "csv/graph_data_csv_identify.csv"
+
+data = GraphData(os.path.join(data_folder, graph_file))
+print(data.data.edges)
+print('=' * 80)
+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/data_reader.ipynb b/docs/0.12.0/html/data_reader.ipynb new file mode 100644 index 000000000..d2ce887e6 --- /dev/null +++ b/docs/0.12.0/html/data_reader.ipynb @@ -0,0 +1,689 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d4d79832-59ab-410a-ad6d-fbba01a3f0d3", + "metadata": {}, + "source": [ + "# Intro to Data Readers\n", + "Within the Data Profiler, there are 5 data reader classes:\n", + "\n", + " * CSVData (delimited data: CSV, TSV, etc.)\n", + " * JSONData\n", + " * ParquetData\n", + " * AVROData\n", + " * GraphData\n", + " * TextData\n", + " \n", + "Each of these classes can be used to read data individually, however the Data Profiler provides the unique capability of auto detecting what data you have and reading it automatically by using the `Data` class.\n", + "```python\n", + "import dataprofiler as dp\n", + "data = dp.Data('/path/to/mydata.abc') # auto detects and reads your data\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "f2315666-20be-4937-9f9a-26d42dc135e2", + "metadata": { + "tags": [] + }, + "source": [ + "## Automatically reading and detecting data\n", + "\n", + "Below is a demonstration of utilizing the `Data` class which automatically detects the type of data for a given file and reads it automatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99e61c6c-43b8-4700-b627-759b5ef8bdda", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8821ad8d-b2c0-489c-ae6a-54c11b7f0a08", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data with different file types\n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "csv_files = [\n", + " \"csv/aws_honeypot_marx_geo.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "json_files = [\n", + " \"json/complex_nested.json\",\n", + " \"json/honeypot_intentially_mislabeled_file.csv\", # json file with the .csv extension\n", + "]\n", + "parquet_files = [\n", + " \"parquet/nation.dict.parquet\",\n", + " \"parquet/nation.plain.intentionally_mislabled_file.csv\", # parquet file with the .csv extension\n", + "]\n", + "avro_files = [\n", + " \"avro/userdata1.avro\",\n", + " \"avro/userdata1_intentionally_mislabled_file.json\", # avro file with the .json extension\n", + "]\n", + "graph_files = [\n", + " \"csv/graph_data_csv_identify.csv\", # csv file with graph column names\n", + "]\n", + "text_files = [\n", + " \"txt/discussion_reddit.txt\",\n", + "]\n", + "all_files = csv_files + json_files + parquet_files + avro_files + graph_files + text_files\n", + "print('filepath' + ' ' * 58 + 'data type')\n", + "print('='*80)\n", + "for file in all_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " data = dp.Data(filepath)\n", + " print(\"{:<65} {:<15}\".format(file, data.data_type))\n", + "print(\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49dfc981-59fd-48a5-ad7b-e01f0a52d0b2", + "metadata": {}, + "outputs": [], + "source": [ + "# importing from a url\n", + "data = 
dp.Data('https://raw.githubusercontent.com/capitalone/DataProfiler/main/dataprofiler/tests/data/csv/diamonds.csv')\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "77f8ef2d-5aaf-44d6-b6d1-bf14f7eb7aa6", + "metadata": {}, + "source": [ + "## Specifying detection options of `Data` and loading `pandas.DataFrame`\n", + "\n", + "The `Data` class also gives the ability to set options or if the user wants to load their data with specific requirements.\n", + "Options for each data reader are specified in the docs: https://capitalone.github.io/DataProfiler/docs/0.4.4/html/dataprofiler.data_readers.html\n", + "\n", + "```python\n", + "import dataprofiler as dp\n", + "\n", + "options = {...} # allowed options are specified for each data reader.\n", + "data = dp.Data(data, options=options)\n", + "```\n", + "Later in this tutorial, the options for the CSVData class will be discussed.\n", + "\n", + "Additionally, a user can directly load a `pandas.DataFrame` as any data reader they choose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b925d4e-ca94-4913-9acf-26a883585e85", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "\n", + "df = pd.DataFrame(['my', 'random', 'data'])\n", + "\n", + "# specify via the `Data` class\n", + "data = dp.Data(data=df, data_type='csv')\n", + "print('Data Type: ', data.data_type)\n", + "\n", + "# specifically use the CSVData class\n", + "data = CSVData(data=df)\n", + "print('Data Type: ', data.data_type)" + ] + }, + { + "cell_type": "markdown", + "id": "52c3c3ac-c241-4d91-8ac7-b3d28ffd19c3", + "metadata": {}, + "source": [ + "## Accessing data and attributes\n", + "\n", + "Once loaded, the data can be accessed via the `data` property of the object. Additional information about the data loaded may differ between data readers.\n", + "\n", + "For this example we will focus on `CSVData`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09fa5929-e710-4107-9313-1370ab639c9c", + "metadata": {}, + "outputs": [], + "source": [ + "filepath = \"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\"\n", + "data = dp.Data(filepath)\n", + "print('Data Type: ', data.data_type)\n", + "print('Data Filepath: ', data.input_file_path)\n", + "print('File Encoding: ', data.file_encoding)\n", + "print('Data Length (two techniques): ', len(data), data.length)\n", + "print(\"Data Access:\")\n", + "data.data" + ] + }, + { + "cell_type": "markdown", + "id": "b98be971-4768-479d-9e54-00f05a6fb790", + "metadata": {}, + "source": [ + "## Checking data file types with `is_match`\n", + "\n", + "Each data reader has a class method `is_match` which determines whether or not a dataset is of a given data type.\n", + "```python\n", + "CSVData.is_match\n", + "JSONData.is_match\n", + "ParquetData.is_match\n", + "AVROData.is_match\n", + "GraphData.is_match\n", + "TextData.is_match\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "104a32c1-1d50-4aa5-94ce-b2e72de38476", + "metadata": {}, + "outputs": [], + "source": [ + "# supplemental function\n", + "def add_true_false_color(value):\n", + " \"\"\"Converts True to green and False to red in printed text.\"\"\"\n", + " if value:\n", + " return \"\\x1b[92m \" + str(is_match) + \"\\x1b[0m\"\n", + " return \"\\x1b[31m \" + str(is_match) + \"\\x1b[0m\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06868d90-2726-4096-a6da-3866174e6671", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "\n", + "non_csv_files = [\n", + " 'json/iris-utf-8.json',\n", + " 'json/honeypot_intentially_mislabeled_file.csv',\n", + " 'parquet/titanic.parq',\n", + " 'parquet/nation.plain.intentionally_mislabled_file.csv',\n", + " 'txt/code.txt',\n", + " 'txt/sentence.txt',\n", + " 'avro/users.avro',\n", + " 'avro/snappy_compressed_intentionally_mislabeled_file.csv',\n", + "]\n", + "\n", + "print(\"Is the file a CSV?\")\n", + "print('=' * 80)\n", + "for file in csv_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " is_match = CSVData.is_match(filepath)\n", + " print(add_true_false_color(is_match), ':', file)\n", + " print('=' * 80)\n", + " \n", + "for file in non_csv_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " is_match = CSVData.is_match(filepath)\n", + " print(add_true_false_color(is_match), ':', file)\n", + " print('=' * 80)" + ] + }, + { + "cell_type": "markdown", + "id": "38889990-8e19-4114-a4f3-dc2af938e29d", + "metadata": {}, + "source": [ + "## Reloading data after altering options with `reload`\n", + "\n", + "There are two cases for using the reload function, both of which require the data type to have been interpreted correctly:\n", + "\n", + " 1. The options were not correctly determined\n", + " 2. The options were loaded correctly but a change is desired.\n", + " \n", + "In the example below, the `data_format` for reading the data is changed and the data is then reloaded." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01870e8d-45ee-4f33-a088-4453c7ffc7c2", + "metadata": {}, + "outputs": [], + "source": [ + "filepath = \"../dataprofiler/tests/data/csv/diamonds.csv\"\n", + "\n", + "data = dp.Data(filepath)\n", + "print('original data:')\n", + "print('=' * 80)\n", + "print(data.data[:5])\n", + "\n", + "print()\n", + "data.reload(options={'data_format': 'records', 'record_samples_per_line': 1})\n", + "print('reloaded data:')\n", + "print('=' * 80)\n", + "data.data[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "e2285f19-9b34-4484-beaa-79df890b2825", + "metadata": {}, + "source": [ + "## A deeper dive into `CSVData`\n", + "\n", + "This next section will focus on how to use the data reader class: `CSVData`. The `CSVData` class is used for reading delimited data. Delimited data are datasets which have their columns specified by a specific character, commonly the `,`. E.g. from the `diamonds.csv` dataset:\n", + "```\n", + "carat,cut,color,clarity,depth,table,price,x,y,z\n", + "0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43\n", + "0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31\n", + "0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31\n", + "0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63\n", + "0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75\n", + "```\n", + "\n", + "However, the delimiter can be any character. Additionally, a `quotechar`, commonly `\"`, can be specified which allows a delimiter to be contained within a column value.\n", + "E.g. from the `blogposts.csv` dataset:\n", + "```\n", + "Blog Post,Date,Subject,Field\n", + "\"Monty Hall, meet Game Theory\",4/13/2014,Statistics,Mathematics\n", + "Gaussian Quadrature,4/13/2014,Algorithms,Mathematics\n", + "```\n", + "Notice how `\"Monty Hall, meet Game Theory\"` is contained by the quotechar because it contains the delimiter value `,`.\n", + "\n", + "These delimiter dataset parameters (and more) can be automatically determined by the `CSVData` data reader, however they can also be set via the options as demonstrated later in this tutorial." 
+ ] + }, + { + "cell_type": "markdown", + "id": "cccb6bf9-7fb8-46b8-992e-9caacb7ab3a8", + "metadata": {}, + "source": [ + "## Intro to the `CSVData` data reader\n", + "\n", + "Previously, it was shown that `CSVData` may automatically be detected using `Data` or can be manually specified by the user:\n", + "\n", + "```python\n", + "import dataprofiler as dp\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "data = dp.Data(filepath)\n", + "data = CSVData(filepath)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e25f5130-4f19-40c5-9d13-549a04f1aef5", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read delimited data \n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "csv_files = [\n", + " \"csv/diamonds.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "\n", + "for file in csv_files:\n", + " data = CSVData(os.path.join(data_folder, file))\n", + " print(data.data.head())\n", + " print('=' * 80)" + ] + }, + { + "cell_type": "markdown", + "id": "8940de56-1417-4bf6-af87-9d4d00b9a631", + "metadata": {}, + "source": [ + "## CSVData Options\n", + "\n", + "As mentioned preivously, `CSVData` has options that can be set to finetune its detection or to ensure the data is being read in a specific manner.\n", + "The options for `CSVData` are detailed below:\n", + "\n", + " * delimiter - delimiter used to decipher the csv input file\n", + " * quotechar - quote character used in the delimited file\n", + " * header - location of the header in the file.\n", + " * data_format - user selected format in which to return data can only be of specified types\n", + " * selected_columns - columns being selected from the entire dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d74f2e8-0ec3-4e93-8778-0a5f013e0cdb", + "metadata": {}, + "outputs": [], + "source": [ + "# options are set via a dictionary object in which the parameters are specified.\n", + "# these are the default values for each option\n", + "options = {\n", + " \"delimiter\": \",\",\n", + " \"quotechar\": '\"',\n", + " \"header\": 'auto',\n", + " \"data_format\": \"dataframe\", # type: str, choices: \"dataframe\", \"records\"\n", + " \"selected_columns\": list(),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9af108a1-ffe6-4c3a-82cc-833b1a3b57a1", + "metadata": {}, + "source": [ + "## Options: delimiter and quotechar\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `delimiter` and `quotechar`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "570e20c3-198e-4356-98d3-92eb9655ef4e", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/daily-activity-sheet-@-singlequote.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98385148-861e-4eb1-ba8d-e93120515401", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(filepath) # or use CSVData\n", + "print('Auto detected')\n", + "print('=' * 80)\n", + "print('delimiter: ', data.delimiter)\n", + "print('quotechar: ', data.quotechar)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f5d9306-d90a-4fc6-85a7-a0d535fe2d80", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'delimiter': '@', 'quotechar': \"'\"}\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('manually set')\n", + "print('=' * 80)\n", + "print('delimiter: ', data.delimiter)\n", + "print('quotechar: ', data.quotechar)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7bfa60f-b5b9-48a5-adc5-3937aed145da", + "metadata": {}, + "outputs": [], + "source": [ + "# intentional failure with incorrect options\n", + "options = {'delimiter': ',', 'quotechar': '\"'}\n", + "\n", + "# will be interepted as TextData because the delimtier and quotechar were incorrect\n", + "data = dp.Data(filepath, options=options)\n", + "print('intentional faliure set')\n", + "print('=' * 80)\n", + "try:\n", + " print('delimiter: ', data.delimiter) # attribute error raised here, bc TextData, not CSVData\n", + " print('quotechar: ', data.quotechar)\n", + " \n", + " # should not reach this or something went wrong\n", + " raise Exception('Should have failed because this is detected as TextData.')\n", + "except AttributeError:\n", + " print('When data_type is not set or the CSVData is not set, it will fail over to the\\n'\n", + " 'next best reader. In this case it is \"TextData\"\\n')\n", + "data.data" + ] + }, + { + "cell_type": "markdown", + "id": "eeb41c7c-8319-40a3-9d87-88edbb3c5290", + "metadata": {}, + "source": [ + "## Options: header\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `header`.\n", + "\n", + "Notice how in the manually set mechanism, we are intentionally setting the header incorrectly to illustrate what happens." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16a927ef-1ba8-4bf2-ae40-2a9909030609", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/sparse-first-and-last-column-header-and-author-description.txt\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0701d7bf-2de0-4dce-8f09-7f0cddd1132c", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'header': 'auto'} # auto detected (default value)\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('Data Header:', data.header)\n", + "print('=' * 80)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8642a0a-367a-44c6-b611-b89d97b29f85", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'header': 2} # intentionally set incorrectly at value 2\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('Data Header:', data.header)\n", + "print('=' * 80)\n", + "data.data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d6e3f640-c809-4eb6-9571-30065821615e", + "metadata": {}, + "source": [ + "## Options: data_format\n", + "\n", + "For CSVData, the `data_format` option can have the following values:\n", + "\n", + " * dataframe - (default) loads the dataset as a pandas.DataFrame\n", + " * records - loads the data as rows of text values, the extra parameter `record_samples_per_line` how many rows are combined into a single line\n", + " \n", + "`dataframe` is used for conducting **structured profiling** of the dataset while `records` is for **unstructured profiling**.\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `data_format`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "146109ea-a554-4766-bb19-78c116d2a8dd", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/diamonds.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dceac967-d326-4064-ba1c-87a1146c9d72", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'data_format': 'dataframe'} # default\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "data.data[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c25524f-ef23-4e06-9023-842c64c2640e", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'data_format': 'records', 'record_samples_per_line': 1}\n", + "data = dp.Data(filepath, options=options)\n", + "data.data[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "d45f3ed6-ddcd-4bf3-95bc-09f23eb94c97", + "metadata": {}, + "source": [ + "## Options: selected columns\n", + "\n", + "By default, all columns of a dataset will be read and loaded into the data reader. However, `selected_columns` can be set to only load columns which the user requests." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9b45e18-93c6-42e6-b978-af51574307eb", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "018f3f4d-32ac-411a-9918-bae78aff0b0e", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'selected_columns': ['datetime', 'host', 'src', 'proto']}\n", + "data = dp.Data(filepath, options=options)\n", + "data.data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b50679ea", + "metadata": {}, + "source": [ + "## Intro to `GraphData` data reader\n", + "\n", + "This tutorial will focus on how to use the data reader class: `GraphData`. The `GraphData` class is used for reading the delimited data from a CSV file into a `NetworkX` Graph object. This is all in an effort to prepare the data automaticaly for `GraphProfiler` class to then profile graph data. \n", + "\n", + "The DataProiler keys off of common graph naming conventions in the column header row. E.G. from `dataprofiler/tests/csv/graph_data_csv_identify.csv`\n", + "```\n", + "node_id_dst, node_id_src, continuous_weight, categorical_status\n", + "108,289,7.4448069,9\n", + "81,180,3.65064207,0\n", + "458,83,5.9959787,10\n", + "55,116,4.63359209,79\n", + "454,177,5.76715529,11\n", + "429,225,4.79556889,3\n", + "```\n", + "\n", + "Options for the `GraphData` are exactly the same as `CSVData`.\n", + "\n", + "\n", + "Example implementation of `GraphData`:\n", + "```python\n", + "import dataprofiler as dp\n", + "from dataprofiler.data_readers.graph_data import GraphData\n", + "\n", + "data = dp.Data(graph_file)\n", + "data = GraphData(graph_file)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "838db976", + "metadata": {}, + "outputs": [], + "source": [ + "from dataprofiler.data_readers.graph_data import GraphData\n", + "\n", + "# use data reader to read delimited data \n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "graph_file = \"csv/graph_data_csv_identify.csv\"\n", + "\n", + "data = GraphData(os.path.join(data_folder, graph_file))\n", + "print(data.data.edges)\n", + "print('=' * 80)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/data_readers.html b/docs/0.12.0/html/data_readers.html new file mode 100644 index 000000000..99a8a8484 --- /dev/null +++ b/docs/0.12.0/html/data_readers.html @@ -0,0 +1,475 @@ + + + + + + + + + Data Readers - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data Readers

+

The Data class itself will identify and then output one of the following Data class types. Using the data reader is easy: just pass it through the Data object.

+
import dataprofiler as dp
+data = dp.Data("your_file.csv")
+
+
+

The supported file types are:

+
  • CSV file (or any delimited file)
  • JSON object
  • Avro file
  • Parquet file
  • Graph data file
  • Text file
  • Pandas DataFrame
  • A URL that points to one of the supported file types above
+

It’s also possible to call one of the data classes directly, as in the following command:

+
from dataprofiler.data_readers.csv_data import CSVData
+data = CSVData("your_file.csv", options={"delimiter": ","})
+
+
+

Additionally, any of the data classes can be loaded using a URL:

+
import dataprofiler as dp
+data = dp.Data("https://you_website.com/your_file.file", options={"verify_ssl": "True"})
+
+
+

Below are descriptions of the various Data classes and the available options.

+
+

CSVData

+

Data class for loading datasets of type CSV. Can be specified by passing in-memory data or via a file path. Options pertaining to the CSV may also be specified using the options dict parameter.

+

CSVData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • delimiter - Must be a string, for example “delimiter”: “,”
  • data_format - Must be a string, possible choices: “dataframe”, “records”
  • selected_columns - Columns being selected from the entire dataset, must be a list [“column 1”, “ssn”]
  • sample_nrows - Reservoir sampling to sample “n” rows out of a total of “M” rows. Specified for how many rows to sample, default None.
  • header - Define the header, for example
    • “header”: ‘auto’ for auto detection
    • “header”: None for no header
    • “header”: <INT> to specify the header row (0 based index)
+
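For illustration, the options above might be combined as follows; the file path is a placeholder and every option is optional:

from dataprofiler.data_readers.csv_data import CSVData

options = {
    "delimiter": ",",
    "data_format": "dataframe",
    "selected_columns": ["column 1", "ssn"],
    "sample_nrows": 1000,   # reservoir-sample 1000 rows
    "header": "auto",       # or None, or a 0-based row index
}
data = CSVData("your_file.csv", options=options)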
+
+

JSONData

+

Data class for loading datasets of type JSON. Can be specified by passing in-memory data or via a file path. Options pertaining to the JSON may also be specified using the options dict parameter. JSON data can be accessed via the “data” property, the “metadata” property, and the “data_and_metadata” property.

+

JSONData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • data_format - must be a string, choices: “dataframe”, “records”, “json”, “flattened_dataframe”
    • “flattened_dataframe” is best used for JSON structure typically found in data streams that contain nested lists of dictionaries and a payload. For example: {“data”: [ columns ], “response”: 200}
  • selected_keys - columns being selected from the entire dataset, must be a list [“column 1”, “ssn”]
  • payload_keys - The dictionary keys for the payload of the JSON, typically called “data” or “payload”. Defaults to [“data”, “payload”, “response”].
+
+
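For illustration (placeholder file path; all options are optional):

from dataprofiler.data_readers.json_data import JSONData

options = {
    "data_format": "flattened_dataframe",
    "selected_keys": ["column 1", "ssn"],
    "payload_keys": ["data", "payload", "response"],
}
data = JSONData("your_file.json", options=options)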
+

AVROData

+

Data class for loading datasets of type AVRO. Can be specified by passing in-memory data or via a file path. Options pertaining to the AVRO may also be specified using the options dict parameter.

+

AVROData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • data_format - must be a string, choices: “dataframe”, “records”, “avro”, “json”, “flattened_dataframe”
    • “flattened_dataframe” is best used for AVROs with a JSON structure typically found in data streams that contain nested lists of dictionaries and a payload. For example: {“data”: [ columns ], “response”: 200}
  • selected_keys - columns being selected from the entire dataset, must be a list [“column 1”, “ssn”]
+
+
+

ParquetData

+

Data class for loading datasets of type PARQUET. Can be specified by passing in-memory data or via a file path. Options pertaining to the PARQUET may also be specified using the options dict parameter.

+

ParquetData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • data_format - must be a string, choices: “dataframe”, “records”, “json”
  • selected_keys - columns being selected from the entire dataset, must be a list [“column 1”, “ssn”]
  • sample_nrows - Random sampling to sample “n” rows out of a total of “M” rows. Specified for how many rows to sample, default None.
+
+
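For illustration (placeholder file path; all options are optional):

from dataprofiler.data_readers.parquet_data import ParquetData

options = {
    "data_format": "dataframe",
    "selected_keys": ["column 1", "ssn"],
    "sample_nrows": 1000,   # randomly sample 1000 rows
}
data = ParquetData("your_file.parquet", options=options)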
+

GraphData

+

Data class for loading datasets of graph data. Currently takes CSV format; further formats will be supported. Can be specified by passing in-memory data (NetworkX Graph) or via a file path. Options pertaining to the CSV file may also be specified using the options dict parameter. Loads data from CSV into memory as a NetworkX Graph.

+

GraphData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • delimiter - must be a string, for example “delimiter”: “,”
  • data_format - must be a string, possible choices: “graph”, “dataframe”, “records”
  • header - Define the header, for example
    • “header”: ‘auto’ for auto detection
    • “header”: None for no header
    • “header”: <INT> to specify the header row (0 based index)
+
+
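For illustration (placeholder file path; all options are optional):

from dataprofiler.data_readers.graph_data import GraphData

options = {
    "delimiter": ",",
    "data_format": "graph",
    "header": "auto",
}
data = GraphData("your_graph_file.csv", options=options)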
+

TextData

+

Data class for loading datasets of type TEXT. Can be specified by passing in-memory data or via a file path. Options pertaining to the TEXT may also be specified using the options dict parameter.

+

TextData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • data_format: user selected format in which to return data. Currently only supports “text”.
  • samples_per_line - chunks by which to read in the specified dataset
+
+
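For illustration (placeholder file path; all options are optional):

from dataprofiler.data_readers.text_data import TextData

options = {
    "data_format": "text",
    "samples_per_line": 1000,   # chunk size used when reading in the dataset
}
data = TextData("your_file.txt", options=options)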
+

Data Using a URL

+

Data class for loading datasets of any type using a URL. Specified by passing in any valid URL that points to one of the valid data types. Options pertaining to the URL may also be specified using the options dict parameter.

+

Data(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • verify_ssl: must be a boolean string, choices: “True”, “False”. Set to “True” by default.
+
+
+

Data Using an AWS S3 URI

+

Data class for loading datasets from AWS S3 URI. Specified by passing in +any valid bucket path that points to one of the valid data types.

+

Data(‘s3a://my-bucket/file_name.txt’)

+

Possible options:

+
  • storage_options: must be a dictionary where the keys for boto3 initialization are set. If storage_options is provided in options, the below variables are retrieved from the dictionary provided. Otherwise, they will be retrieved from environment variables.
    • AWS_ACCESS_KEY_ID
    • AWS_SECRET_ACCESS_KEY
    • AWS_SESSION_TOKEN
    • AWS_REGION (default us-east-1)
+
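A sketch of passing credentials explicitly; the exact storage_options key names here are an assumption based on the variables listed above, so verify them before use:

import dataprofiler as dp

# assumed key names mirroring the environment variables above
options = {
    "storage_options": {
        "AWS_ACCESS_KEY_ID": "<access key id>",
        "AWS_SECRET_ACCESS_KEY": "<secret access key>",
        "AWS_SESSION_TOKEN": "<session token>",
        "AWS_REGION": "us-east-1",
    }
}
data = dp.Data("s3a://my-bucket/file_name.txt", options=options)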
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.avro_data.html b/docs/0.12.0/html/dataprofiler.data_readers.avro_data.html new file mode 100644 index 000000000..49572fe4a --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.avro_data.html @@ -0,0 +1,411 @@ + + + + + + + + + Avro Data - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Avro Data

+

Contains class for saving and loading spreadsheet data.

+
+
+class dataprofiler.data_readers.avro_data.AVROData(input_file_path: Optional[str] = None, data: Optional[Any] = None, options: Optional[Dict] = None)
+

Bases: JSONData, BaseData

+

AVROData class to save and load spreadsheet data.

+

Initialize Data class for loading datasets of type AVRO.

+

Can be specified by passing in memory data or via a file path. +Options pertaining to AVRO may also be specified using options dict param. +Possible Options:

+
options = dict(
+    data_format= type: str, choices: "dataframe", "records", "avro"
+    selected_keys= type: list(str)
+)
+
+
+

data_format: user selected format; can only be of specified types
selected_keys: keys being selected from the entire dataset

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+data_type: str = 'avro'
+
+
+
+property file_encoding: Optional[str]
+

Set file encoding to None since not detected for avro.

+
+
+
+classmethod is_match(file_path: Union[str, StringIO, BytesIO], options: Optional[Dict] = None) bool
+

Test the given file to check if the file has valid AVRO format or not.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – avro read options

  • +
+
+
Returns
+

is the file an avro file or not

+
+
Return type
+

bool

+
+
+
+
+
+property data
+

Return data.

+
+
+
+property data_and_metadata: Optional[DataFrame]
+

Return a data frame that joins the data and the metadata.

+
+
+
+property data_format: Optional[str]
+

Return data format.

+
+
+
+get_batch_generator(batch_size: int) Generator[Union[DataFrame, List], None, None]
+

Get batch generator.

+
+
+
+info: Optional[str] = None
+
+
+
+property is_structured
+

Determine compatibility with StructuredProfiler.

+
+
+
+property length: int
+

Return the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+property metadata: Optional[DataFrame]
+

Return a data frame that contains the metadata.

+
+
+
+reload(input_file_path: Optional[str] = None, data: Optional[Union[DataFrame, str]] = None, options: Optional[Dict] = None) None
+

Reload the data class with a new dataset.

+

This erases all existing data/options and replaces it +with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+property selected_keys: Optional[List[str]]
+

Return selected keys.

+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.base_data.html b/docs/0.12.0/html/dataprofiler.data_readers.base_data.html new file mode 100644 index 000000000..851787539 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.base_data.html @@ -0,0 +1,373 @@ + + + + + + + + + Base Data - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Data

+

Contains abstract class for data loading and saving.

+
+
+class dataprofiler.data_readers.base_data.BaseData(input_file_path: Optional[str], data: Any, options: Dict)
+

Bases: object

+

Abstract class for data loading and saving.

+

Initialize Base class for loading a dataset.

+

Options can be specified and may be more specific to the subclasses.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+data_type: str
+
+
+
+info: Optional[str] = None
+
+
+
+property data
+

Return data.

+
+
+
+property data_format: Optional[str]
+

Return data format.

+
+
+
+property is_structured: bool
+

Determine compatibility with StructuredProfiler.

+
+
+
+property file_encoding: Optional[str]
+

Return file encoding.

+
+
+
+get_batch_generator(batch_size: int) Generator[Union[DataFrame, List], None, None]
+

Get batch generator.

+
+
+
+classmethod is_match(input_file_path: str, options: Optional[Dict]) bool
+

Return true if match, false otherwise.

+
+
+
+reload(input_file_path: Optional[str], data: Any, options: Optional[Dict]) None
+

Reload the data class with a new dataset.

+

This erases all existing +data/options and replaces it with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+property length: int
+

Return the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.csv_data.html b/docs/0.12.0/html/dataprofiler.data_readers.csv_data.html new file mode 100644 index 000000000..dc31659ea --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.csv_data.html @@ -0,0 +1,440 @@ + + + + + + + + + CSV Data - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

CSV Data

+

Contains class that saves and loads spreadsheet data.

+
+
+class dataprofiler.data_readers.csv_data.CSVData(input_file_path: Optional[str] = None, data: Optional[DataFrame] = None, options: Optional[Dict] = None)
+

Bases: SpreadSheetDataMixin, BaseData

+

SpreadsheetData class to save and load spreadsheet data.

+

Initialize Data class for loading datasets of type CSV.

+

Can be specified by passing in-memory data or via a file path. Options pertaining to CSV may also be specified using the options dict parameter. Possible options:

+
options = dict(
+    delimiter= type: str
+    data_format= type: str, choices: "dataframe", "records"
+    record_samples_per_line= type: int (only for "records")
+    selected_columns= type: list(str)
+    header= type: any
+)
+
+
+

delimiter: delimiter used to decipher the csv input file
data_format: user selected format in which to return data; can only be of the specified types:
    dataframe - (default) loads the dataset as a pandas.DataFrame
    records - loads the data as rows of text values; the extra parameter
    “record_samples_per_line” determines how many rows are combined into a single line
selected_columns: columns being selected from the entire dataset
header: location of the header in the file
quotechar: quote character used in the delimited file

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
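Example (a minimal usage sketch; the file name and option values below are illustrative placeholders, not documented defaults):

from dataprofiler.data_readers.csv_data import CSVData

# "transactions.csv" is a placeholder path; the options mirror those listed above.
options = {
    "delimiter": ";",                      # explicit delimiter instead of auto-detection
    "data_format": "dataframe",            # or "records"
    "header": 0,                           # header row location
    "selected_columns": ["id", "amount"],  # subset of columns to load
}
csv_data = CSVData("transactions.csv", options=options)
df = csv_data.data                         # pandas.DataFrame when data_format="dataframe"
print(csv_data.delimiter, csv_data.length)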
+
+data_type: str = 'csv'
+
+
+
+property selected_columns: List[str]
+

Return selected columns.

+
+
+
+property delimiter: Optional[str]
+

Return delimiter.

+
+
+
+property quotechar: Optional[str]
+

Return quotechar.

+
+
+
+property header: Optional[Union[str, int]]
+

Return header.

+
+
+
+property sample_nrows: Optional[int]
+

Return sample_nrows.

+
+
+
+property is_structured: bool
+

Determine compatibility with StructuredProfiler.

+
+
+
+property data
+

Return data.

+
+
+
+property data_format: Optional[str]
+

Return data format.

+
+
+
+property file_encoding: Optional[str]
+

Return file encoding.

+
+
+
+get_batch_generator(batch_size: int) Generator[Union[DataFrame, List], None, None]
+

Get batch generator.

+
+
+
+info: Optional[str] = None
+
+
+
+classmethod is_match(file_path: str, options: Optional[Dict] = None) bool
+

Check if the first 1000 lines of the given file have a valid delimited format.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – delimiter read options dict(delimiter=”,”)

  • +
+
+
Returns
+

is file a csv file or not

+
+
Return type
+

bool

+
+
+
+
+
+property length: int
+

Return the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+options: Optional[Dict]
+
+
+
+reload(input_file_path: Optional[str] = None, data: Optional[DataFrame] = None, options: Optional[Dict] = None)
+

Reload the data class with a new dataset.

+

This erases all existing data/options and replaces them with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.data.html b/docs/0.12.0/html/dataprofiler.data_readers.data.html new file mode 100644 index 000000000..0c15b3b5e --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.data.html @@ -0,0 +1,312 @@ + + + + + + + + + Data - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data

+

Contains factory class reading various kinds of data.

+
+
+class dataprofiler.data_readers.data.Data(input_file_path: Optional[Union[str, BytesIO]] = None, data: Optional[Any] = None, data_type: Optional[str] = None, options: Optional[Dict] = None)
+

Bases: object

+

Factory class for reading various kinds of data.

+

Create Factory Data object.

+

Auto-detects the data type of input files when not specified. Returns the proper data class, or the specified data class, for the given data or input file.

+
+
Parameters
+
    +
  • input_file_path

  • +
  • data

  • +
  • data_type

  • +
  • options

  • +
+
+
Returns
+

+
+
+
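Example (a minimal sketch; the file names are placeholders, and the reader type is otherwise auto-detected from file content):

from dataprofiler.data_readers.data import Data

auto_detected = Data("records.json")               # returns e.g. a JSONData instance
forced_csv = Data("table.txt", data_type="csv",    # force a specific reader
                  options={"delimiter": "|"})
print(type(auto_detected).__name__)
print(forced_csv.data.head())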
+
+data_classes: List[Dict] = [{'data_class': <class 'dataprofiler.data_readers.json_data.JSONData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.graph_data.GraphData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.csv_data.CSVData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.parquet_data.ParquetData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.avro_data.AVROData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.text_data.TextData'>, 'kwargs': {}}]
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.data_utils.html b/docs/0.12.0/html/dataprofiler.data_readers.data_utils.html new file mode 100644 index 000000000..1b2de3da4 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.data_utils.html @@ -0,0 +1,776 @@ + + + + + + + + + Data Utils - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data Utils

+

Contains functions for data readers.

+
+
+dataprofiler.data_readers.data_utils.data_generator(data_list: List[str]) Generator[str, None, None]
+

Take a list and return a generator on the list.

+
+
Parameters
+

data_list (list) – list of strings

+
+
Returns
+

item from the list

+
+
Return type
+

generator

+
+
+
+
+
+dataprofiler.data_readers.data_utils.generator_on_file(file_object: Union[StringIO, BytesIO]) Generator[Union[str, bytes], None, None]
+

Take a file and return a generator that returns lines.

+
+
Parameters
+

file_path (path) – path to the file

+
+
Returns
+

Line from file

+
+
Return type
+

generator

+
+
+
+
+
+dataprofiler.data_readers.data_utils.convert_int_to_string(x: int) str
+

Convert the given input to string.

+

In particular, if it is an int, it converts it, ensuring there is no . or 00. In addition, if the input is np.nan, the output will be ‘nan’, which is what we need to handle data properly.

+
+
Parameters
+

x (Union[int, float, str, numpy.nan]) –

+
+
Returns
+

+
+
Return type
+

str

+
+
+
+
+
+dataprofiler.data_readers.data_utils.unicode_to_str(data: Union[str, int, float, bool, None, List, Dict], ignore_dicts: bool = False) Union[str, int, float, bool, None, List, Dict]
+

Convert data to string representation if it is a unicode string.

+
+
Parameters
+
    +
  • data (JSONType) – input data

  • +
  • ignore_dicts (boolean) – if set, ignore the dictionary type processing

  • +
+
+
Returns
+

string representation of data

+
+
Return type
+

str

+
+
+
+
+
+dataprofiler.data_readers.data_utils.json_to_dataframe(json_lines: List[Union[str, int, float, bool, None, List, Dict]], selected_columns: Optional[List[str]] = None, read_in_string: bool = False) Tuple[DataFrame, Series]
+

Take list of json objects and return dataframe representing json list.

+
+
Parameters
+
    +
  • json_lines (list(JSONType)) – list of json objects

  • +
  • selected_columns (list(str)) – a list of keys to be processed

  • +
  • read_in_string (bool) – if True, all the values in dataframe will be +converted to string

  • +
+
+
Returns
+

dataframe converted from json list and list of dtypes for each +column

+
+
Return type
+

tuple(pd.DataFrame, pd.Series(dtypes))

+
+
+
+
+
+dataprofiler.data_readers.data_utils.read_json_df(data_generator: Generator, selected_columns: Optional[List[str]] = None, read_in_string: bool = False) Tuple[DataFrame, Series]
+

Return an iterator that returns a chunk of data as dataframe in each call.

+

The source of input to this function is either a file or a list of JSON structured strings. If the file path is given as input, the file is expected to have one JSON structure in each line. Lines that are not valid JSON will be ignored. Therefore, a file with pretty printed JSON objects will not be considered valid JSON. If the input is a data list, it is expected to be a list of strings where each string is a valid JSON object. If an individual object is not valid JSON, it will be ignored.

+

NOTE: both data_list and file_path cannot be passed at the same time.

+
+
Parameters
+
    +
  • data_generator (generator) – The generator you want to read.

  • +
  • selected_columns (list(str)) – a list of keys to be processed

  • +
  • read_in_string (bool) – if True, all the values in dataframe will be +converted to string

  • +
+
+
Returns
+

returns an iterator that returns a chunk of file as dataframe in +each call as well as original dtypes of the dataframe columns.

+
+
Return type
+

tuple(pd.DataFrame, pd.Series(dtypes))

+
+
+
+
+
+dataprofiler.data_readers.data_utils.read_json(data_generator: Iterator, selected_columns: Optional[List[str]] = None, read_in_string: bool = False) List[Union[str, int, float, bool, None, List, Dict]]
+

Return the lines of a json.

+

The source of input to this function is either a file or a list of JSON structured strings. If the file path is given as input, the file is expected to have one JSON structure in each line. Lines that are not valid JSON will be ignored. Therefore, a file with pretty printed JSON objects will not be considered valid JSON. If the input is a data list, it is expected to be a list of strings where each string is a valid JSON object. If an individual object is not valid JSON, it will be ignored.

+

NOTE: both data_list and file_path cannot be passed at the same time.

+
+
Parameters
+
    +
  • data_generator (generator) – The generator you want to read.

  • +
  • selected_columns (list(str)) – a list of keys to be processed

  • +
  • read_in_string (bool) – if True, all the values in dataframe will be +converted to string

  • +
+
+
Returns
+

returns the lines of a json file

+
+
Return type
+

list(dict)

+
+
+
+
+
+dataprofiler.data_readers.data_utils.reservoir(file: TextIOWrapper, sample_nrows: int) list
+

Implement the mathematical logic of Reservoir sampling.

+
+
Parameters
+
    +
  • file (TextIOWrapper) – wrapper of the opened csv file

  • +
  • sample_nrows (int) – number of rows to sample

  • +
+
+
Raises
+

ValueError()

+
+
Returns
+

sampled values

+
+
Return type
+

list

+
+
+
+
+
+dataprofiler.data_readers.data_utils.rsample(file_path: TextIOWrapper, sample_nrows: int, args: dict) StringIO
+

Implement Reservoir Sampling to sample n rows out of a total of M rows.

+
+
Parameters
+
    +
  • file_path (TextIOWrapper) – path of the csv file to be read in

  • +
  • sample_nrows (int) – number of rows being sampled

  • +
  • args (dict) – options to read the csv file

  • +
+
+
+
+
+
+dataprofiler.data_readers.data_utils.read_csv_df(file_path: Union[str, BytesIO, TextIOWrapper], delimiter: Optional[str], header: Optional[int], sample_nrows: Optional[int] = None, selected_columns: List[str] = [], read_in_string: bool = False, encoding: Optional[str] = 'utf-8') DataFrame
+

Read a CSV file in chunks and return dataframe in form of iterator.

+
+
Parameters
+
    +
  • file_path (str) – path to the CSV file.

  • +
  • delimiter (str) – character used to separate csv values.

  • +
  • header (int) – the header row in the csv file.

  • +
  • selected_columns (list(str)) – a list of columns to be processed

  • +
  • read_in_string (bool) – if True, all the values in dataframe will be +converted to string

  • +
+
+
Returns
+

Iterator

+
+
Return type
+

pd.DataFrame

+
+
+
+
+
+dataprofiler.data_readers.data_utils.convert_unicode_col_to_utf8(input_df: DataFrame) DataFrame
+

Convert all unicode columns in input dataframe to utf-8.

+
+
Parameters
+

input_df (pd.DataFrame) – input dataframe

+
+
Returns
+

corrected dataframe

+
+
Return type
+

pd.DataFrame

+
+
+
+
+
+dataprofiler.data_readers.data_utils.sample_parquet(file_path: str, sample_nrows: int, selected_columns: Optional[List[str]] = None, read_in_string: bool = False) Tuple[DataFrame, Series]
+

Read parquet file, sample specified number of rows from it and return a data frame.

+
+
Parameters
+
    +
  • file_path (str) – path to the Parquet file.

  • +
  • sample_nrows (int) – number of rows being sampled

  • +
  • selected_columns (list) – columns need to be read

  • +
  • read_in_string (bool) – return as string type

  • +
+
+
Returns
+

+
+
Return type
+

Iterator(pd.DataFrame)

+
+
+
+
+
+dataprofiler.data_readers.data_utils.read_parquet_df(file_path: str, sample_nrows: Optional[int] = None, selected_columns: Optional[List[str]] = None, read_in_string: bool = False) Tuple[DataFrame, Series]
+

Return an iterator that returns one row group each time.

+
+
Parameters
+
    +
  • file_path (str) – path to the Parquet file.

  • +
  • sample_nrows (int) – number of rows being sampled

  • +
  • selected_columns (list) – columns need to be read

  • +
  • read_in_string (bool) – return as string type

  • +
+
+
Returns
+

+
+
Return type
+

Iterator(pd.DataFrame)

+
+
+
+
+
+dataprofiler.data_readers.data_utils.read_text_as_list_of_strs(file_path: str, encoding: Optional[str] = None) List[str]
+

Return list of strings relative to the chunk size.

+

Each line is 1 chunk.

+
+
Parameters
+

file_path (str) – path to the file

+
+
Returns
+

+
+
Return type
+

list(str)

+
+
+
+
+
+dataprofiler.data_readers.data_utils.detect_file_encoding(file_path: str, buffer_size: int = 1024, max_lines: int = 20) str
+

Determine encoding of files within initial max_lines of length buffer_size.

+
+
Parameters
+
    +
  • file_path (str) – path to the file

  • +
  • buffer_size (int) – buffer length for each line being read

  • +
  • max_lines (int) – number of lines to read from file of length buffer_size

  • +
+
+
Returns
+

encoding type

+
+
Return type
+

str

+
+
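Example (a minimal sketch; "data.csv" is a placeholder path):

from dataprofiler.data_readers.data_utils import detect_file_encoding

encoding = detect_file_encoding("data.csv", buffer_size=1024, max_lines=20)
print(encoding)   # e.g. "utf-8" or "ascii"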
+
+
+
+dataprofiler.data_readers.data_utils.detect_cell_type(cell: str) str
+

Detect the cell type (int, float, etc).

+
+
Parameters
+

cell (str) – String designated for data type detection

+
+
+
+
+
+dataprofiler.data_readers.data_utils.get_delimiter_regex(delimiter: str = ',', quotechar: str = ',') Pattern[str]
+

Build regex for delimiter checks.

+
+
Parameters
+
    +
  • delimiter (str) – Delimiter to be added to regex

  • +
  • quotechar – Quotechar to be added to regex

  • +
+
+
+
+
+
+dataprofiler.data_readers.data_utils.find_nth_loc(string: Optional[str] = None, search_query: Optional[str] = None, n: int = 0, ignore_consecutive: bool = True) Tuple[int, int]
+

Search string via search_query and return nth index in which query occurs.

+

If there are fewer than ‘n’ occurrences, the last location found is returned.

+
+
Parameters
+
    +
  • string (str) – Input string, to be searched

  • +
  • search_query (str) – char(s) to find nth occurrence of

  • +
  • n (int) – The number of occurrences to iterate through

  • +
  • ignore_consecutive (bool) – Ignore consecutive matches in the search query.

  • +
+
+
Return idx
+

Index of the nth or last occurrence of the search_query

+
+
Rtype idx
+

int

+
+
Return id_count
+

Number of identifications prior to idx

+
+
Rtype id_count
+

int

+
+
+
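Example (a minimal sketch of the return values; consecutive matches are ignored by default):

from dataprofiler.data_readers.data_utils import find_nth_loc

# Locate the 2nd comma in the string and count how many were identified up to that point.
idx, id_count = find_nth_loc("a,b,,c", search_query=",", n=2)
print(idx, id_count)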
+
+
+dataprofiler.data_readers.data_utils.load_as_str_from_file(file_path: str, file_encoding: Optional[str] = None, max_lines: int = 10, max_bytes: int = 65536, chunk_size_bytes: int = 1024) str
+

Load data from a csv file up to a specific line OR byte_size.

+
+
Parameters
+
    +
  • file_path (str) – Path to file to load data from

  • +
  • file_encoding (str) – File encoding

  • +
  • max_lines (int) – Maximum number of lines to load from file

  • +
  • max_bytes (int) – Maximum number of bytes to load from file

  • +
  • chunk_size_bytes (int) – Chunk size to load every data load

  • +
+
+
Returns
+

Data as string

+
+
Return type
+

str

+
+
+
+
+
+dataprofiler.data_readers.data_utils.is_valid_url(url_as_string: Any) typing_extensions.TypeGuard[Url]
+

Determine whether a given string is a valid URL.

+
+
Parameters
+

url_as_string (str) – string to be tested if URL

+
+
Returns
+

true if string is a valid URL

+
+
Return type
+

boolean

+
+
+
+
+
+dataprofiler.data_readers.data_utils.url_to_bytes(url_as_string: Url, options: Dict) BytesIO
+

Read in a URL and convert it to a byte stream.

+
+
Parameters
+
    +
  • url_as_string (str) – string to read as URL

  • +
  • options (dict) – options for the url

  • +
+
+
Returns
+

BytesIO stream of data downloaded from URL

+
+
Return type
+

BytesIO stream

+
+
+
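Example (a minimal sketch combining is_valid_url and url_to_bytes; the URL is a placeholder and the download requires network access):

from dataprofiler.data_readers.data_utils import is_valid_url, url_to_bytes

url = "https://example.com/data.csv"
if is_valid_url(url):
    byte_stream = url_to_bytes(url, options={})   # BytesIO with the downloaded content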
+
+
+class dataprofiler.data_readers.data_utils.S3Helper
+

Bases: object

+

A utility class for working with Amazon S3.

+
+
This class provides methods to check if a path is an S3 URI and to create an S3 client.

+
+
+
+
+static is_s3_uri(path: str, logger: Logger) bool
+

Check if the given path is an S3 URI.

+

This function checks for common S3 URI prefixes “s3://” and “s3a://”.

+
+
Parameters
+
    +
  • path (str) – The path to check for an S3 URI.

  • +
  • logger (logging.Logger) – The logger instance for logging.

  • +
+
+
Returns
+

True if the path is an S3 URI, False otherwise.

+
+
Return type
+

bool

+
+
+
+
+
+static create_s3_client(aws_access_key_id: Optional[str] = None, aws_secret_access_key: Optional[str] = None, aws_session_token: Optional[str] = None, region_name: Optional[str] = None) client
+

Create and return an S3 client.

+
+
Parameters
+
    +
  • aws_access_key_id (str) – The AWS access key ID.

  • +
  • aws_secret_access_key (str) – The AWS secret access key.

  • +
  • aws_session_token (str) – The AWS session token +(optional, typically used for temporary credentials).

  • +
  • region_name (str) – The AWS region name (default is ‘us-east-1’).

  • +
+
+
Returns
+

A S3 client instance.

+
+
Return type
+

boto3.client

+
+
+
+
+
+static get_s3_uri(s3_uri: str, s3_client: client) BytesIO
+

Download an object from an S3 URI and return its content as BytesIO.

+
+
Parameters
+
    +
  • s3_uri (str) – The S3 URI specifying the location of the object to download.

  • +
  • s3_client (boto3.client) – An initialized AWS S3 client +for accessing the S3 service.

  • +
+
+
Returns
+

+
A BytesIO object containing the content of the downloaded S3 object.

+
+
+

+
+
Return type
+

BytesIO

+
+
+
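Example (a minimal sketch of the three static methods together; the bucket URI is a placeholder and valid AWS credentials are assumed):

import logging

from dataprofiler.data_readers.data_utils import S3Helper

logger = logging.getLogger(__name__)
uri = "s3://my-bucket/path/data.csv"

if S3Helper.is_s3_uri(uri, logger):
    s3_client = S3Helper.create_s3_client(region_name="us-east-1")
    byte_stream = S3Helper.get_s3_uri(uri, s3_client)   # BytesIO of the object's content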
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.filepath_or_buffer.html b/docs/0.12.0/html/dataprofiler.data_readers.filepath_or_buffer.html new file mode 100644 index 000000000..596677ed8 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.filepath_or_buffer.html @@ -0,0 +1,323 @@ + + + + + + + + + Filepath Or Buffer - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Filepath Or Buffer

+

Contains functions and classes for handling filepaths and buffers.

+
+
+dataprofiler.data_readers.filepath_or_buffer.is_stream_buffer(filepath_or_buffer: Any) typing_extensions.TypeGuard[Union[StringIO, BytesIO]]
+

Determine whether a given argument is a filepath or buffer.

+
+
Parameters
+

filepath_or_buffer (str) – path to the file or buffer

+
+
Returns
+

true if string is a buffer or false if string is a filepath

+
+
Return type
+

boolean

+
+
+
+
+
+class dataprofiler.data_readers.filepath_or_buffer.FileOrBufferHandler(filepath_or_buffer: Union[str, StringIO, BytesIO], open_method: str = 'r', encoding: Optional[str] = None, seek_offset: Optional[int] = None, seek_whence: int = 0)
+

Bases: object

+

FileOrBufferHandler class to read a filepath or buffer in.

+

Always returns a readable buffer.

+

Initialize Context manager class.

+

Used for inputting a file or buffer and returning +a structure that is always a buffer.

+
+
Parameters
+
    +
  • filepath_or_buffer (Union[str, StringIO, BytesIO]) – path to the file being loaded or buffer

  • +
  • open_method (string) – value describes the mode the file is opened in

  • +
  • seek_offset (int) – offset from start of the stream

  • +
+
+
Returns
+

TextIOBase or BufferedIOBase class/subclass

+
+
+
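Example (a minimal sketch; the same context manager accepts either a file path or an in-memory buffer):

from io import StringIO

from dataprofiler.data_readers.filepath_or_buffer import (
    FileOrBufferHandler,
    is_stream_buffer,
)

buffer = StringIO("a,b\n1,2\n")
print(is_stream_buffer(buffer))   # True for StringIO/BytesIO, False for a path string

with FileOrBufferHandler(buffer, open_method="r") as stream:
    print(stream.read())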
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.graph_data.html b/docs/0.12.0/html/dataprofiler.data_readers.graph_data.html new file mode 100644 index 000000000..39dc51650 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.graph_data.html @@ -0,0 +1,417 @@ + + + + + + + + + Graph Data - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Graph Data

+

Contains class for identifying, reading, and loading graph data.

+
+
+class dataprofiler.data_readers.graph_data.GraphData(input_file_path: Optional[str] = None, data: Optional[Graph] = None, options: Optional[Dict] = None)
+

Bases: BaseData

+

GraphData class to identify, read, and load graph data.

+

Initialize Data class for identifying, reading, and loading graph data.

+

The current implementation only accepts a file path as input. An options parameter is also passed in to specify properties of the input file.

+

Possible Options:

+
options = dict(
+    delimiter= type: str
+    column_names= type: list(str)
+    source_node= type: int
+    destination_node= type: int
+    target_keywords= type: list(str)
+    source_keywords= type: list(str)
+    header= type: any
+    quotechar= type: str
+)
+
+
+

delimiter: delimiter used to decipher the csv input file
column_names: list of column names of the csv
source_node: index of the source node column, range of (0, n-1)
destination_node: index of the destination node column, range of (0, n-1)
target_keywords: list of keywords to identify target/destination node col
source_keywords: list of keywords to identify source node col
graph_keywords: list of keywords to identify if data has graph data
header: location of the header in the file
quotechar: quote character used in the delimited file

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
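Example (a minimal sketch for an attributed edge list; the file name and option values are illustrative placeholders):

from dataprofiler.data_readers.graph_data import GraphData

options = {
    "delimiter": ",",
    "source_node": 0,        # column index of the source node
    "destination_node": 1,   # column index of the destination node
    "header": 0,
}
if GraphData.is_match("edges.csv"):
    graph_data = GraphData("edges.csv", options=options)
    graph = graph_data.data  # loaded graph object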
+
+data_type: str = 'graph'
+
+
+
+classmethod csv_column_names(file_path: str, header: Optional[int], delimiter: Optional[str], encoding: str = 'utf-8') List[str]
+

Fetch a list of column names from the csv file.

+
+
+
+classmethod is_match(file_path: str, options: Optional[Dict] = None) bool
+

Determine whether the file is a graph.

+
+
Current formats checked:
    +
  • attributed edge list

  • +
+
+
+

This works by finding whether the file contains a target and a source node

+
+
+
+check_integer(string: str) Union[int, str]
+

Check whether string is integer and output integer.

+
+
+
+property data
+

Return data.

+
+
+
+property data_format: Optional[str]
+

Return data format.

+
+
+
+property file_encoding: Optional[str]
+

Return file encoding.

+
+
+
+get_batch_generator(batch_size: int) Generator[Union[DataFrame, List], None, None]
+

Get batch generator.

+
+
+
+info: Optional[str] = None
+
+
+
+property is_structured: bool
+

Determine compatibility with StructuredProfiler.

+
+
+
+property length: int
+

Return the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+reload(input_file_path: Optional[str], data: Any, options: Optional[Dict]) None
+

Reload the data class with a new dataset.

+

This erases all existing data/options and replaces them with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+options: Optional[Dict]
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.html b/docs/0.12.0/html/dataprofiler.data_readers.html new file mode 100644 index 000000000..a52fef9b9 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.html @@ -0,0 +1,326 @@ + + + + + + + + + Data Readers - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+ + +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.json_data.html b/docs/0.12.0/html/dataprofiler.data_readers.json_data.html new file mode 100644 index 000000000..4195de0f8 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.json_data.html @@ -0,0 +1,422 @@ + + + + + + + + + JSON Data - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

JSON Data

+

Contains class to save and load json data.

+
+
+class dataprofiler.data_readers.json_data.JSONData(input_file_path: Optional[str] = None, data: Optional[Union[DataFrame, str]] = None, options: Optional[Dict] = None)
+

Bases: SpreadSheetDataMixin, BaseData

+

SpreadsheetData class to save and load spreadsheet data.

+

Initialize Data class for loading datasets of type JSON.

+

Can be specified by passing in-memory data or via a file path. Options pertaining to JSON may also be specified using the options dict parameter. Possible options:

+
options = dict(
+    data_format= type: str, choices: "dataframe", "records", "json",
+     "flattened_dataframe"
+    selected_keys= type: list(str)
+    payload_keys= type: Union[str, list(str)]
+)
+
+
+

data_format: user selected format in which to return data; can only be of the specified types
selected_keys: keys being selected from the entire dataset
payload_keys: list of dictionary keys that determine the payload

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
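Example (a minimal sketch; the file name and selected keys are illustrative placeholders):

from dataprofiler.data_readers.json_data import JSONData

options = {
    "data_format": "flattened_dataframe",     # or "dataframe", "records", "json"
    "selected_keys": ["user", "event_type"],
}
json_data = JSONData("events.json", options=options)
print(json_data.data.head())
print(json_data.selected_keys)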
+
+data_type: str = 'json'
+
+
+
+property selected_keys: Optional[List[str]]
+

Return selected keys.

+
+
+
+property metadata: Optional[DataFrame]
+

Return a data frame that contains the metadata.

+
+
+
+property data_and_metadata: Optional[DataFrame]
+

Return a data frame that joins the data and the metadata.

+
+
+
+property is_structured
+

Determine compatibility with StructuredProfiler.

+
+
+
+classmethod is_match(file_path: Union[str, StringIO], options: Optional[Dict] = None) bool
+

Test whether the first 1000 lines of the file have valid JSON format.

+

At least 60 percent of the first 1000 lines have to be valid JSON.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – json read options

  • +
+
+
Returns
+

is file a json file or not

+
+
Return type
+

bool

+
+
+
+
+
+property data
+

Return data.

+
+
+
+property data_format: Optional[str]
+

Return data format.

+
+
+
+property file_encoding: Optional[str]
+

Return file encoding.

+
+
+
+get_batch_generator(batch_size: int) Generator[Union[DataFrame, List], None, None]
+

Get batch generator.

+
+
+
+info: Optional[str] = None
+
+
+
+property length: int
+

Return the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+reload(input_file_path: Optional[str] = None, data: Optional[Union[DataFrame, str]] = None, options: Optional[Dict] = None) None
+

Reload the data class with a new dataset.

+

This erases all existing data/options and replaces them with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+options: Optional[Dict]
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.parquet_data.html b/docs/0.12.0/html/dataprofiler.data_readers.parquet_data.html new file mode 100644 index 000000000..33ade7d1a --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.parquet_data.html @@ -0,0 +1,412 @@ + + + + + + + + + Parquet Data - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Parquet Data

+

Contains class to save and load parquet data.

+
+
+class dataprofiler.data_readers.parquet_data.ParquetData(input_file_path: Optional[str] = None, data: Optional[Union[DataFrame, str]] = None, options: Optional[Dict] = None)
+

Bases: SpreadSheetDataMixin, BaseData

+

SpreadsheetData class to save and load parquet data.

+

Initialize Data class for loading datasets of type PARQUET.

+

Can be specified by passing in-memory data or via a file path. Options pertaining to PARQUET may also be specified using the options dict parameter. Possible options:

+
options = dict(
+    data_format= type: str, choices: "dataframe", "records", "json"
+    selected_columns= type: list(str)
+    header= type: any
+)
+
+
+

data_format: user selected format in which to return data; can only be of the specified types
selected_columns: columns being selected from the entire dataset

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
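Example (a minimal sketch; the file name and column names are illustrative placeholders):

from dataprofiler.data_readers.parquet_data import ParquetData

options = {
    "data_format": "dataframe",          # or "records", "json"
    "selected_columns": ["id", "value"],
}
parquet_data = ParquetData("table.parquet", options=options)
print(parquet_data.length)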
+
+data_type: str = 'parquet'
+
+
+
+property file_encoding: None
+

Set file encoding to None since it is not detected for parquet.

+
+
+
+property selected_columns: List[str]
+

Return selected columns.

+
+
+
+property sample_nrows: Optional[int]
+

Return sample_nrows.

+
+
+
+property is_structured: bool
+

Determine compatibility with StructuredProfiler.

+
+
+
+classmethod is_match(file_path: Union[str, StringIO, BytesIO], options: Optional[Dict] = None) bool
+

Test the given file to check if the file has valid Parquet format.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – parquet read options

  • +
+
+
Returns
+

is file a parquet file or not

+
+
Return type
+

bool

+
+
+
+
+
+property data
+

Return data.

+
+
+
+property data_format: Optional[str]
+

Return data format.

+
+
+
+get_batch_generator(batch_size: int) Generator[Union[DataFrame, List], None, None]
+

Get batch generator.

+
+
+
+info: Optional[str] = None
+
+
+
+property length: int
+

Return the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+reload(input_file_path: Optional[str] = None, data: Optional[Any] = None, options: Optional[Dict] = None) None
+

Reload the data class with a new dataset.

+

This erases all existing data/options and replaces them with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+options: Optional[Dict]
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.structured_mixins.html b/docs/0.12.0/html/dataprofiler.data_readers.structured_mixins.html new file mode 100644 index 000000000..395e2698b --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.structured_mixins.html @@ -0,0 +1,306 @@ + + + + + + + + + Structured Mixins - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Structured Mixins

+

Contains mixin data class for loading datasets of type SpreadSheet.

+
+
+class dataprofiler.data_readers.structured_mixins.SpreadSheetDataMixin(input_file_path: Optional[str], data: Any, options: Dict)
+

Bases: object

+

Mixin data class for loading datasets of type SpreadSheet.

+

Data can be specified by passing in-memory data or via a file path. Adds specialized functions for loading data from a string or file.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+

Initialize spreadsheet mixin object.

+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.data_readers.text_data.html b/docs/0.12.0/html/dataprofiler.data_readers.text_data.html new file mode 100644 index 000000000..5db1e3432 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.data_readers.text_data.html @@ -0,0 +1,412 @@ + + + + + + + + + Text Data - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Text Data

+

Contains class for saving and loading text files.

+
+
+class dataprofiler.data_readers.text_data.TextData(input_file_path: Optional[str] = None, data: Optional[List[str]] = None, options: Optional[Dict] = None)
+

Bases: BaseData

+

TextData class to save and load text files.

+

Initialize Data class for loading datasets of type TEXT.

+

Can be specified by passing in-memory data or via a file path. Options pertaining to TEXT may also be specified using the options dict parameter. Possible options:

+
options = dict(
+    data_format= type: str, choices: "text"
+    samples_per_line= type: int
+)
+
+
+

data_format: user selected format in which to return data; can only be of the specified types
samples_per_line: chunks by which to read in the specified dataset

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
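Example (a minimal sketch; the file name and samples_per_line value are illustrative placeholders):

from dataprofiler.data_readers.text_data import TextData

text_data = TextData(
    "notes.txt",
    options={"data_format": "text", "samples_per_line": 1000},
)
print(text_data.is_structured)   # text files are treated as unstructured data
print(text_data.length)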
+
+data_type: str = 'text'
+
+
+
+property samples_per_line: int
+

Return samples per line.

+
+
+
+property is_structured: bool
+

Determine compatibility with StructuredProfiler.

+
+
+
+tokenize() None
+

Tokenize data.

+
+
+
+classmethod is_match(file_path: str, options: Optional[Dict] = None) bool
+

Return True if all are text files.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – text file read options

  • +
+
+
Returns
+

is file a text file or not

+
+
Return type
+

bool

+
+
+
+
+
+reload(input_file_path: Optional[str] = None, data: Optional[List[str]] = None, options: Optional[Dict] = None) None
+

Reload the data class with a new dataset.

+

This erases all existing data/options and replaces them with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+property data
+

Return data.

+
+
+
+property data_format: Optional[str]
+

Return data format.

+
+
+
+property file_encoding: Optional[str]
+

Return file encoding.

+
+
+
+get_batch_generator(batch_size: int) Generator[Union[DataFrame, List], None, None]
+

Get batch generator.

+
+
+
+info: Optional[str] = None
+
+
+
+property length: int
+

Return the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+options: Optional[Dict]
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.dp_logging.html b/docs/0.12.0/html/dataprofiler.dp_logging.html new file mode 100644 index 000000000..c6bc962cf --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.dp_logging.html @@ -0,0 +1,300 @@ + + + + + + + + + Dp Logging - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Dp Logging

+

Utility funcs allowing alteration of logging level/verbosity within dp lib.

+
+
+dataprofiler.dp_logging.get_logger()
+

Access DataProfiler-specific logger.

+
+
+
+dataprofiler.dp_logging.set_verbosity(level)
+

Set verbosity level for DataProfiler logger.

+

Must set it to one of the following values: [logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR, logging.CRITICAL]

+
+
+
Parameters
+

level (int) – Verbosity level from logging module

+
+
+
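Example (a minimal sketch of adjusting the library's verbosity):

import logging

from dataprofiler import dp_logging

dp_logging.set_verbosity(logging.WARNING)   # silence INFO/DEBUG messages from the library
logger = dp_logging.get_logger()
logger.warning("still visible at WARNING level")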
+
+
+dataprofiler.dp_logging.get_child_logger(name)
+

Return logger for the given filepath.

+
+
Parameters
+

name (str) – name of file in need of accessing child logger

+
+
Returns
+

Logger instance for given file

+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.html b/docs/0.12.0/html/dataprofiler.html new file mode 100644 index 000000000..5de5ca3e6 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.html @@ -0,0 +1,392 @@ + + + + + + + + + Dataprofiler - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+ + +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.labelers.base_data_labeler.html b/docs/0.12.0/html/dataprofiler.labelers.base_data_labeler.html new file mode 100644 index 000000000..661c46f63 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.labelers.base_data_labeler.html @@ -0,0 +1,911 @@ + + + + + + + + + Base Data Labeler - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Data Labeler

+

Contains abstract classes from which labeler classes will inherit.

+
+
+class dataprofiler.labelers.base_data_labeler.BaseDataLabeler(dirpath: Optional[str] = None, load_options: Optional[dict] = None)
+

Bases: object

+

Parent class for data labeler objects.

+

Initialize DataLabeler class.

+
+
Parameters
+
    +
  • dirpath – path to data labeler

  • +
  • load_options – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
+
+
+help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict
+

Retrieve the label encodings.

+
+
Returns
+

dictionary for associating labels to indexes

+
+
+
+
+
+property reverse_label_mapping: dict
+

Retrieve the index to label encoding.

+
+
Returns
+

dictionary for associating indexes to labels

+
+
+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+property preprocessor: data_processing.BaseDataPreprocessor | None
+

Retrieve the data preprocessor.

+
+
Returns
+

returns the preprocessor instance

+
+
+
+
+
+property model: BaseModel
+

Retrieve the data labeler model.

+
+
Returns
+

returns the model instance

+
+
+
+
+
+property postprocessor: data_processing.BaseDataPostprocessor | None
+

Retrieve the data postprocessor.

+
+
Returns
+

returns the postprocessor instance

+
+
+
+
+
+set_params(params: dict) None
+

Allow user to set parameters of pipeline components.

+
+
Done in the following format:

params = dict(
    preprocessor=dict(…),
    model=dict(…),
    postprocessor=dict(…),
)

where the key/value pairs for each pipeline component must match parameters that exist in their components.

+
+
Parameters
+

params (dict) –

dictionary containing a key for a given pipeline +component and its associated value of parameters as such:

+
+

dict(preprocessor=dict(…), model=dict(…), +postprocessor=dict(…))

+
+

+
+
Returns
+

None

+
+
+
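Example (a minimal sketch; the labeler name and the parameter names are illustrative placeholders and must match parameters that actually exist in your pipeline's components):

from dataprofiler.labelers.base_data_labeler import BaseDataLabeler

labeler = BaseDataLabeler.load_from_library("unstructured_model")
labeler.set_params(
    dict(
        preprocessor=dict(max_length=3400),            # placeholder preprocessor setting
        model=dict(max_char_encoding_id=127),          # placeholder model setting
        postprocessor=dict(use_word_level_argmax=True) # placeholder postprocessor setting
    )
)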
+
+
+add_label(label: str, same_as: Optional[str] = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+set_labels(labels: list | dict) None
+

Set the labels for the data labeler.

+
+
Parameters
+

labels (list or dict) – new labels in either encoding list or dict

+
+
Returns
+

None

+
+
+
+
+
+predict(data: Union[DataFrame, Series, ndarray], batch_size: int = 32, predict_options: Optional[dict[str, bool]] = None, error_on_mismatch: bool = False, verbose: bool = True) dict
+

Predict labels of input data based with the data labeler model.

+
+
Parameters
+
    +
  • data (Union[pd.DataFrame, pd.Series, np.ndarray]) – data to be predicted upon

  • +
  • batch_size (int) – batch size of prediction

  • +
  • predict_options (Dict[str, bool]) – optional parameters to allow for predict as a +dict, i.e. dict(show_confidences=True)

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

predictions

+
+
Return type
+

Dict

+
+
+
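Example (a minimal sketch; "unstructured_model" refers to a labeler expected in the installed labeler library and is a placeholder for whichever labeler is available in your environment):

import pandas as pd

from dataprofiler.labelers.base_data_labeler import BaseDataLabeler

labeler = BaseDataLabeler.load_from_library("unstructured_model")
samples = pd.Series(["John Doe, 123-45-6789, jdoe@example.com"])
results = labeler.predict(
    samples,
    batch_size=32,
    predict_options=dict(show_confidences=True),
)
print(results.keys())   # predictions and, when requested, confidences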
+
+
+set_preprocessor(data_processor: BaseDataPreprocessor) None
+

Set the data preprocessor for the data labeler.

+
+
Parameters
+

data_processor (data_processing.BaseDataPreprocessor) – processor to set as the preprocessor

+
+
Returns
+

None

+
+
+
+
+
+set_model(model: BaseModel) None
+

Set the model for the data labeler.

+
+
Parameters
+

model (base_model.BaseModel) – model to use within the data labeler

+
+
Returns
+

None

+
+
+
+
+
+set_postprocessor(data_processor: BaseDataPostprocessor) None
+

Set the data postprocessor for the data labeler.

+
+
Parameters
+

data_processor (data_processing.BaseDataPostprocessor) – processor to set as the postprocessor

+
+
Returns
+

None

+
+
+
+
+
+check_pipeline(skip_postprocessor: bool = False, error_on_mismatch: bool = False) None
+

Check whether the processors and models connect together without error.

+
+
Parameters
+
    +
  • skip_postprocessor (bool) – skip checking postprocessor is valid in +pipeline

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_library(name: str) BaseDataLabeler
+

Load the data labeler from the data labeler zoo in the library.

+
+
Parameters
+

name (str) – name of the data labeler.

+
+
Returns
+

DataLabeler class

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+classmethod load_from_disk(dirpath: str, load_options: Optional[dict] = None) BaseDataLabeler
+

Load the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
Returns
+

DataLabeler class

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+classmethod load_with_components(preprocessor: BaseDataPreprocessor, model: BaseModel, postprocessor: BaseDataPostprocessor) BaseDataLabeler
+

Load the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

loaded BaseDataLabeler

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+save_to_disk(dirpath: str) None
+

Save the data labeler to the specified location.

+
+
Parameters
+

dirpath (str) – location to save the data labeler.

+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.labelers.base_data_labeler.TrainableDataLabeler(dirpath: Optional[str] = None, load_options: Optional[dict] = None)
+

Bases: BaseDataLabeler

+

Subclass of BaseDataLabeler that can be trained.

+

Initialize DataLabeler class.

+
+
Parameters
+
    +
  • dirpath – path to data labeler

  • +
  • load_options – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
+
+
+fit(x: DataArray, y: DataArray, validation_split: float = 0.2, labels: list | dict | None = None, reset_weights: bool = False, batch_size: int = 32, epochs: int = 1, error_on_mismatch: bool = False) list
+

Fit the data labeler model for the dataset.

+
+
Parameters
+
    +
  • x (Union[pd.DataFrame, pd.Series, np.ndarray]) – samples to fit model

  • +
  • y (Union[pd.DataFrame, pd.Series, np.ndarray]) – labels associated with the samples to fit model

  • +
  • validation_split (float) – split of the data to have as cross-validation +data

  • +
  • labels (Union[list, dict]) – Encoding or number of labels if refit is needed to new +labels

  • +
  • reset_weights (bool) – Flag to determine whether or not to reset the +weights

  • +
  • batch_size (int) – Size of each batch sent to data labeler model

  • +
  • epochs (int) – number of epochs to iterate over the dataset and send to +the model

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

model output

+
+
+
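Example (a minimal sketch; the directory path, sample data, and label names are illustrative placeholders, and the exact shape expected for x and y depends on the configured preprocessor):

import pandas as pd

from dataprofiler.labelers.base_data_labeler import TrainableDataLabeler

labeler = TrainableDataLabeler.load_from_disk("path/to/saved_labeler")

x = pd.DataFrame({"text": ["alice@example.com", "555-01-2345"]})
y = pd.DataFrame({"labels": ["EMAIL_ADDRESS", "SSN"]})

model_output = labeler.fit(
    x=x, y=y, validation_split=0.2, batch_size=32, epochs=2
)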
+
+
+set_model(model: BaseModel) None
+

Set the model for a trainable data labeler.

+

Model must have a train function to be able to be set.

+
+
Parameters
+

model (base_model.BaseModel) – model to use within the data labeler

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_with_components(preprocessor: BaseDataPreprocessor, model: BaseModel, postprocessor: BaseDataPostprocessor) TrainableDataLabeler
+

Load the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

loaded TrainableDataLabeler

+
+
Return type
+

TrainableDataLabeler

+
+
+
+
+
+add_label(label: str, same_as: Optional[str] = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+check_pipeline(skip_postprocessor: bool = False, error_on_mismatch: bool = False) None
+

Check whether the processors and models connect together without error.

+
+
Parameters
+
    +
  • skip_postprocessor (bool) – skip checking postprocessor is valid in +pipeline

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

None

+
+
+
+
+
+help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict
+

Retrieve the label encodings.

+
+
Returns
+

dictionary for associating labels to indexes

+
+
+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+classmethod load_from_disk(dirpath: str, load_options: Optional[dict] = None) BaseDataLabeler
+

Load the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
Returns
+

DataLabeler class

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+classmethod load_from_library(name: str) BaseDataLabeler
+

Load the data labeler from the data labeler zoo in the library.

+
+
Parameters
+

name (str) – name of the data labeler.

+
+
Returns
+

DataLabeler class

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+property model: BaseModel
+

Retrieve the data labeler model.

+
+
Returns
+

returns the model instance

+
+
+
+
+
+property postprocessor: data_processing.BaseDataPostprocessor | None
+

Retrieve the data postprocessor.

+
+
Returns
+

returns the postprocessor instance

+
+
+
+
+
+predict(data: Union[DataFrame, Series, ndarray], batch_size: int = 32, predict_options: Optional[dict[str, bool]] = None, error_on_mismatch: bool = False, verbose: bool = True) dict
+

Predict labels of input data based with the data labeler model.

+
+
Parameters
+
    +
  • data (Union[pd.DataFrame, pd.Series, np.ndarray]) – data to be predicted upon

  • +
  • batch_size (int) – batch size of prediction

  • +
  • predict_options (Dict[str, bool]) – optional parameters to allow for predict as a +dict, i.e. dict(show_confidences=True)

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

predictions

+
+
Return type
+

Dict

+
+
+
+
+
+property preprocessor: data_processing.BaseDataPreprocessor | None
+

Retrieve the data preprocessor.

+
+
Returns
+

returns the preprocessor instance

+
+
+
+
+
+property reverse_label_mapping: dict
+

Retrieve the index to label encoding.

+
+
Returns
+

dictionary for associating indexes to labels

+
+
+
+
+
+save_to_disk(dirpath: str) None
+

Save the data labeler to the specified location.

+
+
Parameters
+

dirpath (str) – location to save the data labeler.

+
+
Returns
+

None

+
+
+
+
+
+set_labels(labels: list | dict) None
+

Set the labels for the data labeler.

+
+
Parameters
+

labels (list or dict) – new labels in either encoding list or dict

+
+
Returns
+

None

+
+
+
+
+
+set_params(params: dict) None
+

Allow user to set parameters of pipeline components.

+
+
Done in the following format:

params = dict(
    preprocessor=dict(…),
    model=dict(…),
    postprocessor=dict(…),
)

where the key/value pairs for each pipeline component must match parameters that exist in their components.

+
+
Parameters
+

params (dict) –

dictionary containing a key for a given pipeline +component and its associated value of parameters as such:

+
+

dict(preprocessor=dict(…), model=dict(…), +postprocessor=dict(…))

+
+

+
+
Returns
+

None

+
+
+
+
+
+set_postprocessor(data_processor: BaseDataPostprocessor) None
+

Set the data postprocessor for the data labeler.

+
+
Parameters
+

data_processor (data_processing.BaseDataPostprocessor) – processor to set as the postprocessor

+
+
Returns
+

None

+
+
+
+
+
+set_preprocessor(data_processor: BaseDataPreprocessor) None
+

Set the data preprocessor for the data labeler.

+
+
Parameters
+

data_processor (data_processing.BaseDataPreprocessor) – processor to set as the preprocessor

+
+
Returns
+

None

+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.labelers.base_model.html b/docs/0.12.0/html/dataprofiler.labelers.base_model.html new file mode 100644 index 000000000..7d4fce876 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.labelers.base_model.html @@ -0,0 +1,677 @@ + + + + + + + + + Base Model - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Model

+

Contains abstract classes for labeling data.

+
+
+class dataprofiler.labelers.base_model.AutoSubRegistrationMeta(clsname: str, bases: tuple[type, ...], attrs: dict[str, object])
+

Bases: ABCMeta

+

For registering subclasses.

+

Create auto registration object and return new class.

+
+
+mro()
+

Return a type’s method resolution order.

+
+
+
+register(subclass)
+

Register a virtual subclass of an ABC.

+

Returns the subclass, to allow usage as a class decorator.

+
+
+
+
+class dataprofiler.labelers.base_model.BaseModel(label_mapping: list | dict, parameters: dict)
+

Bases: object

+

For labeling data.

+

Initialize Base Model.

+

Only the model and model parameters are stored here. The label_mapping parameter is the label mapping of the model, or a list of labels to be converted into the label mapping.

+
+
+
Parameters
+

parameters (dict) – Contains all the appropriate parameters for the model. +Must contain num_labels.

+
+
Returns
+

None

+
+
+
+
+requires_zero_mapping: bool = False
+
+
+
+property label_mapping: dict[str, int]
+

Return mapping of labels to their encoded values.

+
+
+
+property reverse_label_mapping: dict[int, str]
+

Return reversed order of current labels.

+

Useful for when needed to extract Labels via indices.

+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+property num_labels: int
+

Return max label mapping.

+
+
+
+classmethod get_class(class_name: str) type[BaseModel] | None
+

Get subclasses.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (List[str]) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+add_label(label: str, same_as: str | None = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+set_label_mapping(label_mapping: list[str] | dict[str, int]) None
+

Set the labels for the model.

+
+
Parameters
+

label_mapping (Union[list, dict]) – label mapping of the model or list of labels to be +converted into the label mapping

+
+
Returns
+

None

+
+
+
+
+
+classmethod help() None
+

Help describe alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+abstract reset_weights() None
+

Reset the weights of the model.

+
+
Returns
+

None

+
+
+
+
+
+abstract predict(data: Union[DataFrame, Series, ndarray], batch_size: int, show_confidences: bool, verbose: bool) dict
+

Predict the data with the current model.

+
+
Parameters
+
    +
  • data (iterator of data to process) – model input data to predict on

  • +
  • batch_size (int) – number of samples in the batch of data

  • +
  • show_confidences (bool) – whether user wants prediction confidences

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

char level predictions and confidences

+
+
Return type
+

dict

+
+
+
+
+
+abstract classmethod load_from_disk(dirpath: str) BaseModel
+

Load whole model from disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to load the model from

+
+
Returns
+

loaded model

+
+
Return type
+

BaseModel

+
+
+
+
+
+abstract save_to_disk(dirpath: str) None
+

Save whole model to disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to save the model to

+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.labelers.base_model.BaseTrainableModel(label_mapping: list | dict, parameters: dict)
+

Bases: BaseModel

+

Contains abstract method for training models.

+

Initialize Base Model.

+

Only the model and model parameters are stored here. The label_mapping parameter is the label mapping of the model, or a list of labels to be converted into the label mapping.

+
+
+
Parameters
+

parameters (dict) – Contains all the appropriate parameters for the model. +Must contain num_labels.

+
+
Returns
+

None

+
+
+
+
+abstract fit(train_data: DataArray, val_data: DataArray, batch_size: int | None = None, epochs: int | None = None, label_mapping: dict[str, int] | None = None, reset_weights: bool = False, verbose: bool = True) tuple[dict, float | None, dict]
+

Train the current model with the training data and validation data.

+
+
Parameters
+
    +
  • train_data (Union[pd.DataFrame, pd.Series, np.ndarray]) – Training data used to train model

  • +
  • val_data (Union[pd.DataFrame, pd.Series, np.ndarray]) – Validation data used to validate the training

  • +
  • batch_size (int) – Used to determine number of samples in each batch

  • +
  • epochs (int) – Used to determine how many epochs to run

  • +
  • label_mapping (dict) – Mapping of the labels

  • +
  • reset_weights (bool) – Flag to determine whether or not to reset the +model’s weights

  • +
+
+
Returns
+

history, f1, f1_report

+
+
Return type
+

Tuple[dict, float, dict]

+
+
+
+
+
+add_label(label: str, same_as: str | None = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseModel] | None
+

Get subclasses.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (List[str]) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod help() None
+

Help describe alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict[str, int]
+

Return mapping of labels to their encoded values.

+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+abstract classmethod load_from_disk(dirpath: str) BaseModel
+

Load whole model from disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to load the model from

+
+
Returns
+

loaded model

+
+
Return type
+

BaseModel

+
+
+
+
+
+property num_labels: int
+

Return max label mapping.

+
+
+
+abstract predict(data: Union[DataFrame, Series, ndarray], batch_size: int, show_confidences: bool, verbose: bool) dict
+

Predict the data with the current model.

+
+
Parameters
+
    +
  • data (iterator of data to process) – model input data to predict on

  • +
  • batch_size (int) – number of samples in the batch of data

  • +
  • show_confidences (bool) – whether user wants prediction confidences

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

char level predictions and confidences

+
+
Return type
+

dict

+
+
+
+
+
+requires_zero_mapping: bool = False
+
+
+
+abstract reset_weights() None
+

Reset the weights of the model.

+
+
Returns
+

None

+
+
+
+
+
+property reverse_label_mapping: dict[int, str]
+

Return reversed order of current labels.

+

Useful for when needed to extract Labels via indices.

+
+
+
+abstract save_to_disk(dirpath: str) None
+

Save whole model to disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to save the model to

+
+
Returns
+

None

+
+
+
+
+
+set_label_mapping(label_mapping: list[str] | dict[str, int]) None
+

Set the labels for the model.

+
+
Parameters
+

label_mapping (Union[list, dict]) – label mapping of the model or list of labels to be +converted into the label mapping

+
+
Returns
+

None

+
+
+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.labelers.char_load_tf_model.html b/docs/0.12.0/html/dataprofiler.labelers.char_load_tf_model.html new file mode 100644 index 000000000..28b3f4732 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.labelers.char_load_tf_model.html @@ -0,0 +1,493 @@ Char Load Tf Model - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Char Load Tf Model

+

Contains class for training data labeler model.

+
+
+class dataprofiler.labelers.char_load_tf_model.CharLoadTFModel(model_path: str, label_mapping: dict[str, int], parameters: Optional[dict] = None)
+

Bases: BaseTrainableModel

+

For training data labeler model.

+

Initialize Loadable TF Model.

+
+
Parameters
+
    +
  • model_path (str) – path to model to load

  • +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
  • parameters (dict) –

    Contains all the appropriate parameters for the +model. Must contain num_labels. Other possible parameters are:

    +
    +

    max_length, max_char_encoding_id, dim_embed, size_fc +dropout, size_conv, num_fil, optimizer, default_label

    +
    +

  • +
+
+
Returns
+

None

+
+
+
+
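A minimal construction sketch based only on the signature above; the model path and label mapping are placeholders rather than shipped artifacts, and parameters is left at its documented default.

```python
from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel

# Placeholder path to a previously saved TensorFlow model directory.
model_path = "path/to/saved_tf_model"
label_mapping = {"PAD": 0, "UNKNOWN": 1, "ADDRESS": 2, "PHONE": 3}

model = CharLoadTFModel(model_path=model_path, label_mapping=label_mapping)
```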
+requires_zero_mapping: bool = False
+
+
+
+set_label_mapping(label_mapping: list[str] | dict[str, int]) None
+

Set the labels for the model.

+
+
Parameters
+

label_mapping (dict) – label mapping of the model

+
+
Returns
+

None

+
+
+
+
+
+save_to_disk(dirpath: str) None
+

Save whole model to disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to save the model to

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) CharLoadTFModel
+

Load whole model from disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to load the model from

+
+
Returns
+

loaded CharLoadTFModel

+
+
Return type
+

CharLoadTFModel

+
+
+
+
+
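Continuing the construction sketch above, a round trip through the documented save_to_disk/load_from_disk pair; the directory name is a placeholder.

```python
# Persist the model (weights included), then restore it from the same directory.
model.save_to_disk("char_load_tf_model_dir")
restored = CharLoadTFModel.load_from_disk("char_load_tf_model_dir")
```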
+reset_weights() None
+

Reset the weights of the model.

+
+
Returns
+

None

+
+
+
+
+
+fit(train_data: DataArray, val_data: DataArray = None, batch_size: int = None, epochs: int = None, label_mapping: dict[str, int] = None, reset_weights: bool = False, verbose: bool = True) tuple[dict, float | None, dict]
+

Train the current model with the training data and validation data.

+
+
Parameters
+
    +
  • train_data (Union[list, np.ndarray]) – Training data used to train model

  • +
  • val_data (Union[list, np.ndarray]) – Validation data used to validate the training

  • +
  • batch_size (int) – Used to determine number of samples in each batch

  • +
  • label_mapping (Union[dict, None]) – maps labels to their encoded integers

  • +
  • reset_weights (bool) – Flag to determine whether to reset the weights or +not

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

history, f1, f1_report

+
+
Return type
+

Tuple[dict, float, dict]

+
+
+
+
+
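Continuing the sketch above, a hedged training example using the documented fit signature; the two-column (sample, entity) layout of train_data and val_data is an assumption for illustration, since the docstring only states Union[list, np.ndarray].

```python
import numpy as np

# Hypothetical samples paired with entity labels; the exact layout expected by
# the surrounding preprocessing pipeline may differ.
train_data = np.array(
    [["123 Main St", "ADDRESS"], ["555-123-4567", "PHONE"]], dtype=object
)
val_data = np.array([["10 Downing St", "ADDRESS"]], dtype=object)

history, f1, f1_report = model.fit(
    train_data, val_data, batch_size=32, epochs=2, verbose=True
)
```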
+predict(data: Union[DataFrame, Series, ndarray], batch_size: int = 32, show_confidences: bool = False, verbose: bool = True) dict
+

Run model and get predictions.

+
+
Parameters
+
    +
  • data (Union[list, numpy.ndarray]) – text input

  • +
  • batch_size (int) – number of samples in the batch of data

  • +
  • show_confidences – whether user wants prediction confidences

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

char level predictions and confidences

+
+
Return type
+

dict

+
+
+
+
+
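Continuing the sketch above, a hedged prediction call with the documented defaults spelled out; the returned keys are inspected rather than assumed, because the docstring only promises a dict of char-level predictions and confidences.

```python
predictions = model.predict(
    ["John lives at 123 Main St"],  # documented as Union[list, numpy.ndarray]
    batch_size=32,
    show_confidences=True,
)
print(predictions.keys())  # inspect the returned keys rather than assuming them
```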
+details() None
+

Print the relevant details of the model.

+

Details include summary, parameters, label mapping.

+
+
+
+add_label(label: str, same_as: str | None = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseModel] | None
+

Get subclasses.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (List[str]) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod help() None
+

Help describe alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict[str, int]
+

Return mapping of labels to their encoded values.

+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+property num_labels: int
+

Return max label mapping.

+
+
+
+property reverse_label_mapping: dict[int, str]
+

Return reversed order of current labels.

+

Useful for when needed to extract Labels via indices.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.labelers.character_level_cnn_model.html b/docs/0.12.0/html/dataprofiler.labelers.character_level_cnn_model.html new file mode 100644 index 000000000..ab0bc8322 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.labelers.character_level_cnn_model.html @@ -0,0 +1,1301 @@ Character Level Cnn Model - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Character Level Cnn Model

+

Contains classes for char data labeling.

+
+
+dataprofiler.labelers.character_level_cnn_model.build_embd_dictionary(filename: str) dict[str, numpy.ndarray]
+

Return a numpy embedding dictionary from embed file with GloVe-like format.

+
+
Parameters
+

filename (str) – Path to the embed file for loading

+
+
+
+
+
+dataprofiler.labelers.character_level_cnn_model.create_glove_char(n_dims: int, source_file: Optional[str] = None) None
+

Embed GloVe chars embeddings from source file to n_dims principal components.

+

Embed in a new file.

+
+
Parameters
+
    +
  • n_dims (int) – Final number of principal component dims of the embeddings

  • +
  • source_file (str) – Location of original embeddings to factor down

  • +
+
+
+
+
+
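A hedged sketch of the two module-level helpers documented above; both file paths are placeholders, and the reduced-embedding output location is an assumption since the docstring only says the result is embedded in a new file.

```python
from dataprofiler.labelers.character_level_cnn_model import (
    build_embd_dictionary,
    create_glove_char,
)

# Reduce GloVe character embeddings to 64 principal components (placeholder source path).
create_glove_char(n_dims=64, source_file="embeddings/glove-char.txt")

# Load an embedding file in GloVe-like format into a {char: np.ndarray} dictionary.
embeddings = build_embd_dictionary("embeddings/glove-reduced-64D-char.txt")
```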
+class dataprofiler.labelers.character_level_cnn_model.ThreshArgMaxLayer(*args, **kwargs)
+

Bases: Layer

+

Keras layer applying a thresholded argmax.

+

Apply a minimum threshold to the argmax value.

+

When below this threshold the index will be the default.

+
+
Parameters
+
    +
  • num_labels (int) – number of entities

  • +
  • threshold (float) – default set to 0 so all confidences pass.

  • +
  • default_ind (int) – default index

  • +
+
+
Returns
+

final argmax threshold layer for the model

+
+
+

Returns

tensor containing argmax thresholded integers (labels out)

Return type

tf.Tensor

+
+
+get_config()
+

Return a serializable config for saving the layer.

+
+
+
+call(argmax_layer: Tensor, confidence_layer: Tensor) Tensor
+

Apply the threshold argmax to the input tensor.

+
+
+
+add_loss(loss)
+

Can be called inside of the call() method to add a scalar loss.

+

Example:

+

```python
class MyLayer(Layer):
    ...
    def call(self, x):
        self.add_loss(ops.sum(x))
        return x
```

+
+
+
+add_metric()
+
+
+
+add_variable(shape, initializer, dtype=None, trainable=True, autocast=True, regularizer=None, constraint=None, name=None)
+

Add a weight variable to the layer.

+

Alias of add_weight().

+
+
+
+add_weight(shape=None, initializer=None, dtype=None, trainable=True, autocast=True, regularizer=None, constraint=None, aggregation='mean', name=None)
+

Add a weight variable to the layer.

+
+
Parameters
+
    +
  • shape – Shape tuple for the variable. Must be fully-defined +(no None entries). Defaults to () (scalar) if unspecified.

  • +
  • initializer – Initializer object to use to populate the initial +variable value, or string name of a built-in initializer +(e.g. “random_normal”). If unspecified, defaults to +“glorot_uniform” for floating-point variables and to “zeros” +for all other types (e.g. int, bool).

  • +
  • dtype – Dtype of the variable to create, e.g. “float32”. If +unspecified, defaults to the layer’s variable dtype +(which itself defaults to “float32” if unspecified).

  • +
  • trainable – Boolean, whether the variable should be trainable via +backprop or whether its updates are managed manually. Defaults +to True.

  • +
  • autocast – Boolean, whether to autocast layers variables when +accessing them. Defaults to True.

  • +
  • regularizer – Regularizer object to call to apply penalty on the +weight. These penalties are summed into the loss function +during optimization. Defaults to None.

  • +
  • constraint – Constraint object to call on the variable after any optimizer update, or string name of a built-in constraint. Defaults to None.

  • +
  • aggregation – String, one of ‘mean’, ‘sum’, +‘only_first_replica’. Annotates the variable with the type +of multi-replica aggregation to be used for this variable +when writing custom data parallel training loops.

  • +
  • name – String name of the variable. Useful for debugging purposes.

  • +
+
+
+
+
+
+build(input_shape)
+
+
+
+build_from_config(config)
+

Builds the layer’s states with the supplied config dict.

+

By default, this method calls the build(config[“input_shape”]) method, +which creates weights based on the layer’s input shape in the supplied +config. If your config contains other information needed to load the +layer’s state, you should override this method.

+
+
Parameters
+

config – Dict containing the input shape associated with this layer.

+
+
+
+
+
+property compute_dtype
+

The dtype of the computations performed by the layer.

+
+
+
+compute_mask(inputs, previous_mask)
+
+
+
+compute_output_shape(*args, **kwargs)
+
+
+
+compute_output_spec(*args, **kwargs)
+
+
+
+count_params()
+

Count the total number of scalars composing the weights.

+
+
Returns
+

An integer count.

+
+
+
+
+
+property dtype
+

Alias of layer.variable_dtype.

+
+
+
+property dtype_policy
+
+
+
+classmethod from_config(config)
+

Creates a layer from its config.

+

This method is the reverse of get_config, +capable of instantiating the same layer from the config +dictionary. It does not handle layer connectivity +(handled by Network), nor weights (handled by set_weights).

+
+
Parameters
+

config – A Python dictionary, typically the +output of get_config.

+
+
Returns
+

A layer instance.

+
+
+
+
+
+get_build_config()
+

Returns a dictionary with the layer’s input shape.

+

This method returns a config dict that can be used by +build_from_config(config) to create all states (e.g. Variables and +Lookup tables) needed by the layer.

+

By default, the config only contains the input shape that the layer +was built with. If you’re writing a custom layer that creates state in +an unusual way, you should override this method to make sure this state +is already created when Keras attempts to load its value upon model +loading.

+
+
Returns
+

A dict containing the input shape associated with the layer.

+
+
+
+
+
+get_weights()
+

Return the values of layer.weights as a list of NumPy arrays.

+
+
+
+property input
+

Retrieves the input tensor(s) of a symbolic operation.

+

Only returns the tensor(s) corresponding to the first time +the operation was called.

+
+
Returns
+

Input tensor or list of input tensors.

+
+
+
+
+
+property input_dtype
+

The dtype layer inputs should be converted to.

+
+
+
+property input_spec
+
+
+
+load_own_variables(store)
+

Loads the state of the layer.

+

You can override this method to take full control of how the state of +the layer is loaded upon calling keras.models.load_model().

+
+
Parameters
+

store – Dict from which the state of the model will be loaded.

+
+
+
+
+
+property losses
+

List of scalar losses from add_loss, regularizers and sublayers.

+
+
+
+property metrics
+

List of all metrics.

+
+
+
+property metrics_variables
+

List of all metric variables.

+
+
+
+property non_trainable_variables
+

List of all non-trainable layer state.

+

This extends layer.non_trainable_weights to include all state used by +the layer including state for metrics and `SeedGenerator`s.

+
+
+
+property non_trainable_weights
+

List of all non-trainable weight variables of the layer.

+

These are the weights that should not be updated by the optimizer during training. Unlike layer.non_trainable_variables, this excludes metric state and random seeds.

+
+
+
+property output
+

Retrieves the output tensor(s) of a layer.

+

Only returns the tensor(s) corresponding to the first time +the operation was called.

+
+
Returns
+

Output tensor or list of output tensors.

+
+
+
+
+
+quantize(mode)
+
+
+
+quantized_call(*args, **kwargs)
+
+
+
+save_own_variables(store)
+

Saves the state of the layer.

+

You can override this method to take full control of how the state of +the layer is saved upon calling model.save().

+
+
Parameters
+

store – Dict where the state of the model will be saved.

+
+
+
+
+
+set_weights(weights)
+

Sets the values of layer.weights from a list of NumPy arrays.

+
+
+
+stateless_call(trainable_variables, non_trainable_variables, *args, return_losses=False, **kwargs)
+

Call the layer without any side effects.

+
+
Parameters
+
    +
  • trainable_variables – List of trainable variables of the model.

  • +
  • non_trainable_variables – List of non-trainable variables of the +model.

  • +
  • *args – Positional arguments to be passed to call().

  • +
  • return_losses – If True, stateless_call() will return the list of +losses created during call() as part of its return values.

  • +
  • **kwargs – Keyword arguments to be passed to call().

  • +
+
+
Returns
+

+
A tuple. By default, returns (outputs, non_trainable_variables).

If return_losses = True, then returns +(outputs, non_trainable_variables, losses).

+
+
+

+
+
+

Note: non_trainable_variables include not only non-trainable weights +such as BatchNormalization statistics, but also RNG seed state +(if there are any random operations part of the layer, such as dropout), +and Metric state (if there are any metrics attached to the layer). +These are all elements of state of the layer.

+

Example:

+

```python
model = ...
data = ...
trainable_variables = model.trainable_variables
non_trainable_variables = model.non_trainable_variables
# Call the model with zero side effects
outputs, non_trainable_variables = model.stateless_call(
    trainable_variables,
    non_trainable_variables,
    data,
)
# Attach the updated state to the model
# (until you do this, the model is still in its pre-call state).
for ref_var, value in zip(
    model.non_trainable_variables, non_trainable_variables
):
    ref_var.assign(value)
```

+
+
+
+property supports_masking
+

Whether this layer supports computing a mask using compute_mask.

+
+
+
+symbolic_call(*args, **kwargs)
+
+
+
+property trainable
+

Settable boolean, whether this layer should be trainable or not.

+
+
+
+property trainable_variables
+

List of all trainable layer state.

+

This is equivalent to layer.trainable_weights.

+
+
+
+property trainable_weights
+

List of all trainable weight variables of the layer.

+

These are the weights that get updated by the optimizer during training.

+
+
+
+property variable_dtype
+

The dtype of the state (weights) of the layer.

+
+
+
+property variables
+

List of all layer state, including random seeds.

+

This extends layer.weights to include all state used by the layer +including `SeedGenerator`s.

+

Note that metrics variables are not included here, use +metrics_variables to visit all the metric variables.

+
+
+
+property weights
+

List of all weight variables of the layer.

+

Unlike layer.variables, this excludes metric state and random seeds.

+
+
+
+
+class dataprofiler.labelers.character_level_cnn_model.EncodingLayer(*args, **kwargs)
+

Bases: Layer

+

Encodes strings to integers.

+

Encode characters for the list of sentences.

+
+
Parameters
+
    +
  • max_char_encoding_id (int) – Maximum integer value for encoding the +input

  • +
  • max_len (int) – Maximum char length in a sample

  • +
+
+
+
+
+get_config()
+

Return a serializable config for saving the layer.

+
+
+
+call(input_str_tensor: Tensor) Tensor
+

Encode characters for the list of sentences.

+
+
Parameters
+

input_str_tensor (tf.tensor) – input list of sentences converted to tensor

+
+
+

Returns

tensor containing encoded list of input sentences

Return type

tf.Tensor

+
+
+
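A hedged usage sketch; the constructor keywords mirror the parameter names listed above, and the encoding-id and length values are illustrative only.

```python
import tensorflow as tf
from dataprofiler.labelers.character_level_cnn_model import EncodingLayer

layer = EncodingLayer(max_char_encoding_id=127, max_len=32)
encoded = layer(tf.constant(["123 Main St"]))  # integer-encoded characters
```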
+add_loss(loss)
+

Can be called inside of the call() method to add a scalar loss.

+

Example:

+

```python
class MyLayer(Layer):
    ...
    def call(self, x):
        self.add_loss(ops.sum(x))
        return x
```

+
+
+
+add_metric()
+
+
+
+add_variable(shape, initializer, dtype=None, trainable=True, autocast=True, regularizer=None, constraint=None, name=None)
+

Add a weight variable to the layer.

+

Alias of add_weight().

+
+
+
+add_weight(shape=None, initializer=None, dtype=None, trainable=True, autocast=True, regularizer=None, constraint=None, aggregation='mean', name=None)
+

Add a weight variable to the layer.

+
+
Parameters
+
    +
  • shape – Shape tuple for the variable. Must be fully-defined +(no None entries). Defaults to () (scalar) if unspecified.

  • +
  • initializer – Initializer object to use to populate the initial +variable value, or string name of a built-in initializer +(e.g. “random_normal”). If unspecified, defaults to +“glorot_uniform” for floating-point variables and to “zeros” +for all other types (e.g. int, bool).

  • +
  • dtype – Dtype of the variable to create, e.g. “float32”. If +unspecified, defaults to the layer’s variable dtype +(which itself defaults to “float32” if unspecified).

  • +
  • trainable – Boolean, whether the variable should be trainable via +backprop or whether its updates are managed manually. Defaults +to True.

  • +
  • autocast – Boolean, whether to autocast layers variables when +accessing them. Defaults to True.

  • +
  • regularizer – Regularizer object to call to apply penalty on the +weight. These penalties are summed into the loss function +during optimization. Defaults to None.

  • +
  • constraint – Constraint object to call on the variable after any optimizer update, or string name of a built-in constraint. Defaults to None.

  • +
  • aggregation – String, one of ‘mean’, ‘sum’, +‘only_first_replica’. Annotates the variable with the type +of multi-replica aggregation to be used for this variable +when writing custom data parallel training loops.

  • +
  • name – String name of the variable. Useful for debugging purposes.

  • +
+
+
+
+
+
+build(input_shape)
+
+
+
+build_from_config(config)
+

Builds the layer’s states with the supplied config dict.

+

By default, this method calls the build(config[“input_shape”]) method, +which creates weights based on the layer’s input shape in the supplied +config. If your config contains other information needed to load the +layer’s state, you should override this method.

+
+
Parameters
+

config – Dict containing the input shape associated with this layer.

+
+
+
+
+
+property compute_dtype
+

The dtype of the computations performed by the layer.

+
+
+
+compute_mask(inputs, previous_mask)
+
+
+
+compute_output_shape(*args, **kwargs)
+
+
+
+compute_output_spec(*args, **kwargs)
+
+
+
+count_params()
+

Count the total number of scalars composing the weights.

+
+
Returns
+

An integer count.

+
+
+
+
+
+property dtype
+

Alias of layer.variable_dtype.

+
+
+
+property dtype_policy
+
+
+
+classmethod from_config(config)
+

Creates a layer from its config.

+

This method is the reverse of get_config, +capable of instantiating the same layer from the config +dictionary. It does not handle layer connectivity +(handled by Network), nor weights (handled by set_weights).

+
+
Parameters
+

config – A Python dictionary, typically the +output of get_config.

+
+
Returns
+

A layer instance.

+
+
+
+
+
+get_build_config()
+

Returns a dictionary with the layer’s input shape.

+

This method returns a config dict that can be used by +build_from_config(config) to create all states (e.g. Variables and +Lookup tables) needed by the layer.

+

By default, the config only contains the input shape that the layer +was built with. If you’re writing a custom layer that creates state in +an unusual way, you should override this method to make sure this state +is already created when Keras attempts to load its value upon model +loading.

+
+
Returns
+

A dict containing the input shape associated with the layer.

+
+
+
+
+
+get_weights()
+

Return the values of layer.weights as a list of NumPy arrays.

+
+
+
+property input
+

Retrieves the input tensor(s) of a symbolic operation.

+

Only returns the tensor(s) corresponding to the first time +the operation was called.

+
+
Returns
+

Input tensor or list of input tensors.

+
+
+
+
+
+property input_dtype
+

The dtype layer inputs should be converted to.

+
+
+
+property input_spec
+
+
+
+load_own_variables(store)
+

Loads the state of the layer.

+

You can override this method to take full control of how the state of +the layer is loaded upon calling keras.models.load_model().

+
+
Parameters
+

store – Dict from which the state of the model will be loaded.

+
+
+
+
+
+property losses
+

List of scalar losses from add_loss, regularizers and sublayers.

+
+
+
+property metrics
+

List of all metrics.

+
+
+
+property metrics_variables
+

List of all metric variables.

+
+
+
+property non_trainable_variables
+

List of all non-trainable layer state.

+

This extends layer.non_trainable_weights to include all state used by +the layer including state for metrics and `SeedGenerator`s.

+
+
+
+property non_trainable_weights
+

List of all non-trainable weight variables of the layer.

+

These are the weights that should not be updated by the optimizer during training. Unlike layer.non_trainable_variables, this excludes metric state and random seeds.

+
+
+
+property output
+

Retrieves the output tensor(s) of a layer.

+

Only returns the tensor(s) corresponding to the first time +the operation was called.

+
+
Returns
+

Output tensor or list of output tensors.

+
+
+
+
+
+quantize(mode)
+
+
+
+quantized_call(*args, **kwargs)
+
+
+
+save_own_variables(store)
+

Saves the state of the layer.

+

You can override this method to take full control of how the state of +the layer is saved upon calling model.save().

+
+
Parameters
+

store – Dict where the state of the model will be saved.

+
+
+
+
+
+set_weights(weights)
+

Sets the values of layer.weights from a list of NumPy arrays.

+
+
+
+stateless_call(trainable_variables, non_trainable_variables, *args, return_losses=False, **kwargs)
+

Call the layer without any side effects.

+
+
Parameters
+
    +
  • trainable_variables – List of trainable variables of the model.

  • +
  • non_trainable_variables – List of non-trainable variables of the +model.

  • +
  • *args – Positional arguments to be passed to call().

  • +
  • return_losses – If True, stateless_call() will return the list of +losses created during call() as part of its return values.

  • +
  • **kwargs – Keyword arguments to be passed to call().

  • +
+
+
Returns
+

+
A tuple. By default, returns (outputs, non_trainable_variables).

If return_losses = True, then returns +(outputs, non_trainable_variables, losses).

+
+
+

+
+
+

Note: non_trainable_variables include not only non-trainable weights +such as BatchNormalization statistics, but also RNG seed state +(if there are any random operations part of the layer, such as dropout), +and Metric state (if there are any metrics attached to the layer). +These are all elements of state of the layer.

+

Example:

+

```python
model = ...
data = ...
trainable_variables = model.trainable_variables
non_trainable_variables = model.non_trainable_variables
# Call the model with zero side effects
outputs, non_trainable_variables = model.stateless_call(
    trainable_variables,
    non_trainable_variables,
    data,
)
# Attach the updated state to the model
# (until you do this, the model is still in its pre-call state).
for ref_var, value in zip(
    model.non_trainable_variables, non_trainable_variables
):
    ref_var.assign(value)
```

+
+
+
+property supports_masking
+

Whether this layer supports computing a mask using compute_mask.

+
+
+
+symbolic_call(*args, **kwargs)
+
+
+
+property trainable
+

Settable boolean, whether this layer should be trainable or not.

+
+
+
+property trainable_variables
+

List of all trainable layer state.

+

This is equivalent to layer.trainable_weights.

+
+
+
+property trainable_weights
+

List of all trainable weight variables of the layer.

+

These are the weights that get updated by the optimizer during training.

+
+
+
+property variable_dtype
+

The dtype of the state (weights) of the layer.

+
+
+
+property variables
+

List of all layer state, including random seeds.

+

This extends layer.weights to include all state used by the layer +including `SeedGenerator`s.

+

Note that metrics variables are not included here, use +metrics_variables to visit all the metric variables.

+
+
+
+property weights
+

List of all weight variables of the layer.

+

Unlike layer.variables, this excludes metric state and random seeds.

+
+
+
+
+class dataprofiler.labelers.character_level_cnn_model.CharacterLevelCnnModel(label_mapping: dict[str, int], parameters: Optional[dict] = None)
+

Bases: BaseTrainableModel

+

Class for training char data labeler.

+

Initialize CNN Model.

+

Initialize epoch_id.

+
+
Parameters
+
    +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
  • parameters (dict) –

    Contains all the appropriate parameters for the +model. Must contain num_labels. Other possible parameters are:

    +
    +

    max_length, max_char_encoding_id, dim_embed, size_fc +dropout, size_conv, num_fil, optimizer, default_label

    +
    +

  • +
+
+
Returns
+

None

+
+
+
+
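A hedged sketch exercising the constructor above together with several of the inherited BaseModel helpers documented below; it assumes the remaining parameters fall back to their defaults when parameters is None, and the printed values are expectations rather than guaranteed output.

```python
from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel

label_mapping = {"PAD": 0, "UNKNOWN": 1, "ADDRESS": 2}
model = CharacterLevelCnnModel(label_mapping=label_mapping)

model.add_label("CITY")                      # appended with the next encoding index
model.set_params(max_length=3400)            # parameter name taken from the list above
print(model.get_parameters(["max_length"]))  # expected: {'max_length': 3400}
print(model.labels)                          # expected: ['PAD', 'UNKNOWN', 'ADDRESS', 'CITY']
```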
+requires_zero_mapping: bool = True
+
+
+
+set_label_mapping(label_mapping: list[str] | dict[str, int]) None
+

Set the labels for the model.

+
+
Parameters
+

label_mapping (dict) – label mapping of the model

+
+
Returns
+

None

+
+
+
+
+
+save_to_disk(dirpath: str) None
+

Save whole model to disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to save the model to

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) CharacterLevelCnnModel
+

Load whole model from disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to load the model from

+
+
Returns
+

None

+
+
+
+
+
+reset_weights() None
+

Reset the weights of the model.

+
+
Returns
+

None

+
+
+
+
+
+fit(train_data: DataArray, val_data: DataArray | None = None, batch_size: int = None, epochs: int = None, label_mapping: dict[str, int] = None, reset_weights: bool = False, verbose: bool = True) tuple[dict, float | None, dict]
+

Train the current model with the training data and validation data.

+
+
Parameters
+
    +
  • train_data (Union[list, np.ndarray]) – Training data used to train model

  • +
  • val_data (Union[list, np.ndarray]) – Validation data used to validate the training

  • +
  • batch_size (int) – Used to determine number of samples in each batch

  • +
  • label_mapping (Union[dict, None]) – maps labels to their encoded integers

  • +
  • reset_weights (bool) – Flag to determine whether to reset the weights or +not

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

history, f1, f1_report

+
+
Return type
+

Tuple[dict, float, dict]

+
+
+
+
+
+predict(data: Union[DataFrame, Series, ndarray], batch_size: int = 32, show_confidences: bool = False, verbose: bool = True) dict
+

Run model and get predictions.

+
+
Parameters
+
    +
  • data (Union[list, numpy.ndarray]) – text input

  • +
  • batch_size (int) – number of samples in the batch of data

  • +
  • show_confidences – whether user wants prediction confidences

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

char level predictions and confidences

+
+
Return type
+

dict

+
+
+
+
+
+details() None
+

Print the relevant details of the model.

+

Details include summary, parameters, and label mapping.

+
+
+
+add_label(label: str, same_as: str | None = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseModel] | None
+

Get subclasses.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (List[str]) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod help() None
+

Help describe alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict[str, int]
+

Return mapping of labels to their encoded values.

+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+property num_labels: int
+

Return max label mapping.

+
+
+
+property reverse_label_mapping: dict[int, str]
+

Return reversed order of current labels.

+

Useful for when needed to extract Labels via indices.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.labelers.classification_report_utils.html b/docs/0.12.0/html/dataprofiler.labelers.classification_report_utils.html new file mode 100644 index 000000000..963ac4e13 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.labelers.classification_report_utils.html @@ -0,0 +1,453 @@ Classification Report Utils - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Classification Report Utils

+

Contains functions for classification.

+
+
+dataprofiler.labelers.classification_report_utils.convert_confusion_matrix_to_MCM(conf_matrix: list | np.ndarray) np.ndarray
+

Convert a confusion matrix into the MCM format.

+

Format for precision/recall/fscore/ +support computation by sklearn.

+

The format is as specified by sklearn below: +In multilabel confusion matrix \(MCM\), the count of true negatives +is \(MCM_{:,0,0}\), false negatives is \(MCM_{:,1,0}\), +true positives is \(MCM_{:,1,1}\) and false positives is +\(MCM_{:,0,1}\). +Note: this utilizes code/ideology from sklearn.

+
+
Parameters
+

conf_matrix (Union[list, np.ndarray]) – confusion matrix, which is a square matrix describing +false positives and false negatives, true positives and true negatives +for classification

+
+
Returns
+

MCM format for readability by sklearn confusion reports.

+
+
Return type
+

np.ndarray

+
+
+
+
+
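A hedged sketch tying the conversion above to the precision_recall_fscore_support variant documented next; the confusion-matrix counts are invented, and the rows-are-true/columns-are-predicted orientation is an assumption.

```python
import numpy as np
from dataprofiler.labelers.classification_report_utils import (
    convert_confusion_matrix_to_MCM,
    precision_recall_fscore_support,
)

conf_matrix = np.array([[50, 2, 0],
                        [3, 45, 1],
                        [0, 4, 40]])

mcm = convert_confusion_matrix_to_MCM(conf_matrix)  # shape (n_labels, 2, 2)
precision, recall, fscore, support = precision_recall_fscore_support(mcm, average=None)
```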
+dataprofiler.labelers.classification_report_utils.precision_recall_fscore_support(MCM: np.ndarray, beta: float = 1.0, labels: np.ndarray | None = None, pos_label: str | int = 1, average: str | None = None, warn_for: tuple[str, ...] | set[str] = ('precision', 'recall', 'f-score'), sample_weight: np.ndarray | None = None) tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray | None]
+

Perform the same functionality as the precision_recall_fscore_support function.

+

Copy of the precision_recall_fscore_support function from sklearn.metrics +with the update to receiving the MCM instead of calculating each time it is +called.

+
+
Parameters
+
    +
  • MCM (array, shape (n_outputs, 2, 2)) – Multi-classification confusion matrix as referenced by the sklearn +metrics module. A 2x2 confusion matrix corresponding to each output in +the input. In multilabel confusion matrix \(MCM\), the count of +true negatives is \(MCM_{:,0,0}\), false negatives is +\(MCM_{:,1,0}\), true positives is \(MCM_{:,1,1}\) and false +positives is \(MCM_{:,0,1}\).

  • +
  • beta (float, 1.0 by default) – The strength of recall versus precision in the F-score.

  • +
  • labels (list, optional) – The set of labels to include when average != 'binary', and their +order if average is None. Labels present in the data can be +excluded, for example to calculate a multiclass average ignoring a +majority negative class, while labels not present in the data will +result in 0 components in a macro average. For multilabel targets, +labels are column indices. By default, all labels in y_true and +y_pred are used in sorted order.

  • +
  • pos_label (str or int, 1 by default) – The class to report if average='binary' and the data is binary. +If the data are multiclass or multilabel, this will be ignored; +setting labels=[pos_label] and average != 'binary' will report +scores for that label only.

  • +
  • average (string, [None (default), 'binary', 'micro', 'macro', 'weighted']) –

    If None, the scores for each class are returned. Otherwise, this +determines the type of averaging performed on the data:

    +
    +
    'binary':

    Only report results for the class specified by pos_label. +This is applicable only if targets (y_{true,pred}) are binary.

    +
    +
    'micro':

    Calculate metrics globally by counting the total true positives, +false negatives and false positives.

    +
    +
    'macro':

    Calculate metrics for each label, and find their unweighted +mean. This does not take label imbalance into account.

    +
    +
    'weighted':

    Calculate metrics for each label, and find their average weighted +by support (the number of true instances for each label). This +alters ‘macro’ to account for label imbalance; it can result in an +F-score that is not between precision and recall.

    +
    +
    +

  • +
  • warn_for (tuple or set, for internal use) – This determines which warnings will be made in the case that this +function is being used to return only one of its metrics.

  • +
  • sample_weight (array-like of shape = [n_samples], optional) – Sample weights.

  • +
+
+
Returns
+

    +
  • precision (float (if average is not None) or array of float, shape = [n_unique_labels])

  • +
  • recall (float (if average is not None) or array of float, , shape = [n_unique_labels])

  • +
  • fbeta_score (float (if average is not None) or array of float, shape = [n_unique_labels])

  • +
  • support (int (if average is not None) or array of int, shape = [n_unique_labels]) – The number of occurrences of each label in y_true.

  • +
+

+
+
+

References

+
+
1
+

Wikipedia entry for the Precision and recall

+
+
2
+

Wikipedia entry for the F1-score

+
+
3
+

Discriminative Methods for Multi-labeled Classification Advances +in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu +Godbole, Sunita Sarawagi

+
+
+

Notes

+

When true positive + false positive == 0, precision is undefined; +When true positive + false negative == 0, recall is undefined. +In such cases, the metric will be set to 0, as will f-score, and +UndefinedMetricWarning will be raised.

+
+
+
+dataprofiler.labelers.classification_report_utils.classification_report(conf_matrix: np.ndarray, labels: list | np.ndarray | None = None, target_names: list[str] | None = None, sample_weight: np.ndarray | None = None, digits: int = 2, output_dict: bool = False) str | dict
+

Build a text report showing the main classification metrics.

+

Copy of the classification_report function from sklearn.metrics +with the update to receiving the conf_matrix instead of calculating each +time it is called.

+

Read more in the User Guide.

+
+
Parameters
+
    +
  • conf_matrix (array, shape = [n_labels, n_labels]) – confusion matrix, which is a square matrix describing +false positives and false negatives, true positives and true negatives +for classification.

  • +
  • labels (array, shape = [n_labels]) – Optional list of label indices to include in the report.

  • +
  • target_names (list of strings) – Optional display names matching the labels (same order).

  • +
  • sample_weight (array-like of shape = [n_samples], optional) – Sample weights.

  • +
  • digits (int) – Number of digits for formatting output floating point values. +When output_dict is True, this will be ignored and the +returned values will not be rounded.

  • +
  • output_dict (bool (default = False)) – If True, return output as dict

  • +
+
+
Returns
+

report – Text summary of the precision, recall, F1 score for each class. +Dictionary returned if output_dict is True. Dictionary has the +following structure:

+
{'label 1': {'precision':0.5,
             'recall':1.0,
             'f1-score':0.67,
             'support':1},
 'label 2': { ... },
  ...
}
+
+
+

The reported averages include macro average (averaging the unweighted mean per label), weighted average (averaging the support-weighted mean per label), sample average (only for multilabel classification) and micro average (averaging the total true positives, false negatives and false positives); the micro average is only shown for multi-label or multi-class with a subset of classes, because it is equivalent to accuracy otherwise. See also precision_recall_fscore_support for more details on averages.

+

Note that in binary classification, recall of the positive class +is also known as “sensitivity”; recall of the negative class is +“specificity”.

+

+
+
Return type
+

string / dict

+
+
+
+

See also

+

precision_recall_fscore_support, confusion_matrix, multilabel_confusion_matrix

+
+
+
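A hedged sketch reusing the confusion matrix from the sketch above; the label indices and display names are placeholders.

```python
from dataprofiler.labelers.classification_report_utils import classification_report

report = classification_report(
    conf_matrix,
    labels=[0, 1, 2],
    target_names=["ADDRESS", "PHONE", "OTHER"],
    output_dict=True,
)
print(report["ADDRESS"]["f1-score"])
```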
+ +
+ +
+ +
+
\ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.labelers.column_name_model.html b/docs/0.12.0/html/dataprofiler.labelers.column_name_model.html new file mode 100644 index 000000000..55ba3927f --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.labelers.column_name_model.html @@ -0,0 +1,453 @@ Column Name Model - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Column Name Model

+

Contains class for column name data labeling model.

+
+
+class dataprofiler.labelers.column_name_model.ColumnNameModel(label_mapping: dict[str, int], parameters: Optional[dict] = None)
+

Bases: BaseModel

+

Class for column name data labeling model.

+

Initialize function for ColumnNameModel.

+
+
Parameters
+

parameters (dict) –

Contains all the appropriate parameters for the model. +Possible parameters are:

+
+

max_length, max_num_chars, dim_embed

+
+

+
+
Returns
+

None

+
+
+
+
+reset_weights() None
+

Reset weights function.

+
+
+
+predict(data: Union[DataFrame, Series, ndarray], batch_size: Optional[int] = None, show_confidences: bool = False, verbose: bool = True) dict
+

Apply the process.cdist for similarity score on input list of strings.

+
+
Parameters
+
    +
  • data (iterator) – list of strings to predict upon

  • +
  • batch_size (N/A) – does not impact this model and should be fixed to not +be required.

  • +
  • show_confidences – Parameter disabled. Confidence values returned +by default.

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

char level predictions and confidences

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) ColumnNameModel
+

Load whole model from disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to load the model from

+
+
Returns
+

None

+
+
+
+
+
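A hedged sketch of loading and applying the model with the signatures documented above; the directory and column names are placeholders.

```python
from dataprofiler.labelers.column_name_model import ColumnNameModel

model = ColumnNameModel.load_from_disk("saved_column_name_model")  # placeholder path
results = model.predict(["ssn", "credit_card", "first_name"])
```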
+save_to_disk(dirpath: str) None
+

Save whole model to disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to save the model to

+
+
Returns
+

None

+
+
+
+
+
+add_label(label: str, same_as: str | None = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseModel] | None
+

Get subclasses.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (List[str]) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod help() None
+

Help describe alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict[str, int]
+

Return mapping of labels to their encoded values.

+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+property num_labels: int
+

Return max label mapping.

+
+
+
+requires_zero_mapping: bool = False
+
+
+
+property reverse_label_mapping: dict[int, str]
+

Return reversed order of current labels.

+

Useful for when needed to extract Labels via indices.

+
+
+
+set_label_mapping(label_mapping: list[str] | dict[str, int]) None
+

Set the labels for the model.

+
+
Parameters
+

label_mapping (Union[list, dict]) – label mapping of the model or list of labels to be +converted into the label mapping

+
+
Returns
+

None

+
+
+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.labelers.data_labelers.html b/docs/0.12.0/html/dataprofiler.labelers.data_labelers.html new file mode 100644 index 000000000..55ae6f0d3 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.labelers.data_labelers.html @@ -0,0 +1,982 @@ Data Labelers - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data Labelers

+

Module to train and choose between structured and unstructured data labelers.

+
+
+dataprofiler.labelers.data_labelers.train_structured_labeler(data: None | pd.DataFrame, default_label: int = None, save_dirpath: str = None, epochs: int = 2) TrainableDataLabeler
+

Use provided data to create and save a structured data labeler.

+
+
Parameters
+
    +
  • data (Union[None, pd.DataFrame]) – data to be trained upon

  • +
  • save_dirpath (Union[None, str]) – path to save data labeler

  • +
  • epochs (int) – number of epochs to loop training the data

  • +
+
+
Returns
+

structured data labeler

+
+
Return type
+

TrainableDataLabeler

+
+
+
+
+
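A hedged sketch of the helper above; the DataFrame is invented, and the assumption that its column names act as the labels to learn follows from the structured-labeler workflow rather than from the signature itself.

```python
import pandas as pd
from dataprofiler.labelers.data_labelers import train_structured_labeler

data = pd.DataFrame({
    "ssn": ["123-45-6789", "987-65-4321"],
    "address": ["123 Main St", "10 Downing St"],
})

labeler = train_structured_labeler(
    data=data, save_dirpath="my_structured_labeler", epochs=2
)
```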
+class dataprofiler.labelers.data_labelers.UnstructuredDataLabeler(dirpath: Optional[str] = None, load_options: Optional[dict] = None)
+

Bases: BaseDataLabeler

+

BaseDataLabeler subclass specified as unstructured with internal variable.

+

Initialize DataLabeler class.

+
+
Parameters
+
    +
  • dirpath – path to data labeler

  • +
  • load_options – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
+
+
+add_label(label: str, same_as: Optional[str] = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+check_pipeline(skip_postprocessor: bool = False, error_on_mismatch: bool = False) None
+

Check whether the processors and models connect together without error.

+
+
Parameters
+
    +
  • skip_postprocessor (bool) – skip checking postprocessor is valid in +pipeline

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

None

+
+
+
+
+
+help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict
+

Retrieve the label encodings.

+
+
Returns
+

dictionary for associating labels to indexes

+
+
+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+classmethod load_from_disk(dirpath: str, load_options: Optional[dict] = None) BaseDataLabeler
+

Load the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
Returns
+

DataLabeler class

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+classmethod load_from_library(name: str) BaseDataLabeler
+

Load the data labeler from the data labeler zoo in the library.

+
+
Parameters
+

name (str) – name of the data labeler.

+
+
Returns
+

DataLabeler class

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+classmethod load_with_components(preprocessor: BaseDataPreprocessor, model: BaseModel, postprocessor: BaseDataPostprocessor) BaseDataLabeler
+

Load the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

loaded BaseDataLabeler

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+property model: BaseModel
+

Retrieve the data labeler model.

+
+
Returns
+

returns the model instance

+
+
+
+
+
+property postprocessor: data_processing.BaseDataPostprocessor | None
+

Retrieve the data postprocessor.

+
+
Returns
+

returns the postprocessor instance

+
+
+
+
+
+predict(data: Union[DataFrame, Series, ndarray], batch_size: int = 32, predict_options: Optional[dict[str, bool]] = None, error_on_mismatch: bool = False, verbose: bool = True) dict
+

Predict labels of input data based with the data labeler model.

+
+
Parameters
+
    +
  • data (Union[pd.DataFrame, pd.Series, np.ndarray]) – data to be predicted upon

  • +
  • batch_size (int) – batch size of prediction

  • +
  • predict_options (Dict[str, bool]) – optional parameters to allow for predict as a +dict, i.e. dict(show_confidences=True)

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

predictions

+
+
Return type
+

Dict

+
+
+
+
+
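A hedged end-to-end sketch; it assumes the default pre-trained labeler is loaded when dirpath is None, and the sample text is invented.

```python
import pandas as pd
from dataprofiler.labelers.data_labelers import UnstructuredDataLabeler

labeler = UnstructuredDataLabeler()  # assumes the default labeler loads when dirpath is None
predictions = labeler.predict(
    pd.Series(["John Doe lives at 123 Main St"]),
    predict_options=dict(show_confidences=True),
)
```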
+property preprocessor: data_processing.BaseDataPreprocessor | None
+

Retrieve the data preprocessor.

+
+
Returns
+

returns the preprocessor instance

+
+
+
+
+
+property reverse_label_mapping: dict
+

Retrieve the index to label encoding.

+
+
Returns
+

dictionary for associating indexes to labels

+
+
+
+
+
+save_to_disk(dirpath: str) None
+

Save the data labeler to the specified location.

+
+
Parameters
+

dirpath (str) – location to save the data labeler.

+
+
Returns
+

None

+
+
+
+
+
+set_labels(labels: list | dict) None
+

Set the labels for the data labeler.

+
+
Parameters
+

labels (list or dict) – new labels in either encoding list or dict

+
+
Returns
+

None

+
+
+
+
+
+set_model(model: BaseModel) None
+

Set the model for the data labeler.

+
+
Parameters
+

model (base_model.BaseModel) – model to use within the data labeler

+
+
Returns
+

None

+
+
+
+
+
+set_params(params: dict) None
+

Allow user to set parameters of pipeline components.

+
+
Done in the following format:

params = dict(
    preprocessor=dict(…),
    model=dict(…),
    postprocessor=dict(…)
)

+
+
+

where the key,values pairs for each pipeline component must match +parameters that exist in their components.

+
+
Parameters
+

params (dict) –

dictionary containing a key for a given pipeline +component and its associated value of parameters as such:

+
+

dict(preprocessor=dict(…), model=dict(…), +postprocessor=dict(…))

+
+

+
+
Returns
+

None

+
+
+
+
+
+set_postprocessor(data_processor: BaseDataPostprocessor) None
+

Set the data postprocessor for the data labeler.

+
+
Parameters
+

data_processor (data_processing.BaseDataPostprocessor) – processor to set as the postprocessor

+
+
Returns
+

None

+
+
+
+
+
+set_preprocessor(data_processor: BaseDataPreprocessor) None
+

Set the data preprocessor for the data labeler.

+
+
Parameters
+

data_processor (data_processing.BaseDataPreprocessor) – processor to set as the preprocessor

+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.labelers.data_labelers.StructuredDataLabeler(dirpath: Optional[str] = None, load_options: Optional[dict] = None)
+

Bases: BaseDataLabeler

+

BaseDataLabeler subclass specified as structured with internal variable.

+

Initialize DataLabeler class.

+
+
Parameters
+
    +
  • dirpath – path to data labeler

  • +
  • load_options – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
+
+
+add_label(label: str, same_as: Optional[str] = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+check_pipeline(skip_postprocessor: bool = False, error_on_mismatch: bool = False) None
+

Check whether the processors and models connect together without error.

+
+
Parameters
+
    +
  • skip_postprocessor (bool) – skip checking postprocessor is valid in +pipeline

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

None

+
+
+
+
+
+help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict
+

Retrieve the label encodings.

+
+
Returns
+

dictionary for associating labels to indexes

+
+
+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+classmethod load_from_disk(dirpath: str, load_options: Optional[dict] = None) BaseDataLabeler
+

Load the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
Returns
+

DataLabeler class

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+classmethod load_from_library(name: str) BaseDataLabeler
+

Load the data labeler from the data labeler zoo in the library.

+
+
Parameters
+

name (str) – name of the data labeler.

+
+
Returns
+

DataLabeler class

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+classmethod load_with_components(preprocessor: BaseDataPreprocessor, model: BaseModel, postprocessor: BaseDataPostprocessor) BaseDataLabeler
+

Load the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

loaded BaseDataLabeler

+
+
Return type
+

BaseDataLabeler

+
+
+
+
+
+property model: BaseModel
+

Retrieve the data labeler model.

+
+
Returns
+

returns the model instance

+
+
+
+
+
+property postprocessor: data_processing.BaseDataPostprocessor | None
+

Retrieve the data postprocessor.

+
+
Returns
+

returns the postprocessor instance

+
+
+
+
+
+predict(data: Union[DataFrame, Series, ndarray], batch_size: int = 32, predict_options: Optional[dict[str, bool]] = None, error_on_mismatch: bool = False, verbose: bool = True) dict
+

Predict labels of input data based with the data labeler model.

+
+
Parameters
+
    +
  • data (Union[pd.DataFrame, pd.Series, np.ndarray]) – data to be predicted upon

  • +
  • batch_size (int) – batch size of prediction

  • +
  • predict_options (Dict[str, bool]) – optional parameters to allow for predict as a +dict, i.e. dict(show_confidences=True)

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

predictions

+
+
Return type
+

Dict

+
+
+
+
+
+property preprocessor: data_processing.BaseDataPreprocessor | None
+

Retrieve the data preprocessor.

+
+
Returns
+

returns the preprocessor instance

+
+
+
+
+
+property reverse_label_mapping: dict
+

Retrieve the index to label encoding.

+
+
Returns
+

dictionary for associating indexes to labels

+
+
+
+
+
+save_to_disk(dirpath: str) None
+

Save the data labeler to the specified location.

+
+
Parameters
+

dirpath (str) – location to save the data labeler.

+
+
Returns
+

None

+
+
+
+
+
+set_labels(labels: list | dict) None
+

Set the labels for the data labeler.

+
+
Parameters
+

labels (list or dict) – new labels in either encoding list or dict

+
+
Returns
+

None

+
+
+
+
+
+set_model(model: BaseModel) None
+

Set the model for the data labeler.

+
+
Parameters
+

model (base_model.BaseModel) – model to use within the data labeler

+
+
Returns
+

None

+
+
+
+
+
+set_params(params: dict) None
+

Allow user to set parameters of pipeline components.

+
+
Done in the following format:

params = dict(
    preprocessor=dict(…),
    model=dict(…),
    postprocessor=dict(…)
)

+
+
+

where the key,values pairs for each pipeline component must match +parameters that exist in their components.

+
+
Parameters
+

params (dict) –

dictionary containing a key for a given pipeline +component and its associated value of parameters as such:

+
+

dict(preprocessor=dict(…), model=dict(…), +postprocessor=dict(…))

+
+

+
+
Returns
+

None

+
+
+
+
+
+set_postprocessor(data_processor: BaseDataPostprocessor) None
+

Set the data postprocessor for the data labeler.

+
+
Parameters
+

data_processor (data_processing.BaseDataPostprocessor) – processor to set as the postprocessor

+
+
Returns
+

None

+
+
+
+
+
+set_preprocessor(data_processor: BaseDataPreprocessor) None
+

Set the data preprocessor for the data labeler.

+
+
Parameters
+

data_processor (data_processing.BaseDataPreprocessor) – processor to set as the preprocessor

+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.labelers.data_labelers.DataLabeler(labeler_type: str, dirpath: Optional[str] = None, load_options: Optional[dict] = None, trainable: bool = False)
+

Bases: object

+

Wrapper class for choosing between structured and unstructured labeler.

+

Create structured and unstructured data labeler objects.

+
+
Parameters
+
    +
  • dirpath (str) – Path to load data labeler

  • +
  • load_options (Dict) – Optional arguments to include for load.

  • +
  • trainable (bool) – variable to dictate whether you want a trainable data +labeler

  • +
+
+
Returns
+

+
+
+
+
+labeler_classes = {'structured': <class 'dataprofiler.labelers.data_labelers.StructuredDataLabeler'>, 'unstructured': <class 'dataprofiler.labelers.data_labelers.UnstructuredDataLabeler'>}
+
+
+
+classmethod load_from_library(name: str, trainable: bool = False) BaseDataLabeler
+

Load the data labeler from the data labeler zoo in the library.

+
+
Parameters
+
    +
  • name (str) – name of the data labeler.

  • +
  • trainable (bool) – variable to dictate whether you want a trainable data +labeler

  • +
+
+
Returns
+

DataLabeler class

+
+
+
+
+
+classmethod load_from_disk(dirpath: str, load_options: Optional[dict] = None, trainable: bool = False) BaseDataLabeler
+

Load the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
  • trainable (bool) – variable to dictate whether you want a trainable data +labeler

  • +
+
+
Returns
+

DataLabeler class

+
+
+
+
+
+classmethod load_with_components(preprocessor: BaseDataPreprocessor, model: BaseModel, postprocessor: BaseDataPostprocessor, trainable: bool = False) BaseDataLabeler
+

Load the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

+
+
+
+
+
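A minimal sketch of the wrapper above: labeler_type chooses between the entries in labeler_classes, and the class also exposes the library zoo directly.

from dataprofiler.labelers.data_labelers import DataLabeler

structured = DataLabeler(labeler_type="structured")
unstructured = DataLabeler(labeler_type="unstructured", trainable=False)

# Load a packaged labeler by name through the wrapper.
regex_labeler = DataLabeler.load_from_library("regex_model")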
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.labelers.data_processing.html b/docs/0.12.0/html/dataprofiler.labelers.data_processing.html
new file mode 100644
index 000000000..77f43b15e
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.labelers.data_processing.html
@@ -0,0 +1,1414 @@
+Data Processing - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data Processing

+

Contains pre-built processors for data labeling/processing.

+
+
+class dataprofiler.labelers.data_processing.AutoSubRegistrationMeta(clsname: str, bases: tuple[type, ...], attrs: dict[str, object])
+

Bases: ABCMeta

+

For registering subclasses.

+

Create AutoSubRegistration object.

+
+
+mro()
+

Return a type’s method resolution order.

+
+
+
+register(subclass)
+

Register a virtual subclass of an ABC.

+

Returns the subclass, to allow usage as a class decorator.

+
+
+
+
+class dataprofiler.labelers.data_processing.BaseDataProcessor(**parameters: Any)
+

Bases: object

+

Abstract Data processing class.

+

Initialize BaseDataProcessor object.

+
+
+processor_type: str
+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+abstract classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+abstract process(*args: Any, **kwargs: Any) Any
+

Process data.

+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+
+class dataprofiler.labelers.data_processing.BaseDataPreprocessor(**parameters: Any)
+

Bases: BaseDataProcessor

+

Abstract Data preprocessing class.

+

Initialize BaseDataPreprocessor object.

+
+
+processor_type: str = 'preprocessor'
+
+
+
+abstract process(data: np.ndarray, labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32) Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[np.ndarray, np.ndarray] | np.ndarray
+

Preprocess data.

+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+abstract classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.BaseDataPostprocessor(**parameters: Any)
+

Bases: BaseDataProcessor

+

Abstract Data postprocessing class.

+

Initialize BaseDataPostprocessor object.

+
+
+processor_type: str = 'postprocessor'
+
+
+
+abstract process(data: ndarray, results: dict, label_mapping: dict[str, int]) dict
+

Postprocess data.

+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+abstract classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.DirectPassPreprocessor
+

Bases: BaseDataPreprocessor

+

Subclass of BaseDataPreprocessor for preprocessing data.

+

Initialize the DirectPassPreprocessor class.

+
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+process(data: np.ndarray, labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32) tuple[np.ndarray, np.ndarray] | np.ndarray
+

Preprocess data.

+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'preprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.CharPreprocessor(max_length: int = 3400, default_label: str = 'UNKNOWN', pad_label: str = 'PAD', flatten_split: float = 0, flatten_separator: str = ' ', is_separate_at_max_len: bool = False, **kwargs: Any)
+

Bases: BaseDataPreprocessor

+

Subclass of BaseDataPreprocessor for preprocessing char data.

+

Initialize the CharPreprocessor class.

+
+
Parameters
+
    +
  • max_length (int) – Maximum char length in a sample.

  • +
  • default_label (string (could be int, char, etc.)) – Key for label_mapping that is the default label

  • +
  • pad_label (string (could be int, char, etc.)) – Key for label_mapping that is the pad label

  • +
  • flatten_split (float) – approximate output of split between flattened and +non-flattened characters, value between [0, 1]. When the current +flattened split becomes more than the flatten_split value, any +leftover sample or subsequent samples will be non-flattened until +the current flattened split is below the flatten_split value

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • is_separate_at_max_len (bool) – if true, separates at max_length, +otherwise at nearest separator

  • +
+
+
+
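To make the flattening parameters above concrete, here is a small sketch that builds a CharPreprocessor and consumes the generator returned by process() (documented below); the sample strings and label mapping are made up for illustration.

import numpy as np
from dataprofiler.labelers.data_processing import CharPreprocessor

preprocessor = CharPreprocessor(
    max_length=3400,
    default_label="UNKNOWN",
    pad_label="PAD",
    flatten_split=0,              # 0 keeps samples non-flattened
    flatten_separator=" ",
    is_separate_at_max_len=False,
)

data = np.array(["some text to label", "another sample"])
label_mapping = {"PAD": 0, "UNKNOWN": 1}   # hypothetical mapping

# With labels=None, process() yields batches of preprocessed samples.
for batch in preprocessor.process(data, labels=None,
                                  label_mapping=label_mapping, batch_size=32):
    print(type(batch))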
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+process(data: np.ndarray, labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32) Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]
+

Flatten batches of data.

+
+
Parameters
+
    +
  • data (numpy.ndarray) – List of strings to create embeddings for

  • +
  • labels (numpy.ndarray) – labels for each input character

  • +
  • label_mapping (Union[None, dict]) – maps labels to their encoded integers

  • +
  • batch_size (int) – Number of samples in the batch of data

  • +
+
+
Return batch_data
+

A dict containing samples of size batch_size

+
+
Rtype batch_data
+

dicts

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'preprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.CharEncodedPreprocessor(encoding_map: dict[str, int] | None = None, max_length: int = 5000, default_label: str = 'UNKNOWN', pad_label: str = 'PAD', flatten_split: float = 0, flatten_separator: str = ' ', is_separate_at_max_len: bool = False)
+

Bases: CharPreprocessor

+

Subclass of CharPreprocessor for preprocessing char encoded data.

+

Initialize the CharEncodedPreprocessor class.

+
+
Parameters
+
    +
  • encoding_map (dict) – char to int encoding map

  • +
  • max_length (int) – Maximum char length in a sample.

  • +
  • default_label (string (could be int, char, etc.)) – Key for label_mapping that is the default label

  • +
  • pad_label (string (could be int, char, etc.)) – Key for label_mapping that is the pad label

  • +
  • flatten_split (float) – approximate output of split between flattened and +non-flattened characters, value between [0, 1]. When the current +flattened split becomes more than the flatten_split value, any +leftover sample or subsequent samples will be non-flattened until +the current flattened split is below the flatten_split value

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • is_separate_at_max_len (bool) – if true, separates at max_length, +otherwise at nearest separator

  • +
+
+
+
+
+process(data: np.ndarray, labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32) Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]
+

Process structured data for being processed by CharacterLevelCnnModel.

+
+
Parameters
+
    +
  • data (numpy.ndarray) – List of strings to create embeddings for

  • +
  • labels (numpy.ndarray) – labels for each input character

  • +
  • label_mapping (Union[dict, None]) – maps labels to their encoded integers

  • +
  • batch_size (int) – Number of samples in the batch of data

  • +
+
+
Return batch_data
+

A dict containing samples of size batch_size

+
+
Rtype batch_data
+

dict

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'preprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.CharPostprocessor(default_label: str = 'UNKNOWN', pad_label: str = 'PAD', flatten_separator: str = ' ', use_word_level_argmax: bool = False, output_format: str = 'character_argmax', separators: tuple[str, ...] = (' ', ',', ';', "'", '"', ':', '\n', '\t', '.'), word_level_min_percent: float = 0.75)
+

Bases: BaseDataPostprocessor

+

Subclass of BaseDataPostprocessor for postprocessing char data.

+

Initialize the CharPostprocessor class.

+
+
Parameters
+
    +
  • default_label (string (could be int, char, etc.)) – Key for label_mapping that is the default label

  • +
  • pad_label (string (could be int, char, etc.)) – Key for label_mapping that is the pad label

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • use_word_level_argmax (bool) – whether to require the argmax value of +each character in a word to determine the word’s entity

  • +
  • output_format (str) – (character_argmax vs NER) where character_argmax +is a list of encodings for each character in the input text and NER +is in the dict format which specifies start,end,label for each +entity in a sentence

  • +
  • separators (tuple(str)) – list of characters to use for separating words within +the character predictions

  • +
  • word_level_min_percent (float) – threshold on generating dominant +word_level labeling

  • +
+
+
+
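The output_format options above are easiest to see by constructing the postprocessor in its two modes; a minimal sketch, using only parameters listed above:

from dataprofiler.labelers.data_processing import CharPostprocessor

# Character-argmax output: one label per character in each sample.
char_post = CharPostprocessor(output_format="character_argmax")

# NER-style output: start/end/label spans, using word-level voting.
ner_post = CharPostprocessor(
    output_format="NER",
    use_word_level_argmax=True,
    word_level_min_percent=0.75,
)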
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+static convert_to_NER_format(predictions: list[list], label_mapping: dict[str, int], default_label: str, pad_label: str) list[list]
+

Convert word level predictions to specified format.

+
+
Parameters
+
    +
  • predictions (list) – predictions

  • +
  • label_mapping (dict) – labels and corresponding integers

  • +
  • default_label (str) – default label in label_mapping

  • +
  • pad_label (str) – pad label in label_mapping

  • +
+
+
Returns
+

formatted predictions

+
+
Return type
+

list

+
+
+
+
+
+static match_sentence_lengths(data: ndarray, results: dict, flatten_separator: str, inplace: bool = True) dict
+

Convert results from model into same ragged data shapes as original data.

+
+
Parameters
+
    +
  • data (numpy.ndarray) – original input data to the data labeler

  • +
  • results (dict) – dict of model character level predictions and confs

  • +
  • flatten_separator (str) – string which joins two samples together when flattening

  • +
  • inplace (bool) – flag to modify results in place

  • +
+
+
Returns
+

dict(pred=…) or dict(pred=…, conf=…)

+
+
+
+
+
+process(data: ndarray, results: dict, label_mapping: dict[str, int]) dict
+

Conduct processing on data given predictions, label_mapping, and default_label.

+
+
Parameters
+
    +
  • data (Union[np.ndarray, pd.DataFrame]) – original input data to the data labeler

  • +
  • results (dict) – dict of model character level predictions and confs

  • +
  • label_mapping (dict) – labels and corresponding integers

  • +
+
+
Returns
+

dict of predictions and if they exist, confidences

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'postprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.StructCharPreprocessor(max_length: int = 3400, default_label: str = 'UNKNOWN', pad_label: str = 'PAD', flatten_separator: str = '\x01\x01\x01\x01\x01', is_separate_at_max_len: bool = False)
+

Bases: CharPreprocessor

+

Subclass of CharPreprocessor for preprocessing struct char data.

+

Initialize the StructCharPreprocessor class.

+
+
Parameters
+
    +
  • max_length (int) – Maximum char length in a sample.

  • +
  • default_label (string (could be int, char, etc.)) – Key for label_mapping that is the default label

  • +
  • pad_label (string (could be int, char, etc.)) – Key for label_mapping that is the pad label

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • is_separate_at_max_len (bool) – if true, separates at max_length, +otherwise at nearest separator

  • +
+
+
+
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for preprocessors.

+
+
Returns
+

None

+
+
+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+convert_to_unstructured_format(data: np.ndarray, labels: list[str] | npt.NDArray[np.str_] | None) tuple[str, list[tuple[int, int, str]] | None]
+

Convert data samples list to StructCharPreprocessor required input data format.

+
+
Parameters
+
    +
  • data (numpy.ndarray) – list of strings

  • +
  • labels (Optional[Union[List[str], npt.NDArray[np.str_]]]) – labels for each input character

  • +
+
+
Returns
+

data in the following format:
text="<SAMPLE><SEPARATOR><SAMPLE>...",
entities=[(start=<INT>, end=<INT>, label="<LABEL>"), ... (num_samples in data)]

+
+

+
+
Return type
+

Tuple[str, Optional[List[Tuple[int, int, str]]]]

+
+
+
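A small sketch of the conversion described above, assuming one label per sample as is typical for structured data; the values are invented for illustration and the separator defaults to the \x01-based string from the class signature.

import numpy as np
from dataprofiler.labelers.data_processing import StructCharPreprocessor

preprocessor = StructCharPreprocessor()
data = np.array(["john@example.com", "1234"])
labels = np.array(["EMAIL_ADDRESS", "INTEGER"])   # hypothetical entity labels

text, entities = preprocessor.convert_to_unstructured_format(data, labels)
print(text)       # samples joined by the flatten separator
print(entities)   # [(start, end, label), ...] spans into the joined text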
+
+
+process(data: np.ndarray, labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32) Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]
+

Process structured data for being processed by CharacterLevelCnnModel.

+
+
Parameters
+
    +
  • data (numpy.ndarray) – List of strings to create embeddings for

  • +
  • labels (numpy.ndarray) – labels for each input character

  • +
  • label_mapping (Union[dict, None]) – maps labels to their encoded integers

  • +
  • batch_size (int) – Number of samples in the batch of data

  • +
+
+
Return batch_data
+

A dict containing samples of size batch_size

+
+
Rtype batch_data
+

dict

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'preprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.StructCharPostprocessor(default_label: str = 'UNKNOWN', pad_label: str = 'PAD', flatten_separator: str = '\x01\x01\x01\x01\x01', is_pred_labels: bool = True, random_state: random.Random | int | list | tuple | None = None)
+

Bases: BaseDataPostprocessor

+

Subclass of BaseDataPostprocessor for postprocessing struct char data.

+

Initialize the StructCharPostprocessor class.

+
+
Parameters
+
    +
  • default_label (str) – Key for label_mapping that is the default label

  • +
  • pad_label (str) – Key for label_mapping that is the pad label

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • is_pred_labels (bool) – (default: true) if true, will convert the model +indexes to the label strings given the label_mapping

  • +
  • random_state (random.Random) – random state setting to be used for randomly +selecting a prediction when two labels have equal opportunity for +a given sample.

  • +
+
+
+
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+static match_sentence_lengths(data: ndarray, results: dict, flatten_separator: str, inplace: bool = True) dict
+

Convert results from model into same ragged data shapes as original data.

+
+
Parameters
+
    +
  • data (np.ndarray) – original input data to the data labeler

  • +
  • results (dict) – dict of model character level predictions and confs

  • +
  • flatten_separator (str) – string which joins two samples together when flattening

  • +
  • inplace (bool) – flag to modify results in place

  • +
+
+
Returns
+

dict(pred=…) or dict(pred=…, conf=…)

+
+
+
+
+
+convert_to_structured_analysis(sentences: ndarray, results: dict, label_mapping: dict[str, int], default_label: str, pad_label: str) dict
+

Convert unstructured results to a structured column analysis.

+

This assumes the column was flattened into a single sample, and takes the mode of all character predictions except for the separator labels. In case of a tie, choose anything but background; otherwise randomly choose between the remaining labels.

+
+
Parameters
+
    +
  • sentences (numpy.ndarray) – samples which were predicted upon

  • +
  • results (dict) – character predictions for each sample return from model

  • +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
  • default_label (str) – Key for label_mapping that is the default label

  • +
  • pad_label (str) – Key for label_mapping that is the pad label

  • +
+
+
Returns
+

prediction value for a single column

+
+
+
+
+
+process(data: ndarray, results: dict, label_mapping: dict[str, int]) dict
+

Postprocess CharacterLevelCnnModel results when given structured data.

+

Said structured data is processed by StructCharPreprocessor.

+
+
Parameters
+
    +
  • data (Union[numpy.ndarray, pandas.DataFrame]) – original input data to the data labeler

  • +
  • results (dict) – dict of model character level predictions and confs

  • +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
+
+
Returns
+

dict of predictions and if they exist, confidences

+
+
Return type
+

dict

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'postprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.RegexPostProcessor(aggregation_func: str = 'split', priority_order: list | np.ndarray | None = None, random_state: random.Random | int | list | tuple | None = None)
+

Bases: BaseDataPostprocessor

+

Subclass of BaseDataPostprocessor for postprocessing regex data.

+

Initialize the RegexPostProcessor class.

+
+
Parameters
+
    +
  • aggregation_func (str) – aggregation function to apply to regex model +output (split, random, priority)

  • +
  • priority_order (Union[list, numpy.ndarray]) – if priority is set as the aggregation function, +the order in which entities are given priority must be set

  • +
  • random_state (random.Random) – random state setting to be used for randomly +selecting a prediction when two labels have equal opportunity for +a given sample.

  • +
+
+
+
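A minimal sketch of the three aggregation_func choices above; the entity names in priority_order are hypothetical.

from dataprofiler.labelers.data_processing import RegexPostProcessor

# Split votes evenly across all matching entities (default).
split_post = RegexPostProcessor(aggregation_func="split")

# Resolve matches by an explicit priority order over entities.
priority_post = RegexPostProcessor(
    aggregation_func="priority",
    priority_order=["PAD", "UNKNOWN", "EMAIL_ADDRESS"],
)

# Break ties randomly, seeded for reproducibility.
random_post = RegexPostProcessor(aggregation_func="random", random_state=0)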
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+static priority_prediction(results: dict, entity_priority_order: ndarray) None
+

Use priority of regex to give entity determination.

+
+
Parameters
+
    +
  • results (dict) – regex from model in format: dict(pred=…, conf=…)

  • +
  • entity_priority_order (np.ndarray) – list of entity priorities (lowest has +higher priority)

  • +
+
+
Returns
+

None

+
+
+
+
+
+static split_prediction(results: dict) None
+

Split the prediction across votes.

+
+
Parameters
+

results (dict) – regex from model in format: dict(pred=…, conf=…)

+
+
Returns
+

None

+
+
+
+
+
+process(data: ndarray, results: dict, label_mapping: dict[str, int]) dict
+

Postprocess data.

+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'postprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+
+class dataprofiler.labelers.data_processing.StructRegexPostProcessor(random_state: random.Random | int | list | tuple | None = None)
+

Bases: BaseDataPostprocessor

+

Subclass of BaseDataPostprocessor for postprocessing struct regex data.

+

Initialize the RegexPostProcessor class.

+
+
Parameters
+

random_state (random.Random) – random state setting to be used for randomly +selecting a prediction when two labels have equal opportunity for +a given sample.

+
+
+
+
+set_params(**kwargs: Any) None
+

Given kwargs, set the parameters if they exist.

+
+
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+process(data: ndarray, results: dict, label_mapping: dict[str, int]) dict
+

Postprocess data.

+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'postprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+
+class dataprofiler.labelers.data_processing.ColumnNameModelPostprocessor
+

Bases: BaseDataPostprocessor

+

Subclass of BaseDataPostprocessor for postprocessing column name model data.

+

Initialize the ColumnNameModelPostProcessor class.

+
+
+classmethod help() None
+

Describe alterable parameters.

+

Input data formats for preprocessors. +Output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+process(data: np.ndarray, results: dict, label_mapping: dict[str, int] | None = None) dict
+

Postprocess data.

+
+
+
+classmethod get_class(class_name: str) type[BaseDataProcessor] | None
+

Get class of BaseDataProcessor object.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath: str) Processor
+

Load data processor from a given path on disk.

+
+
+
+classmethod load_from_library(name: str) BaseDataProcessor
+

Load data processor from within the library.

+
+
+
+processor_type: str = 'postprocessor'
+
+
+
+save_to_disk(dirpath: str) None
+

Save data processor to a path on disk.

+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.labelers.html b/docs/0.12.0/html/dataprofiler.labelers.html
new file mode 100644
index 000000000..e39017de6
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.labelers.html
@@ -0,0 +1,362 @@
+Labelers - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Labelers

+
+

Modules

+
+
+ +

The following will list the built-in models, processors, and data labelers.

+
+
Models:
  1. CharacterLevelCnnModel - character classification of text.
  2. RegexModel - character classification of text.

Processors:
Preprocessors
  1. CharPreprocessor
  2. StructCharPreprocessor
  3. DirectPassPreprocessor

PostProcessors
  1. CharPostprocessor
  2. StructCharPostprocessor
  3. RegexPostProcessor

Data Labelers:
Classes
  1. UnstructuredDataLabeler
  2. StructuredDataLabeler

Files to load from disk using BaseDataLabeler.load_from_library(<NAME>)
  1. unstructured_model
  2. structured_model
  3. regex_model
+
+
+
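The library names listed above can be loaded directly; a minimal sketch, assuming the packaged labelers ship with the install:

from dataprofiler.labelers.base_data_labeler import BaseDataLabeler

# Any of the listed names works here: unstructured_model, structured_model,
# or regex_model.
labeler = BaseDataLabeler.load_from_library("structured_model")
print(labeler.reverse_label_mapping)   # index-to-label encoding of the loaded model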
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.labelers.labeler_utils.html b/docs/0.12.0/html/dataprofiler.labelers.labeler_utils.html
new file mode 100644
index 000000000..cd5e4acb6
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.labelers.labeler_utils.html
@@ -0,0 +1,575 @@
+Labeler Utils - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Labeler Utils

+

Contains functions for the data labeler.

+
+
+dataprofiler.labelers.labeler_utils.f1_report_dict_to_str(f1_report: dict, label_names: list[str]) str
+

Return the report string from the f1_report dict.

+
+
Example Output:
+
+

                precision  recall  f1-score  support

       class 0       0.00    0.00      0.00        1
       class 1       1.00    0.67      0.80        3

     micro avg       0.67    0.50      0.57        4
     macro avg       0.50    0.33      0.40        4
  weighted avg       0.75    0.50      0.60        4

+

Note: this is generally taken from the classification_report function inside sklearn.

Parameters

  • f1_report (dict) – f1 report dictionary from sklearn

  • label_names (list(str)) – names of labels included in the report

Returns

string representing f1_report printout

Return type

str

+
+
+
+dataprofiler.labelers.labeler_utils.evaluate_accuracy(predicted_entities_in_index: list[list[int]], true_entities_in_index: list[list[int]], num_labels: int, entity_rev_dict: dict[int, str], verbose: bool = True, omitted_labels: tuple[str, ...] = ('PAD', 'UNKNOWN'), confusion_matrix_file: str | None = None) tuple[float, dict]
+

Evaluate accuracy from comparing predicted labels with true labels.

+
+
Parameters
+
    +
  • predicted_entities_in_index (list(array(int))) – predicted encoded labels for input +sentences

  • +
  • true_entities_in_index (list(array(int))) – true encoded labels for input sentences

  • +
  • entity_rev_dict (dict([index, entity])) – dictionary to convert indices to entities

  • +
  • verbose (boolean) – print additional information for debugging

  • +
  • omitted_labels (list() of text labels) – labels to omit from the accuracy evaluation

  • +
  • confusion_matrix_file (str) – File name (and dir) for confusion matrix

  • +
+
+
+

Returns

f1-score

Return type

float

+
+
+
+dataprofiler.labelers.labeler_utils.get_tf_layer_index_from_name(model: tf.keras.Model, layer_name: str) int | None
+

Return the index of the layer given the layer name within a tf model.

+
+
Parameters
+
    +
  • model – tf keras model to search

  • +
  • layer_name – name of the layer to find

  • +
+
+
Returns
+

layer index if it exists or None

+
+
+
+
+
+dataprofiler.labelers.labeler_utils.hide_tf_logger_warnings() None
+

Filter out a set of warnings from the tf logger.

+
+
+
+dataprofiler.labelers.labeler_utils.protected_register_keras_serializable(package: str = 'Custom', name: str | None = None) Callable
+

Protect against already registered keras serializable layers.

+

Ensures that if it was already registered, it will not try to +register it again.

+
+
+
+class dataprofiler.labelers.labeler_utils.FBetaScore(num_classes: int, average: str | None = None, beta: float = 1.0, threshold: float | None = None, name: str = 'fbeta_score', dtype: str | None = None, **kwargs: Any)
+

Bases: Metric

+

Computes F-Beta score.

+

Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283

+

# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ==============================================================================

+

It is the weighted harmonic mean of precision and recall. Output range is [0, 1]. Works for both multi-class and multi-label classification.

$$ F_{\beta} = (1 + \beta^2) \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} $$

Parameters

  • num_classes – Number of unique classes in the dataset.

  • average – Type of averaging to be performed on data. Acceptable values are None, micro, macro and weighted. Default value is None.
  • beta – Determines the weight of precision and recall +in harmonic mean. Determines the weight given to the +precision and recall. Default value is 1.

  • +
  • threshold – Elements of y_pred greater than threshold are +converted to be 1, and the rest 0. If threshold is +None, the argmax is converted to 1, and the rest 0.

  • +
  • name – (Optional) String name of the metric instance.

  • +
  • dtype – (Optional) Data type of the metric result.

  • +
+
+
Returns
+

float.

+
+
Return type
+

F-Beta Score

+
+
+

Initialize FBetaScore class.

+
+
+update_state(y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight: tf.Tensor | None = None) None
+

Update state.

+
+
+
+result() Tensor
+

Return f1 score.

+
+
+
+get_config() dict
+

Return the serializable config of the metric.

+
+
+
+add_variable(shape, initializer, dtype=None, aggregation='sum', name=None)
+
+
+
+add_weight(shape=(), initializer=None, dtype=None, name=None)
+
+
+
+property dtype
+
+
+
+classmethod from_config(config)
+
+
+
+reset_state()
+

Reset all of the metric state variables.

+

This function is called between epochs/steps, +when a metric is evaluated during training.

+
+
+
+stateless_reset_state()
+
+
+
+stateless_result(metric_variables)
+
+
+
+stateless_update_state(metric_variables, *args, **kwargs)
+
+
+
+property variables
+
+
+
+
+class dataprofiler.labelers.labeler_utils.F1Score(num_classes: int, average: str | None = None, threshold: float | None = None, name: str = 'f1_score', dtype: str | None = None)
+

Bases: FBetaScore

+

Computes F-1 Score.

+

# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ==============================================================================

+

It is the harmonic mean of precision and recall. Output range is [0, 1]. Works for both multi-class and multi-label classification.

$$ F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} $$

Parameters

  • num_classes – Number of unique classes in the dataset.

  • average – Type of averaging to be performed on data. Acceptable values are None, micro, macro and weighted. Default value is None.
  • threshold – Elements of y_pred above threshold are +considered to be 1, and the rest 0. If threshold is +None, the argmax is converted to 1, and the rest 0.

  • +
  • name – (Optional) String name of the metric instance.

  • +
  • dtype – (Optional) Data type of the metric result.

  • +
+
+
Returns
+

float.

+
+
Return type
+

F-1 Score

+
+
+

Initialize F1Score object.

+
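A minimal usage sketch for the metric above, outside of a full training loop; the tensors are invented and micro averaging is chosen arbitrarily.

import tensorflow as tf
from dataprofiler.labelers.labeler_utils import F1Score

metric = F1Score(num_classes=3, average="micro")   # threshold=None takes the argmax of y_pred

y_true = tf.constant([[0, 1, 0], [0, 0, 1]], dtype=tf.float32)
y_pred = tf.constant([[0.1, 0.8, 0.1], [0.2, 0.3, 0.5]], dtype=tf.float32)

metric.update_state(y_true, y_pred)
print(metric.result().numpy())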
+
+add_variable(shape, initializer, dtype=None, aggregation='sum', name=None)
+
+
+
+add_weight(shape=(), initializer=None, dtype=None, name=None)
+
+
+
+property dtype
+
+
+
+classmethod from_config(config)
+
+
+
+reset_state()
+

Reset all of the metric state variables.

+

This function is called between epochs/steps, +when a metric is evaluated during training.

+
+
+
+result() Tensor
+

Return f1 score.

+
+
+
+stateless_reset_state()
+
+
+
+stateless_result(metric_variables)
+
+
+
+stateless_update_state(metric_variables, *args, **kwargs)
+
+
+
+update_state(y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight: tf.Tensor | None = None) None
+

Update state.

+
+
+
+property variables
+
+
+
+get_config() dict
+

Get configuration.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.labelers.regex_model.html b/docs/0.12.0/html/dataprofiler.labelers.regex_model.html
new file mode 100644
index 000000000..ccedaa26d
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.labelers.regex_model.html
@@ -0,0 +1,488 @@
+Regex Model - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Regex Model

+

Contains class for regex data labeling model.

+
+
+class dataprofiler.labelers.regex_model.RegexModel(label_mapping: dict[str, int], parameters: Optional[dict] = None)
+

Bases: BaseModel

+

Class for regex data labeling model.

+

Initialize Regex Model.

+
+
Example regex_patterns:

regex_patterns = {
    "LABEL_1": [
        "LABEL_1_pattern_1",
        "LABEL_1_pattern_2",
        ...
    ],
    "LABEL_2": [
        "LABEL_2_pattern_1",
        "LABEL_2_pattern_2",
        ...
    ],
}

Example encapsulators:

encapsulators = {
    'start': r'(?<![\w.\$\%\-])',
    'end': r'(?:(?=(\b|[ ]))|(?=[^\w\%\$]([^\w]|$))|$)',
}

+
+
+
+
Parameters
+
    +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
  • parameters (dict) –

    Contains all the appropriate parameters for the model. +Possible parameters are:

    +
    +

    max_length, max_num_chars, dim_embed

    +
    +

  • +
+
+
Returns
+

None

+
+
+
+
+reset_weights() None
+

Reset weights.

+
+
+
+predict(data: Union[DataFrame, Series, ndarray], batch_size: Optional[int] = None, show_confidences: bool = False, verbose: bool = True) dict
+

Apply the regex patterns (within regex_model) to the input_string.

+

Create predictions for all matching patterns. Each pattern has an +associated entity and the predictions of each character within the +string are given a True or False identification for each entity. All +characters not identified by ANY of the regex patterns in the +pattern_dict are considered background characters, and are replaced with +the default_label value.

+
+
Parameters
+
    +
  • data (iterator) – list of strings to predict upon

  • +
  • batch_size (N/A) – does not impact this model and should be fixed to not +be required.

  • +
  • show_confidences – whether user wants prediction confidences

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

char level predictions and confidences

+
+
Return type
+

dict

+
+
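A rough usage sketch for predict() above. This is illustrative only: the label_mapping and pattern set are invented, and passing the patterns through the parameters dict follows the regex_patterns example in the class docstring rather than a confirmed parameter list.

from dataprofiler.labelers.regex_model import RegexModel

label_mapping = {"PAD": 0, "UNKNOWN": 1, "DIGITS": 2}     # hypothetical labels
parameters = {"regex_patterns": {"DIGITS": [r"[0-9]+"]}}  # assumed parameter name

model = RegexModel(label_mapping, parameters=parameters)
output = model.predict(["abc 123"], show_confidences=False, verbose=False)
print(output["pred"])   # character-level entity votes per sample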
+
+
+
+classmethod load_from_disk(dirpath: str) RegexModel
+

Load whole model from disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to load the model from

+
+
Returns
+

None

+
+
+
+
+
+save_to_disk(dirpath: str) None
+

Save whole model to disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to save the model to

+
+
Returns
+

None

+
+
+
+
+
+add_label(label: str, same_as: str | None = None) None
+

Add a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod get_class(class_name: str) type[BaseModel] | None
+

Get subclasses.

+
+
+
+get_parameters(param_list: list[str] | None = None) dict
+

Return a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (List[str]) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod help() None
+

Help describe alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping: dict[str, int]
+

Return mapping of labels to their encoded values.

+
+
+
+property labels: list[str]
+

Retrieve the label.

+
+
Returns
+

list of labels

+
+
+
+
+
+property num_labels: int
+

Return max label mapping.

+
+
+
+requires_zero_mapping: bool = False
+
+
+
+property reverse_label_mapping: dict[int, str]
+

Return reversed order of current labels.

+

Useful when labels need to be extracted via indices.

+
+
+
+set_label_mapping(label_mapping: list[str] | dict[str, int]) None
+

Set the labels for the model.

+
+
Parameters
+

label_mapping (Union[list, dict]) – label mapping of the model or list of labels to be +converted into the label mapping

+
+
Returns
+

None

+
+
+
+
+
+set_params(**kwargs: Any) None
+

Set the parameters if they exist given kwargs.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.labelers.utils.html b/docs/0.12.0/html/dataprofiler.labelers.utils.html
new file mode 100644
index 000000000..df4e36e67
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.labelers.utils.html
@@ -0,0 +1,310 @@
+Utils - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Utils

+

Contains functions for checking for installations/dependencies.

+
+
+dataprofiler.labelers.utils.warn_missing_module(labeler_function: str, module_name: str) None
+

Return a warning if a given module doesn’t exist.

+
+
Parameters
+
    +
  • labeler_function (str) – Name of the labeler function

  • +
  • module_name (str) – module name that was missing

  • +
+
+
+
+
+
+dataprofiler.labelers.utils.require_module(names: List[str]) Callable
+

Check if a set of modules exists in sys.modules prior to running function.

+

If they do not, give a user a warning and do not run the +function.

+
+
Parameters
+

names (list[str]) – list of module names to check for in sys.modules

+
+
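A short sketch of the decorator above; the module name checked is arbitrary.

from dataprofiler.labelers.utils import require_module

# Runs only if 'tensorflow' is already present in sys.modules; otherwise a
# warning is issued and the call is skipped.
@require_module(["tensorflow"])
def build_model():
    print("building the TF model")

build_model()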
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.plugins.decorators.html b/docs/0.12.0/html/dataprofiler.plugins.decorators.html
new file mode 100644
index 000000000..1f016375d
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.plugins.decorators.html
@@ -0,0 +1,283 @@
+Decorators - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Decorators

+

Contains function for generating plugins data.

+
+
+dataprofiler.plugins.decorators.plugin_decorator(typ, name)
+

Populate plugins_dict with decorated plugin functions.

+
+
Parameters
+
    +
  • typ – Broader classification/type of a plugin

  • +
  • name – Specific name of a plugin

  • +
+
+
Returns
+

function

+
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.plugins.html b/docs/0.12.0/html/dataprofiler.plugins.html
new file mode 100644
index 000000000..2d54f63e8
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.plugins.html
@@ -0,0 +1,321 @@
+Plugins - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Plugins

+
+

Modules

+
+
+
+ +
+
+
+dataprofiler.plugins.load_plugins()
+

Dig through the plugins folder for possible plugins to be imported and consequently added to the plugins_dict if properly decorated.

+
+
Returns
+

None

+
+
+
+
+
+dataprofiler.plugins.get_plugins(typ)
+

Fetch a dictionary of plugins of a certain type.

+
+
Parameters
+

typ – Broader classification/type of a plugin

+
+
Returns
+

dict

+
+
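A minimal sketch of registering and retrieving a plugin with the decorator and get_plugins() documented above; the type and name used are made up for illustration.

from dataprofiler.plugins.decorators import plugin_decorator
from dataprofiler.plugins import get_plugins

@plugin_decorator(typ="report", name="my_report_hook")   # hypothetical type/name
def my_report_hook(profile):
    return {"rows": profile.get("row_count")}

print(get_plugins("report"))   # dict of registered plugins of that type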
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.base_column_profilers.html b/docs/0.12.0/html/dataprofiler.profilers.base_column_profilers.html
new file mode 100644
index 000000000..9f205ec87
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.base_column_profilers.html
@@ -0,0 +1,459 @@
+Base Column Profilers - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Column Profilers

+

Contains parent column profiler class.

+
+
+class dataprofiler.profilers.base_column_profilers.BaseColumnProfiler(name: str | None, options: BaseOption | None = None)
+

Bases: Generic[BaseColumnProfilerT]

+

Abstract class for profiling a column of data.

+

Initialize base class properties for the subclass.

+
+
Parameters
+

name (String) – Name of the dataset

+
+
+
+
+col_type = None
+
+
+
+diff(other_profile: BaseColumnProfilerT, options: Optional[dict] = None) dict
+

Find the differences for columns.

+
+
Parameters
+

other_profile (BaseColumnProfiler) – profile to find the difference with

+
+
Returns
+

the stat differences

+
+
Return type
+

dict

+
+
+
+
+
+abstract update(df_series: DataFrame) BaseColumnProfiler
+

Update the profile.

+
+
Parameters
+

df_series (Pandas Dataframe) – Data to profile.

+
+
+
+
+
+abstract property profile: dict
+

Return the profile of the column.

+
+
+
+abstract report(remove_disabled_flag: bool = False) dict
+

Return report.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled +options should be excluded in the report.

+
+
+
+
+
+classmethod load_from_dict(data: dict[str, Any], config: dict | None = None) BaseColumnProfilerT
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

BaseColumnProfiler

+
+
+
+
+
+
+class dataprofiler.profilers.base_column_profilers.BaseColumnPrimitiveTypeProfiler(name: str | None)
+

Bases: BaseColumnProfiler[BaseColumnPrimitiveTypeProfilerT]

+

Abstract class for profiling a primitive data type for a column of data.

+

Initialize base class properties for the subclass.

+
+
Parameters
+

name (String) – Name of the data

+
+
+
+
+sample_size: int
+
+
+
+col_type = None
+
+
+
+diff(other_profile: BaseColumnProfilerT, options: Optional[dict] = None) dict
+

Find the differences for columns.

+
+
Parameters
+

other_profile (BaseColumnProfiler) – profile to find the difference with

+
+
Returns
+

the stat differences

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_dict(data: dict[str, Any], config: dict | None = None) BaseColumnProfilerT
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

BaseColumnProfiler

+
+
+
+
+
+abstract property profile: dict
+

Return the profile of the column.

+
+
+
+abstract report(remove_disabled_flag: bool = False) dict
+

Return report.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled +options should be excluded in the report.

+
+
+
+
+
+abstract update(df_series: DataFrame) BaseColumnProfiler
+

Update the profile.

+
+
Parameters
+

df_series (Pandas Dataframe) – Data to profile.

+
+
+
+
+
+name: str | None
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+thread_safe: bool
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.categorical_column_profile.html b/docs/0.12.0/html/dataprofiler.profilers.categorical_column_profile.html
new file mode 100644
index 000000000..a60b4be6e
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.categorical_column_profile.html
@@ -0,0 +1,451 @@
+Categorical Column Profile - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Categorical Column Profile

+

Contains class for categorical column profiler.

+
+
+class dataprofiler.profilers.categorical_column_profile.CategoricalColumn(name: str | None, options: CategoricalOptions = None)
+

Bases: BaseColumnProfiler[CategoricalColumn]

+

Categorical column profile subclass of BaseColumnProfiler.

+

Represents a column in the dataset which is a categorical column.

+

Initialize column base properties and itself.

+
+
Parameters
+

name (String) – Name of data

+
+
+
+
+type = 'category'
+
+
+
+property gini_impurity: float | None
+

Return Gini Impurity.

+

Gini Impurity is a way to calculate +likelihood of an incorrect classification of a new instance of +a random variable.

+

G = Σ(i=1..J) P(i) * (1 - P(i)), where i ranges over the category classes. We traverse the categories and compute this from the column's category probabilities.

+
+
Returns
+

None or Gini Impurity probability

+
+
+
+
+
+property unalikeability: float | None
+

Return unalikeability.

+

Unalikeability checks for “how often observations differ from one another.” Reference: Perry, M. and Kader, G. Variation as Unalikeability. Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

+

U = Σ(i=1..n) Σ(j=1..n) Cij / (n² - n), where Cij = 1 if observations i and j differ and 0 otherwise.

+
+
Returns
+

None or unalikeability probability

+
+
+
+
+
+diff(other_profile: CategoricalColumn, options: Optional[dict] = None) dict
+

Find the differences for CategoricalColumns.

+
+
Parameters
+

other_profile (CategoricalColumn) – profile to find the difference with

+
+
Returns
+

the CategoricalColumn differences

+
+
Return type
+

dict

+
+
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Return report.

+

This is a private abstract method.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled +options should be excluded in the report.

+
+
+
+
+
+classmethod load_from_dict(data: dict, config: dict | None = None)
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

CategoricalColumn

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+

For categorical_count, it will display the top k categories most +frequently occurred in descending order.

+
+
+
+property categories: list[str]
+

Return categories.

+
+
+
+property categorical_counts: dict[str, int]
+

Return counts of each category.

+
+
+
+property unique_ratio: float
+

Return ratio of unique categories to sample_size.

+
+
+
+property unique_count: int
+

Return the number of unique categories.

+
+
+
+property is_match: bool
+

Return true if column is categorical.

+
+
+
+col_type = None
+
+
+
+name: str | None
+
+
+
+sample_size: int
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+thread_safe: bool
+
+
+
+update(df_series: Series) CategoricalColumn
+

Update the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – Data to profile.

+
+
Returns
+

updated CategoricalColumn

+
+
Return type
+

CategoricalColumn

+
+
+
+
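A brief usage sketch for the profiler documented above; the column values are invented and some statistics may be None depending on options.

import pandas as pd
from dataprofiler.profilers.categorical_column_profile import CategoricalColumn

profiler = CategoricalColumn(name="color")
profiler.update(pd.Series(["red", "blue", "red", "green", "red"]))

print(profiler.categories)          # unique categories observed
print(profiler.categorical_counts)  # count per category
print(profiler.unique_ratio)        # unique categories / sample_size
print(profiler.gini_impurity)       # may be None depending on options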
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.column_profile_compilers.html b/docs/0.12.0/html/dataprofiler.profilers.column_profile_compilers.html
new file mode 100644
index 000000000..845d5962f
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.column_profile_compilers.html
@@ -0,0 +1,679 @@
+Column Profile Compilers - Data Profiler v0.12.0
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Column Profile Compilers

+

For generating a report.

+
+
+class dataprofiler.profilers.column_profile_compilers.BaseCompiler(df_series: Optional[Series] = None, options: Optional[StructuredOptions] = None, pool: Optional[Pool] = None)
+

Bases: Generic[BaseCompilerT]

+

Abstract class for generating a report.

+

Initialize BaseCompiler object.

+
+
+abstract report(remove_disabled_flag: bool = False) dict
+

Return report.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled options should be excluded in report.

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
+
+diff(other: BaseCompilerT, options: Optional[dict] = None) dict
+

Find the difference between 2 compilers and returns the report.

+
+
Parameters
+

other (BaseCompiler) – profile compiler finding the difference with this one.

+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+update_profile(df_series: Series, pool: Pool = None) BaseCompiler | None
+

Update the profiles from the data frames.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column; df_series is assumed to contain strings

  • +
  • pool (multiprocessing.Pool) – pool to be utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseCompiler
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Compiler with attributes populated.

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+
+class dataprofiler.profilers.column_profile_compilers.ColumnPrimitiveTypeProfileCompiler(df_series: Optional[Series] = None, options: Optional[StructuredOptions] = None, pool: Optional[Pool] = None)
+

Bases: BaseCompiler[ColumnPrimitiveTypeProfileCompiler]

+

For generating ordered column profile reports.

+

Initialize BaseCompiler object.

+
+
+report(remove_disabled_flag: bool = False) dict
+

Return report.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled options should be excluded in report.

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
+
+property selected_data_type: str | None
+

Find the selected data_type in a primitive compiler.

+
+
Returns
+

name of the selected data type

+
+
Return type
+

str

+
+
+
+
+
+diff(other: ColumnPrimitiveTypeProfileCompiler, options: Optional[dict] = None) dict
+

Find the difference between 2 compilers and returns the report.

+
+
Parameters
+

other (ColumnPrimitiveTypeProfileCompiler) – profile compiler finding the difference with this one.

+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseCompiler
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Compiler with attributes populated.

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+update_profile(df_series: Series, pool: Pool = None) BaseCompiler | None
+

Update the profiles from the data frames.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column; df_series is assumed to contain strings

  • +
  • pool (multiprocessing.Pool) – pool to be utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
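A minimal usage sketch for this compiler (the series values and printed results are illustrative assumptions):

import pandas as pd
from dataprofiler.profilers.column_profile_compilers import ColumnPrimitiveTypeProfileCompiler

series = pd.Series(["1.0", "2.5", "3.25"])
compiler = ColumnPrimitiveTypeProfileCompiler(series)

# the compiler runs each primitive-type profiler and selects the best match
print(compiler.selected_data_type)                 # e.g. "float"
print(compiler.report(remove_disabled_flag=True))  # per-type statistics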
+
+
+class dataprofiler.profilers.column_profile_compilers.ColumnStatsProfileCompiler(df_series: Optional[Series] = None, options: Optional[StructuredOptions] = None, pool: Optional[Pool] = None)
+

Bases: BaseCompiler[ColumnStatsProfileCompiler]

+

For generating OrderColumn and CategoricalColumn reports.

+

Initialize BaseCompiler object.

+
+
+report(remove_disabled_flag: bool = False) dict
+

Return report.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled options should be excluded in report.

+
+
+
+
+
+diff(other: ColumnStatsProfileCompiler, options: Optional[dict] = None) dict
+

Find the difference between 2 compilers and returns the report.

+
+
Parameters
+

other (ColumnStatsProfileCompiler) – profile compiler finding the difference with this one.

+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseCompiler
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Compiler with attributes populated.

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
+
+update_profile(df_series: Series, pool: Pool = None) BaseCompiler | None
+

Update the profiles from the data frames.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column; df_series is assumed to contain strings

  • +
  • pool (multiprocessing.Pool) – pool to be utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+
+class dataprofiler.profilers.column_profile_compilers.ColumnDataLabelerCompiler(df_series: Optional[Series] = None, options: Optional[StructuredOptions] = None, pool: Optional[Pool] = None)
+

Bases: BaseCompiler[ColumnDataLabelerCompiler]

+

For generating DataLabelerColumn report.

+

Initialize BaseCompiler object.

+
+
+report(remove_disabled_flag: bool = False) dict
+

Return report.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled options should be excluded in report.

+
+
+
+
+
+diff(other: ColumnDataLabelerCompiler, options: Optional[dict] = None) dict
+

Find the difference between 2 compilers and return the report.

+
+
Parameters
+
    +
  • other (ColumnDataLabelerCompiler) – profile compiler finding the difference with this one.

  • +
  • options (dict) – options to change results of the difference

  • +
+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseCompiler
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Compiler with attributes populated.

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
+
+update_profile(df_series: Series, pool: Pool = None) BaseCompiler | None
+

Update the profiles from the data frames.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column; df_series is assumed to contain strings

  • +
  • pool (multiprocessing.Pool) – pool to be utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+
+class dataprofiler.profilers.column_profile_compilers.UnstructuredCompiler(df_series: Optional[Series] = None, options: Optional[StructuredOptions] = None, pool: Optional[Pool] = None)
+

Bases: BaseCompiler[UnstructuredCompiler]

+

For generating TextProfiler and UnstructuredLabelerProfile reports.

+

Initialize BaseCompiler object.

+
+
+report(remove_disabled_flag: bool = False) dict
+

Report profile attrs of class and potentially pop val from self.profile.

+
+
+
+diff(other: UnstructuredCompiler, options: Optional[dict] = None) dict
+

Find the difference between 2 compilers and return the report.

+
+
Parameters
+
    +
  • other (UnstructuredCompiler) – profile compiler finding the difference with this one.

  • +
  • options (dict) – options to impact the results of the diff

  • +
+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseCompiler
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Compiler with attributes populated.

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
+
+update_profile(df_series: Series, pool: Pool = None) BaseCompiler | None
+

Update the profiles from the data frames.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column; df_series is assumed to contain strings

  • +
  • pool (multiprocessing.Pool) – pool to be utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.data_labeler_column_profile.html b/docs/0.12.0/html/dataprofiler.profilers.data_labeler_column_profile.html
new file mode 100644
index 000000000..7b676a8f9
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.data_labeler_column_profile.html
@@ -0,0 +1,448 @@
+Data Labeler Column Profile - Data Profiler v0.12.0

Data Labeler Column Profile

+

Contains class for profiling the data labeler column.

+
+
+class dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn(name: str | None, options: DataLabelerOptions = None)
+

Bases: BaseColumnProfiler[DataLabelerColumn]

+

Subclass of BaseColumnProfiler for profiling the data labeler column.

+

Initialize Data Label profiling for structured datasets.

+
+
Parameters
+
    +
  • name (String) – name of column being profiled

  • +
  • options (DataLabelerOptions) – Options for the data labeler column

  • +
+
+
+
+
+type = 'data_labeler'
+
+
+
+thread_safe: bool
+
+
+
+static assert_equal_conditions(data_labeler: DataLabelerColumn, data_labeler2: DataLabelerColumn) None
+

Ensure data labelers have the same values. Raise error otherwise.

+
+
Parameters
+
+
+
Returns
+

None

+
+
+
+
+
+property reverse_label_mapping: dict
+

Return reverse label mapping.

+
+
+
+property possible_data_labels: list[str]
+

Return possible data labels.

+
+
+
+property rank_distribution: dict[str, int]
+

Return rank distribution.

+
+
+
+property sum_predictions: ndarray
+

Sum predictions.

+
+
+
+property data_label: str | None
+

Return data labels which best fit data it has seen based on DataLabeler used.

+

Data labels must be within the minimum probability differential of the top predicted value. If no label exceeds the minimum top-label value, the profiler reports that it could not determine the data label.

+
+
+
+property avg_predictions: dict[str, float] | None
+

Average all sample predictions for each data label.

+
+
+
+property label_representation: dict[str, float] | None
+

Represent label found within the dataset based on ranked voting.

+

When top_k=1, this is simply the distribution of data labels found +within the dataset.

+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
+
+classmethod load_from_dict(data, config: dict | None = None) DataLabelerColumn
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

DataLabelerColumn

+
+
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Return report.

+

Private abstract method.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled +options should be excluded in the report.

+
+
+
+
+
+col_type = None
+
+
+
+diff(other_profile: DataLabelerColumn, options: Optional[dict] = None) dict
+

Generate differences between two DataLabeler columns.

+
+
Returns
+

Dict containing the differences between the two DataLabelerColumn profiles in their appropriate output formats

+
+
Return type
+

dict

+
+
+
+name: str | None
+
+
+
+sample_size: int
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+update(df_series: Series) DataLabelerColumn
+

Update the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

updated DataLabelerColumn

+
+
Return type
+

DataLabelerColumn

+
+
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.datetime_column_profile.html b/docs/0.12.0/html/dataprofiler.profilers.datetime_column_profile.html
new file mode 100644
index 000000000..643b60fb4
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.datetime_column_profile.html
@@ -0,0 +1,407 @@
+Datetime Column Profile - Data Profiler v0.12.0

Datetime Column Profile

+

Contains class for profiling datetime column.

+
+
+class dataprofiler.profilers.datetime_column_profile.DateTimeColumn(name: str | None, options: DateTimeOptions = None)
+

Bases: BaseColumnPrimitiveTypeProfiler[DateTimeColumn]

+

Datetime column profile subclass of BaseColumnProfiler.

+

Represents a column in the dataset which is a datetime column.

+

Initialize it and the column base properties.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (DateTimeOptions) – Options for the datetime column

  • +
+
+
+
+
+type = 'datetime'
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Return report.

+

Private abstract method.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled +options should be excluded in the report.

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None)
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

DateTimeColumn

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
+
+property data_type_ratio: float | None
+

Calculate the ratio of samples which match this data type.

+
+
Returns
+

ratio of data type

+
+
Return type
+

float

+
+
+
+
+
+diff(other_profile: DateTimeColumn, options: Optional[dict] = None) dict
+

Generate differences between max, min, and formats of two DateTime cols.

+
+
Returns
+

Dict containing the differences between max, min, and formats in their appropriate output formats

+
+
Return type
+

dict

+
+
+
+update(df_series: Series) DateTimeColumn
+

Update the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

None

+
+
+
+
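A minimal usage sketch (the sample dates and the printed ratio are illustrative assumptions):

import pandas as pd
from dataprofiler.profilers.datetime_column_profile import DateTimeColumn

profile = DateTimeColumn("created_at")
profile.update(pd.Series(["2013-03-05", "2014-07-21", "2011-01-30"]))

# every sample parsed as a datetime, so the ratio is 1.0
print(profile.data_type_ratio)
# the profile includes min, max, and the detected date formats
print(profile.profile)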
+
+col_type = None
+
+
+
+name: str | None
+
+
+
+sample_size: int
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+thread_safe: bool
+
+
+
+match_count: int
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.float_column_profile.html b/docs/0.12.0/html/dataprofiler.profilers.float_column_profile.html
new file mode 100644
index 000000000..89b2d1473
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.float_column_profile.html
@@ -0,0 +1,548 @@
+Float Column Profile - Data Profiler v0.12.0

Float Column Profile

+

Float profile analysis for individual col within structured profiling.

+
+
+class dataprofiler.profilers.float_column_profile.FloatColumn(name: str | None, options: FloatOptions = None)
+

Bases: NumericStatsMixin[FloatColumn], BaseColumnPrimitiveTypeProfiler[FloatColumn]

+

Float column profile mixin with numerical stats.

+

Represents a column in the dataset which is a float column.

+

Initialize column base properties and itself.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (FloatOptions) – Options for the float column

  • +
+
+
+
+
+type: str | None = 'float'
+
+
+
+diff(other_profile: FloatColumn, options: Optional[dict] = None) dict
+

Find the differences for FloatColumns.

+
+
Parameters
+

other_profile (FloatColumn) – profile to find the difference with

+
+
Returns
+

the FloatColumn differences

+
+
Return type
+

dict

+
+
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Report profile attribute of class; potentially pop val from self.profile.

+
+
+
+classmethod load_from_dict(data, config: dict | None = None)
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

FloatColumn

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
Returns
+

+
+
+
+
+
+property precision: dict[str, float | None]
+

Report statistics on the significant figures of each element in the data.

+
+
Returns
+

Precision statistics

+
+
Return type
+

dict

+
+
+
+
+
+property data_type_ratio: float | None
+

Calculate the ratio of samples which match this data type.

+
+
Returns
+

ratio of data type

+
+
Return type
+

float

+
+
+
+
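A minimal usage sketch (the values are illustrative assumptions):

import pandas as pd
from dataprofiler.profilers.float_column_profile import FloatColumn

profile = FloatColumn("amount")
profile.update(pd.Series(["1.25", "2.5", "3.75", "10.0"]))

print(profile.mean)             # arithmetic mean of the matched floats
print(profile.precision)        # significant-figure statistics described above
print(profile.data_type_ratio)  # fraction of samples that matched the float type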
+
+col_type = None
+
+
+
+static is_float(x: str) bool
+

Return True if x is float.

+

For “0.80” this function returns True +For “1.00” this function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is float or not

+
+
Return type
+

bool

+
+
+
+
+
+static is_int(x: str) bool
+

Return True if x is integer.

+

For “0.80” This function returns False +For “1.00” This function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is integer or not

+
+
Return type
+

bool

+
+
+
+
+
+property kurtosis: float | np.float64
+

Return kurtosis value.

+
+
+
+property mean: float | np.float64
+

Return mean value.

+
+
+
+property median: float
+

Estimate the median of the data.

+
+
Returns
+

the median

+
+
Return type
+

float

+
+
+
+
+
+property median_abs_deviation: float | np.float64
+

Get median absolute deviation estimated from the histogram of the data.

+
+

The procedure: subtract the bin edges from the median value; fold the histogram into positive and negative parts around zero; impose the two sets of bin edges on the two histograms; calculate the counts for the two histograms with the imposed bin edges; superimpose the counts from the two histograms; interpolate the median absolute deviation from the superimposed counts.

+
+
+
Returns
+

median absolute deviation

+
+
+
+
+
+property mode: list[float]
+

Find an estimate for the mode[s] of the data.

+
+
Returns
+

the mode(s) of the data

+
+
Return type
+

list(float)

+
+
+
+
+
+static np_type_to_type(val: Any) Any
+

Convert numpy variables to base python type variables.

+
+
Parameters
+

val (numpy type or base type) – value to check & change

+
+
Return val
+

base python type

+
+
Rtype val
+

int or float

+
+
+
+
+
+property skewness: float | np.float64
+

Return skewness value.

+
+
+
+property stddev: float | np.float64
+

Return stddev value.

+
+
+
+update(df_series: Series) FloatColumn
+

Update the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

updated FloatColumn

+
+
Return type
+

FloatColumn

+
+
+
+
+
+property variance: float | np.float64
+

Return variance.

+
+
+
+name: str | None
+
+
+
+sample_size: int
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+thread_safe: bool
+
+
+
+match_count: int
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.graph_profiler.html b/docs/0.12.0/html/dataprofiler.profilers.graph_profiler.html
new file mode 100644
index 000000000..650f44d2c
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.graph_profiler.html
@@ -0,0 +1,384 @@
+Graph Profiler - Data Profiler v0.12.0

Graph Profiler

+

Class and functions to calculate and profile properties of graph data.

+
+
+class dataprofiler.profilers.graph_profiler.GraphProfiler(data: nx.Graph | GraphData, options: ProfilerOptions = None)
+

Bases: object

+

GraphProfiler class.

+

Creates a profile describing a graph dataset and the statistical properties of the graph.

+

Initialize Graph Profiler.

+
+
Parameters
+
+
+
+
+
+times: dict[str, float]
+

Properties

+
+
+
+property profile: dict
+

Return the profile of the graph.

+
+
Returns
+

the profile of the graph in data

+
+
+
+
+
+diff(other_profile: GraphProfiler, options: Optional[dict] = None) dict
+

Find the differences for two graph profiles.

+
+
Parameters
+
    +
  • other_profile (GraphProfiler) – profile to find the difference with

  • +
  • options (dict) – options for diff output

  • +
+
+
Returns
+

the difference between profiles

+
+
Return type
+

dict

+
+
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Report on profile attribute of the class.

+

Pop value from self.profile if key not in self.__calculations

+
+
+
+update(graph: Graph) GraphProfiler
+

Update the graph profile.

+
+
Parameters
+

graph (NetworkX Graph) – networkx graph to profile

+
+
Returns
+

None

+
+
+
+
+
+save(filepath: Optional[str] = None) None
+

Save profiler to disk.

+
+
Parameters
+

filepath (String) – Path of file to save to

+
+
Returns
+

None

+
+
+
+
+
+classmethod load(filepath: str) GraphProfiler
+

Load profiler from disk.

+
+
Parameters
+

filepath (String) – Path of file to load from

+
+
Returns
+

GraphProfiler being loaded

+
+
Return type
+

GraphProfiler

+
+
+
+
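A minimal sketch of the GraphProfiler workflow (the bare example graph and the file name are assumptions; real graph data would typically carry edge attributes, which are what the statistical properties are computed from):

import networkx as nx
from dataprofiler.profilers.graph_profiler import GraphProfiler

graph = nx.Graph()
graph.add_edges_from([(1, 2), (2, 3), (3, 1)])

profiler = GraphProfiler(graph)
profiler.update(graph)          # compute the statistical properties
print(profiler.report())

# persist and restore the profiler (the file name is arbitrary)
profiler.save("graph_profile.pkl")
restored = GraphProfiler.load("graph_profile.pkl")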
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.helpers.html b/docs/0.12.0/html/dataprofiler.profilers.helpers.html
new file mode 100644
index 000000000..88b44ca4d
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.helpers.html
@@ -0,0 +1,335 @@
+Helpers - Data Profiler v0.12.0

Helpers

+
+

Modules

+
+
+ +

This package provides helper functions for generating reports.

+
+
+dataprofiler.profilers.helpers.calculate_quantiles(num_quantile_groups: int, quantiles: dict[int, int]) dict[int, int]
+

Calculate and return quantiles.

+
+
Parameters
+
    +
  • num_quantile_groups (int) – number of quantile groups

  • +
  • quantiles (dict[int, int]) – original quantiles

  • +
+
+
Returns
+

calculated quantiles

+
+
Return type
+

dict[int, int]

+
+
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.helpers.report_helpers.html b/docs/0.12.0/html/dataprofiler.profilers.helpers.report_helpers.html
new file mode 100644
index 000000000..7cc6f419f
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.helpers.report_helpers.html
@@ -0,0 +1,326 @@
+Report Helpers - Data Profiler v0.12.0

Report Helpers

+

Contains helper functions for generating report.

+
+
+dataprofiler.profilers.helpers.report_helpers.calculate_quantiles(num_quantile_groups: int, quantiles: dict[int, int]) dict[int, int]
+

Calculate and return quantiles.

+
+
Parameters
+
    +
  • num_quantile_groups (int) – number of quantile groups

  • +
  • quantiles (dict[int, int]) – original quantiles

  • +
+
+
Returns
+

calculated quantiles

+
+
Return type
+

dict[int, int]

+
+
+
+
+
+dataprofiler.profilers.helpers.report_helpers.flat_dict(od: dict, separator: str = '_', key: str = '') dict
+

Flatten nested dictionary.

+

Each level is collapsed and joined with the specified separator.

+
+
Parameters
+
    +
  • od (dict) – dictionary or dictionary-like object

  • +
  • separator (str) – character(s) joining successive levels

  • +
  • key (str) – concatenated keys

  • +
+
+
Returns
+

unnested dictionary

+
+
Return type
+

dict

+
+
+
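For instance (a small sketch; the nested dictionary is an illustrative assumption):

from dataprofiler.profilers.helpers.report_helpers import flat_dict

nested = {"statistics": {"min": 1, "max": 9}, "order": "ascending"}

# successive levels are joined with the separator, e.g. "statistics_min"
print(flat_dict(nested))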
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.histogram_utils.html b/docs/0.12.0/html/dataprofiler.profilers.histogram_utils.html
new file mode 100644
index 000000000..d1842b186
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.histogram_utils.html
@@ -0,0 +1,289 @@
+Histogram Utils - Data Profiler v0.12.0
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.html b/docs/0.12.0/html/dataprofiler.profilers.html
new file mode 100644
index 000000000..28bca8409
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.html
@@ -0,0 +1,343 @@
+Profilers - Data Profiler v0.12.0
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.int_column_profile.html b/docs/0.12.0/html/dataprofiler.profilers.int_column_profile.html
new file mode 100644
index 000000000..c4d27586f
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.int_column_profile.html
@@ -0,0 +1,541 @@
+Int Column Profile - Data Profiler v0.12.0

Int Column Profile

+

Int profile analysis for individual col within structured profiling.

+
+
+class dataprofiler.profilers.int_column_profile.IntColumn(name: str | None, options: IntOptions = None)
+

Bases: NumericStatsMixin[IntColumn], BaseColumnPrimitiveTypeProfiler[IntColumn]

+

Integer column profile mixin with numerical stats.

+

Represents a column in the dataset which is an integer column.

+

Initialize column base properties and itself.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (IntOptions) – Options for the integer column

  • +
+
+
+
+
+type: str | None = 'int'
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Return the report.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled +options should be excluded in the report.

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None)
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

IntColumn

+
+
+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
Returns
+

+
+
+
+
+
+property data_type_ratio: float | None
+

Calculate the ratio of samples which match this data type.

+
+
Returns
+

ratio of data type

+
+
Return type
+

float

+
+
+
+
+
+update(df_series: Series) IntColumn
+

Update the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

updated IntColumn

+
+
Return type
+

IntColumn

+
+
+
+
+
+col_type = None
+
+
+
+diff(other_profile: NumericStatsMixinT, options: Optional[dict] = None) dict
+

Find the differences for several numerical stats.

+
+
Parameters
+

other_profile (NumericStatsMixin Profile) – profile to find the difference with

+
+
Returns
+

the numerical stats differences

+
+
Return type
+

dict

+
+
+
+
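A sketch of diffing two integer columns (the values are illustrative assumptions; the keys of the returned dict come from the numerical stats being compared):

import pandas as pd
from dataprofiler.profilers.int_column_profile import IntColumn

col_a = IntColumn("a")
col_a.update(pd.Series(["1", "2", "3", "4"]))

col_b = IntColumn("b")
col_b.update(pd.Series(["2", "4", "6", "8"]))

# returns a dict of per-statistic differences (e.g. of min, max, mean)
print(col_a.diff(col_b))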
+
+static is_float(x: str) bool
+

Return True if x is float.

+

For “0.80” this function returns True +For “1.00” this function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is float or not

+
+
Return type
+

bool

+
+
+
+
+
+static is_int(x: str) bool
+

Return True if x is integer.

+

For “0.80” This function returns False +For “1.00” This function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is integer or not

+
+
Return type
+

bool

+
+
+
+
+
+property kurtosis: float | np.float64
+

Return kurtosis value.

+
+
+
+property mean: float | np.float64
+

Return mean value.

+
+
+
+property median: float
+

Estimate the median of the data.

+
+
Returns
+

the median

+
+
Return type
+

float

+
+
+
+
+
+property median_abs_deviation: float | np.float64
+

Get median absolute deviation estimated from the histogram of the data.

+
+

The procedure: subtract the bin edges from the median value; fold the histogram into positive and negative parts around zero; impose the two sets of bin edges on the two histograms; calculate the counts for the two histograms with the imposed bin edges; superimpose the counts from the two histograms; interpolate the median absolute deviation from the superimposed counts.

+
+
+
Returns
+

median absolute deviation

+
+
+
+
+
+property mode: list[float]
+

Find an estimate for the mode[s] of the data.

+
+
Returns
+

the mode(s) of the data

+
+
Return type
+

list(float)

+
+
+
+
+
+static np_type_to_type(val: Any) Any
+

Convert numpy variables to base python type variables.

+
+
Parameters
+

val (numpy type or base type) – value to check & change

+
+
Return val
+

base python type

+
+
Rtype val
+

int or float

+
+
+
+
+
+property skewness: float | np.float64
+

Return skewness value.

+
+
+
+property stddev: float | np.float64
+

Return stddev value.

+
+
+
+property variance: float | np.float64
+

Return variance.

+
+
+
+name: str | None
+
+
+
+sample_size: int
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+thread_safe: bool
+
+
+
+match_count: int
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.json_decoder.html b/docs/0.12.0/html/dataprofiler.profilers.json_decoder.html
new file mode 100644
index 000000000..5d4e7f501
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.json_decoder.html
@@ -0,0 +1,548 @@
+JSON Decoder - Data Profiler v0.12.0

JSON Decoder

+

Contains methods to decode components of a Profiler.

+
+
+dataprofiler.profilers.json_decoder.get_column_profiler_class(class_name: str) type[BaseColumnProfiler]
+

Use name of class to return default-constructed version of that class.

+
+
Raises ValueError if class_name is not the name of a subclass of BaseColumnProfiler.

+
+
+
+
Parameters
+

class_name (str representing name of class) – name of BaseColumnProfiler subclass retrieved by +calling type(instance).__name__

+
+
Returns
+

subclass of BaseColumnProfiler object

+
+
+
+
+
+dataprofiler.profilers.json_decoder.get_compiler_class(class_name: str) type[col_pro_compiler.BaseCompiler]
+

Use name of class to return default-constructed version of that class.

+
+
Raises ValueError if class_name is not the name of a subclass of BaseCompiler.

+
+
+
+
Parameters
+

class_name (str representing name of class) – name of BaseCompiler subclass retrieved by +calling type(instance).__name__

+
+
Returns
+

subclass of BaseCompiler object

+
+
+
+
+
+dataprofiler.profilers.json_decoder.get_option_class(class_name: str) type[BaseOption]
+

Use name of class to return default-constructed version of that class.

+
+
Raises ValueError if class_name is not the name of a subclass of BaseOptions.

+
+
+
+
Parameters
+

class_name (str representing name of class) – name of BaseOptions subclass retrieved by +calling type(instance).__name__

+
+
Returns
+

subclass of BaseOptions object

+
+
+
+
+
+dataprofiler.profilers.json_decoder.get_profiler_class(class_name: str) type[BaseProfiler]
+

Use name of class to return default-constructed version of that class.

+
+
Raises ValueError if class_name is not the name of a subclass of BaseProfiler.

+
+
+
+
Parameters
+

class_name (str representing name of class) – name of BaseProfiler subclass retrieved by +calling type(instance).__name__

+
+
Raises
+

ValueError if the profiler class does not exist

+
+
Returns
+

subclass of BaseProfiler object

+
+
+
+
+
+dataprofiler.profilers.json_decoder.get_structured_col_profiler_class(class_name: str) type[StructuredColProfiler]
+

Use name of class to return default-constructed version of that class.

+
+
Raises ValueError if class_name is not the name of a subclass of StructuredColProfiler.

+
+
+
+
Parameters
+

class_name (str representing name of class) – name of StructuredColProfiler subclass retrieved by +calling type(instance).__name__

+
+
Returns
+

subclass of StructuredColProfiler object

+
+
+
+
+
+dataprofiler.profilers.json_decoder.load_column_profile(serialized_json: dict, config: dict | None = None) BaseColumnProfiler
+

Construct subclass of BaseColumnProfiler given a serialized JSON.

+
+
Expected format of serialized_json (see json_encoder):
+
{
    "class": <str name of class that was serialized>,
    "data": {
        <attr1>: <value1>,
        <attr2>: <value2>,
        ...
    }
}

+
+
+
+
Parameters
+
    +
  • serialized_json (a dict that was created by calling json.loads on +a JSON representation using the custom encoder) – JSON representation of column profiler that was +serialized using the custom encoder in profilers.json_encoder

  • +
  • config (Dict | None) – config for overriding data params when loading from dict

  • +
+
+
Returns
+

subclass of BaseColumnProfiler that has been deserialized from +JSON

+
+
+
+
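As a round-trip sketch with the ProfileEncoder documented below (the FloatColumn and its sample values are just an illustrative choice of column profiler to serialize):

import json
import pandas as pd
from dataprofiler.profilers.float_column_profile import FloatColumn
from dataprofiler.profilers.json_decoder import load_column_profile
from dataprofiler.profilers.json_encoder import ProfileEncoder

profile = FloatColumn("amount")
profile.update(pd.Series(["1.5", "2.5"]))

# serialize with the custom encoder, then rebuild from the decoded dict
serialized = json.dumps(profile, cls=ProfileEncoder)
restored = load_column_profile(json.loads(serialized))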
+
+dataprofiler.profilers.json_decoder.load_compiler(serialized_json: dict, config: dict | None = None) col_pro_compiler.BaseCompiler
+

Construct subclass of BaseCompiler given a serialized JSON.

+
+
Expected format of serialized_json (see json_encoder):
+
{
    "class": <str name of class that was serialized>,
    "data": {
        <attr1>: <value1>,
        <attr2>: <value2>,
        ...
    }
}

+
+
+
+
Parameters
+
    +
  • serialized_json (a dict that was created by calling json.loads on +a JSON representation using the custom encoder) – JSON representation of profile compiler that was +serialized using the custom encoder in profilers.json_encoder

  • +
  • config (Dict | None) – config for overriding data params when loading from dict

  • +
+
+
Returns
+

subclass of BaseCompiler that has been deserialized from +JSON

+
+
+
+
+
+dataprofiler.profilers.json_decoder.load_option(serialized_json: dict, config: dict | None = None) BaseOption
+

Construct subclass of BaseOption given a serialized JSON.

+
+
Expected format of serialized_json (see json_encoder):
+
{
    "class": <str name of class that was serialized>,
    "data": {
        <attr1>: <value1>,
        <attr2>: <value2>,
        ...
    }
}

+
+
+
+
Parameters
+
    +
  • serialized_json (a dict that was created by calling json.loads on +a JSON representation using the custom encoder) – JSON representation of option that was +serialized using the custom encoder in profilers.json_encoder

  • +
  • config (Dict | None) – config for overriding data params when loading from dict

  • +
+
+
Returns
+

subclass of BaseOption that has been deserialized from +JSON

+
+
+
+
+
+dataprofiler.profilers.json_decoder.load_profiler(serialized_json: dict, config=None) BaseProfiler
+

Construct subclass of BaseProfiler given a serialized JSON.

+
+
Expected format of serialized_json (see json_encoder):
+
{
    "class": <str name of class that was serialized>,
    "data": {
        <attr1>: <value1>,
        <attr2>: <value2>,
        ...
    }
}

+
+
+
+
Parameters
+
    +
  • serialized_json (a dict that was created by calling json.loads on +a JSON representation using the custom encoder) – JSON representation of column profiler that was +serialized using the custom encoder in profilers.json_encoder

  • +
  • config (Dict | None) – config for overriding data params when loading from dict

  • +
+
+
Returns
+

subclass of BaseProfiler that has been deserialized from +JSON

+
+
+
+
+
+dataprofiler.profilers.json_decoder.load_structured_col_profiler(serialized_json: dict, config: dict | None = None) StructuredColProfiler
+

Construct subclass of BaseProfiler given a serialized JSON.

+
+
Expected format of serialized_json (see json_encoder):
+
{
    "class": <str name of class that was serialized>,
    "data": {
        <attr1>: <value1>,
        <attr2>: <value2>,
        ...
    }
}

+
+
+
+
Parameters
+
    +
  • serialized_json (a dict that was created by calling json.loads on +a JSON representation using the custom encoder) – JSON representation of column profiler that was +serialized using the custom encoder in profilers.json_encoder

  • +
  • config (Dict | None) – config for overriding data params when loading from dict

  • +
+
+
Returns
+

subclass of BaseCompiler that has been deserialized from +JSON

+
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.json_encoder.html b/docs/0.12.0/html/dataprofiler.profilers.json_encoder.html
new file mode 100644
index 000000000..989e11400
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.json_encoder.html
@@ -0,0 +1,365 @@
+JSON Encoder - Data Profiler v0.12.0

JSON Encoder

+

Contains ProfilerEncoder class.

+
+
+class dataprofiler.profilers.json_encoder.ProfileEncoder(*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)
+

Bases: JSONEncoder

+

JSONify profiler objects and their subclasses and contents.

+

Constructor for JSONEncoder, with sensible defaults.

+

If skipkeys is false, then it is a TypeError to attempt +encoding of keys that are not str, int, float or None. If +skipkeys is True, such items are simply skipped.

+

If ensure_ascii is true, the output is guaranteed to be str +objects with all incoming non-ASCII characters escaped. If +ensure_ascii is false, the output can contain non-ASCII characters.

+

If check_circular is true, then lists, dicts, and custom encoded +objects will be checked for circular references during encoding to +prevent an infinite recursion (which would cause an RecursionError). +Otherwise, no such check takes place.

+

If allow_nan is true, then NaN, Infinity, and -Infinity will be +encoded as such. This behavior is not JSON specification compliant, +but is consistent with most JavaScript based encoders and decoders. +Otherwise, it will be a ValueError to encode such floats.

+

If sort_keys is true, then the output of dictionaries will be +sorted by key; this is useful for regression tests to ensure +that JSON serializations can be compared on a day-to-day basis.

+

If indent is a non-negative integer, then JSON array +elements and object members will be pretty-printed with that +indent level. An indent level of 0 will only insert newlines. +None is the most compact representation.

+

If specified, separators should be an (item_separator, key_separator) tuple. The default is (', ', ': ') if indent is None and (',', ': ') otherwise. To get the most compact JSON representation, you should specify (',', ':') to eliminate whitespace.

+

If specified, default is a function that gets called for objects +that can’t otherwise be serialized. It should return a JSON encodable +version of the object or raise a TypeError.

+
+
+default(to_serialize)
+

Specify how an object should be serialized.

+
+
Parameters
+

to_serialize (a BaseColumnProfile object) – an object to be serialized

+
+
Raises
+

NotImplementedError

+
+
Returns
+

a datatype serializble by json.JSONEncoder

+
+
+
+
+
+encode(o)
+

Return a JSON string representation of a Python data structure.

+
>>> from json.encoder import JSONEncoder
+>>> JSONEncoder().encode({"foo": ["bar", "baz"]})
+'{"foo": ["bar", "baz"]}'
+
+
+
+
+
+item_separator = ', '
+
+
+
+iterencode(o, _one_shot=False)
+

Encode the given object and yield each string +representation as available.

+

For example:

+
for chunk in JSONEncoder().iterencode(bigobject):
+    mysocket.write(chunk)
+
+
+
+
+
+key_separator = ': '
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.numerical_column_stats.html b/docs/0.12.0/html/dataprofiler.profilers.numerical_column_stats.html
new file mode 100644
index 000000000..4196b27ee
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.numerical_column_stats.html
@@ -0,0 +1,537 @@
+Numerical Column Stats - Data Profiler v0.12.0

Numerical Column Stats

+

Build model for dataset by identifying col type along with its respective params.

+
+
+class dataprofiler.profilers.numerical_column_stats.abstractstaticmethod(function: Callable)
+

Bases: staticmethod

+

For making function an abstract method.

+

Initialize abstract static method.

+
+
+
+class dataprofiler.profilers.numerical_column_stats.NumericStatsMixin(options: Optional[NumericalOptions] = None)
+

Bases: BaseColumnProfiler[NumericStatsMixinT]

+

Abstract numerical column profile subclass of BaseColumnProfiler.

+

Represents a column in the dataset which is a text column; has subclasses of itself.

+

Initialize column base properties and itself.

+
+
Parameters
+

options (NumericalOptions) – Options for the numerical stats.

+
+
+
+
+type: str | None = None
+
+
+
+profile() dict
+

Return profile of the column.

+
+
Returns
+

+
+
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Call the profile and remove the disabled columns from profile’s report.

+
+

“Disabled column” is defined as a column that is not present in self.__calculations but is present in self.profile.

+
+
+
Variables
+

remove_disabled_flag – true/false value to tell the code to remove +values missing in __calculations

+
+
Returns
+

Profile object pop’d based on values missing from __calculations

+
+
Return type
+

Profile

+
+
+
+
+
+diff(other_profile: NumericStatsMixinT, options: Optional[dict] = None) dict
+

Find the differences for several numerical stats.

+
+
Parameters
+

other_profile (NumericStatsMixin Profile) – profile to find the difference with

+
+
Returns
+

the numerical stats differences

+
+
Return type
+

dict

+
+
+
+
+
+property mean: float | np.float64
+

Return mean value.

+
+
+
+property mode: list[float]
+

Find an estimate for the mode[s] of the data.

+
+
Returns
+

the mode(s) of the data

+
+
Return type
+

list(float)

+
+
+
+
+
+property median: float
+

Estimate the median of the data.

+
+
Returns
+

the median

+
+
Return type
+

float

+
+
+
+
+
+property variance: float | np.float64
+

Return variance.

+
+
+
+property stddev: float | np.float64
+

Return stddev value.

+
+
+
+property skewness: float | np.float64
+

Return skewness value.

+
+
+
+property kurtosis: float | np.float64
+

Return kurtosis value.

+
+
+
+property median_abs_deviation: float | np.float64
+

Get median absolute deviation estimated from the histogram of the data.

+
+

The procedure: subtract the bin edges from the median value; fold the histogram into positive and negative parts around zero; impose the two sets of bin edges on the two histograms; calculate the counts for the two histograms with the imposed bin edges; superimpose the counts from the two histograms; interpolate the median absolute deviation from the superimposed counts.

+
+
+
Returns
+

median absolute deviation

+
+
+
+
+
+col_type = None
+
+
+
+classmethod load_from_dict(data: dict[str, Any], config: dict | None = None) BaseColumnProfilerT
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

BaseColumnProfiler

+
+
+
+
+
+name: str | None
+
+
+
+sample_size: int
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+thread_safe: bool
+
+
+
+abstract update(df_series: Series) NumericStatsMixin
+

Update the numerical profile properties with an uncleaned dataset.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series with nulls removed

+
+
Returns
+

None

+
+
+
+
+
+static is_float(x: str) bool
+

Return True if x is float.

+

For “0.80” this function returns True +For “1.00” this function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is float or not

+
+
Return type
+

bool

+
+
+
+
+
+static is_int(x: str) bool
+

Return True if x is integer.

+

For “0.80” This function returns False +For “1.00” This function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is integer or not

+
+
Return type
+

bool

+
+
+
+
+
+static np_type_to_type(val: Any) Any
+

Convert numpy variables to base python type variables.

+
+
Parameters
+

val (numpy type or base type) – value to check & change

+
+
Return val
+

base python type

+
+
Rtype val
+

int or float

+
+
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.order_column_profile.html b/docs/0.12.0/html/dataprofiler.profilers.order_column_profile.html
new file mode 100644
index 000000000..d45457129
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.order_column_profile.html
@@ -0,0 +1,403 @@
+Order Column Profile - Data Profiler v0.12.0

Order Column Profile

+

Index profile analysis for individual col within structured profiling.

+
+
+class dataprofiler.profilers.order_column_profile.Comparable(*args, **kwargs)
+

Bases: Protocol

+

Protocol for ensuring comparable types, in this case both floats or strings.

+
+
+
+class dataprofiler.profilers.order_column_profile.OrderColumn(name: str | None, options: OrderOptions = None)
+

Bases: BaseColumnProfiler[OrderColumn]

+

Index column profile subclass of BaseColumnProfiler.

+

Represents a column in the dataset which is an index column.

+

Initialize column base properties and self.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (OrderOptions) – Options for the Order column

  • +
+
+
+
+
+type = 'order'
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Private abstract method for returning report.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled +options should be excluded in the report.

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None)
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – options for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

OrderColumn

+
+
+
+
+
+property profile: dict
+

Property for profile. Returns the profile of the column.

+
+
Returns
+

+
+
+
+
+
+diff(other_profile: OrderColumn, options: Optional[dict] = None) dict
+

Generate the differences between the orders of two OrderColumns.

+
+
Returns
+

Dict containing the differences between the orders in their appropriate output formats

+
+
Return type
+

dict

+
+
+
+update(df_series: Series) OrderColumn
+

Update the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

updated OrderColumn

+
+
Return type
+

OrderColumn

+
+
+
+
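A minimal usage sketch (the series is an illustrative assumption; the reported order depends on the data):

import pandas as pd
from dataprofiler.profilers.order_column_profile import OrderColumn

profile = OrderColumn("id")
profile.update(pd.Series(["1", "2", "3", "5", "8"]))

# the profile reports the detected ordering of the column (e.g. ascending)
print(profile.profile)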
+
+col_type = None
+
+
+
+name: str | None
+
+
+
+sample_size: int
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+thread_safe: bool
+
+
+
\ No newline at end of file
diff --git a/docs/0.12.0/html/dataprofiler.profilers.profile_builder.html b/docs/0.12.0/html/dataprofiler.profilers.profile_builder.html
new file mode 100644
index 000000000..f8c746e64
--- /dev/null
+++ b/docs/0.12.0/html/dataprofiler.profilers.profile_builder.html
@@ -0,0 +1,893 @@
+Profile Builder - Data Profiler v0.12.0

Profile Builder

+

Build model for dataset by identifying col type along with its respective params.

+
+
+class dataprofiler.profilers.profile_builder.StructuredColProfiler(df_series: Optional[Series] = None, sample_size: Optional[int] = None, min_sample_size: int = 5000, sampling_ratio: float = 0.2, min_true_samples: int = 0, sample_ids: Optional[ndarray] = None, pool: Optional[Pool] = None, column_index: Optional[int] = None, options: Optional[StructuredOptions] = None)
+

Bases: object

+

For profiling structured data columns.

+

Instantiate the StructuredColProfiler class for a given column.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – Data to be profiled

  • +
  • sample_size (int) – Number of samples to use in generating profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • sample_ids (list(list)) – Randomized list of sample indices

  • +
  • pool (multiprocessing.Pool) – pool utilized for multiprocessing

  • +
  • column_index (int) – index of the given column

  • +
  • options (StructuredOptions Object) – Options for the structured profiler.

  • +
+
+
+
+
+update_column_profilers(clean_sampled_df: Series, pool: Optional[Pool] = None) None
+

Calculate type statistics and label dataset.

+
+
Parameters
+
    +
  • clean_sampled_df (Pandas.Series) – sampled series with none types dropped

  • +
  • pool (multiprocessing.pool) – pool utilized for multiprocessing

  • +
+
+
+
+
+
+diff(other_profile: StructuredColProfiler, options: Optional[dict] = None) dict
+

Find the difference between 2 StructuredCols and return the report.

+
+
Parameters
+
    +
  • other_profile (StructuredColProfiler) – Structured col finding the difference with this +one.

  • +
  • options (dict) – options to change results of the difference

  • +
+
+
Returns
+

difference of the structured column

+
+
Return type
+

dict

+
+
+
+
+
+report(remove_disabled_flag: bool = False) OrderedDict
+

Return profile.

+
+
+
+classmethod load_from_dict(data, config: dict | None = None) StructuredColProfiler
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading structured column profiler

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

StructuredColProfiler

+
+
+
+
+
+property profile: dict
+

Return a report.

+
+
+
+update_profile(df_series: Series, sample_size: Optional[int] = None, min_true_samples: Optional[int] = None, sample_ids: Optional[ndarray] = None, pool: Optional[Pool] = None) None
+

Update the column profiler.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – Data to be profiled

  • +
  • sample_size (int) – Number of samples to use in generating profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • sample_ids (list(list)) – Randomized list of sample indices

  • +
  • pool (multiprocessing.Pool) – pool utilized for multiprocessing

  • +
+
+
+
+
+
+static clean_data_and_get_base_stats(df_series: pd.Series, sample_size: int, null_values: dict[str, re.RegexFlag | int] = None, min_true_samples: int = None, sample_ids: np.ndarray | list[list[int]] | None = None) tuple[pd.Series, dict]
+

Identify null characters and return them in a dictionary.

+

Remove any nulls in column.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column

  • +
  • sample_size (int) – Number of samples to use in generating the profile

  • +
  • null_values (Dict[str, Union[re.RegexFlag, int]]) – Dictionary mapping null values to regex flag where +the key represents the null value to remove from the data and the +flag represents the regex flag to apply

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • sample_ids (list(list)) – Randomized list of sample indices

  • +
+
+
Returns
+

updated column with null removed and dictionary of null +parameters

+
+
Return type
+

pd.Series, dict

+
+
+
+
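A minimal sketch of profiling a single structured column (the series is an illustrative assumption, and the default structured options are assumed available, including the pre-trained data labeler; nulls are identified and removed by clean_data_and_get_base_stats before the compilers run):

import pandas as pd
from dataprofiler.profilers.profile_builder import StructuredColProfiler

series = pd.Series(["1.5", None, "2.0", "3.5"], name="price")
col_profiler = StructuredColProfiler(series)

# report() returns an OrderedDict of the column's compiled statistics
print(col_profiler.report())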
+
+
+class dataprofiler.profilers.profile_builder.BaseProfiler(data: Data | None, samples_per_update: int = None, min_true_samples: int = 0, options: BaseOption = None)
+

Bases: object

+

Abstract class for profiling data.

+

Instantiate the BaseProfiler class.

+
+
Parameters
+
    +
  • data (Data class object) – Data to be profiled

  • +
  • samples_per_update (int) – Number of samples to use in generating +profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • options (ProfilerOptions Object) – Options for the profiler.

  • +
+
+
Returns
+

Profiler

+
+
+
+
+diff(other_profile: BaseProfiler, options: Optional[dict] = None) dict
+

Find the difference of two profiles.

+
+
Parameters
+

other_profile (BaseProfiler) – profile being added to this one.

+
+
Returns
+

diff of the two profiles

+
+
Return type
+

dict

+
+
+
+
+
+property profile: BaseCompiler | list[StructuredColProfiler]
+

Return the stored profiles for the given profiler.

+
+
Returns
+

BaseCompiler | list[StructuredColProfiler]

+
+
+
+
+
+report(report_options: Optional[dict] = None) dict
+

Return profile report based on all profiled data fed into the profiler.

+
+
User can specify the output_formats: (pretty, compact, serializable, flat).
+
Pretty: floats are rounded to four decimal places, and lists are

shortened.

+
+
Compact: Similar to pretty, but removes detailed statistics such as

runtimes, label probabilities, index locations of null types, +etc.

+
+
+

Serializable: Output is json serializable and not prettified.
Flat: Nested output is returned as a flattened dictionary.

+
+
+
+
Variables
+

report_options – optional format changes to the report +dict(output_format=<FORMAT>)

+
+
Returns
+

dictionary report

+
+
Return type
+

dict

+
+
+
+
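A usage sketch of the report options (this assumes the package's top-level dp.Data and dp.Profiler entry points and a local data.csv file, which are not part of this page):

import dataprofiler as dp

data = dp.Data("data.csv")   # illustrative input file
profiler = dp.Profiler(data)

# request the compact output format described above
report = profiler.report(report_options=dict(output_format="compact"))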
+
+classmethod load_from_dict(data, config: dict | None = None) BaseProfilerT
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for overriding data params when loading from dict

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

BaseProfiler

+
+
+
+
+
+update_profile(data: data_readers.base_data.BaseData | pd.DataFrame | pd.Series, sample_size: int = None, min_true_samples: int = None) None
+

Update the profile for data provided.

+

User can specify the sample size to profile the data with. +Additionally, the user can specify the +minimum number of non-null samples to profile.

+
+
Parameters
+
    +
  • data (Union[data_readers.base_data.BaseData, pandas.DataFrame, +pandas.Series]) – data to be profiled

  • +
  • sample_size (int) – number of samples to profile from the data

  • +
  • min_true_samples (int) – minimum number of non-null samples to profile

  • +
+
+
Returns
+

None

+
+
+
+
+
+save(filepath: Optional[str] = None, save_method: str = 'pickle') None
+

Save profiler to disk.

+
+
Parameters
+
    +
  • filepath (String) – Path of file to save to

  • +
  • save_method (String) – The desired saving method (must be “pickle” or “json”)

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod load(filepath: str, load_method: str | None = None) BaseProfiler
+

Load profiler from disk.

+
+
Parameters
+
    +
  • filepath (String) – Path of file to load from

  • +
  • load_method (Optional[String]) – The desired loading method, default = None

  • +
+
+
Returns
+

Profiler being loaded, StructuredProfiler or +UnstructuredProfiler

+
+
Return type
+

BaseProfiler

+
+
+
+
+
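For example, a profile can be persisted and restored with save and load; the sketch below assumes the default pickle serialization and an illustrative file path:

    import dataprofiler as dp

    profile = dp.Profiler(dp.Data("data.csv"))

    # Persist to disk ("pickle" is the default; "json" is the other option)
    profile.save(filepath="my_profile.pkl")

    # Restore later; the appropriate profiler subclass is reconstructed
    loaded_profile = dp.Profiler.load("my_profile.pkl")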
+
+class dataprofiler.profilers.profile_builder.UnstructuredProfiler(data: Data, samples_per_update: Optional[int] = None, min_true_samples: int = 0, options: Optional[BaseOption] = None)
+

Bases: BaseProfiler

+

For profiling unstructured data.

+

Instantiate the UnstructuredProfiler class.

+
+
Parameters
+
    +
  • data (Data class object) – Data to be profiled

  • +
  • samples_per_update (int) – Number of samples to use in generating +profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • options (ProfilerOptions Object) – Options for the profiler.

  • +
+
+
Returns
+

UnstructuredProfiler

+
+
+
+
+diff(other_profile: UnstructuredProfiler, options: dict | None = None) dict
+

Find the difference between two unstructured profiles and return the report.

+
+
Parameters
+
    +
  • other_profile (UnstructuredProfiler) – profile finding the difference with this one.

  • +
  • options (dict) – options to impact the results of the diff

  • +
+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+property profile: BaseCompiler
+

Return the stored profiles for the given profiler.

+
+
Returns
+

BaseCompiler

+
+
+
+
+
+report(report_options: Optional[dict] = None) dict
+

Return unstructured report based on all profiled data fed into profiler.

+
+
User can specify the output_formats: (pretty, compact, serializable, flat).
+
Pretty: floats are rounded to four decimal places, and lists are shortened.

+
+
Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.

+
+
+

Serializable: Output is json serializable and not prettified.

Flat: Nested output is returned as a flattened dictionary.

+
+
+
+
Variables
+

report_options – optional format changes to the report +dict(output_format=<FORMAT>)

+
+
Returns
+

dictionary report

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None)
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading profiler params from dictionary

  • +
+
+
Raises
+

NotImplementedError

+
+
+
+
+
+save(filepath: Optional[str] = None, save_method: str = 'pickle') None
+

Save profiler to disk.

+
+
Parameters
+
    +
  • filepath (String) – Path of file to save to

  • +
  • save_method (String) – The desired saving method (“pickle” | “json”)

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod load(filepath: str, load_method: str | None = None) BaseProfiler
+

Load profiler from disk.

+
+
Parameters
+
    +
  • filepath (String) – Path of file to load from

  • +
  • load_method (Optional[String]) – The desired loading method, default = None

  • +
+
+
Returns
+

Profiler being loaded, StructuredProfiler or +UnstructuredProfiler

+
+
Return type
+

BaseProfiler

+
+
+
+
+
+update_profile(data: data_readers.base_data.BaseData | pd.DataFrame | pd.Series, sample_size: int = None, min_true_samples: int = None) None
+

Update the profile for data provided.

+

User can specify the sample size to profile the data with. +Additionally, the user can specify the +minimum number of non-null samples to profile.

+
+
Parameters
+
    +
  • data (Union[data_readers.base_data.BaseData, pandas.DataFrame, +pandas.Series]) – data to be profiled

  • +
  • sample_size (int) – number of samples to profile from the data

  • +
  • min_true_samples (int) – minimum number of non-null samples to profile

  • +
+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.profilers.profile_builder.StructuredProfiler(data: Data, samples_per_update: Optional[int] = None, min_true_samples: int = 0, options: Optional[BaseOption] = None)
+

Bases: BaseProfiler

+

For profiling structured data.

+

Instantiate the StructuredProfiler class.

+
+
Parameters
+
    +
  • data (Data class object) – Data to be profiled

  • +
  • samples_per_update (int) – Number of samples to use in generating +profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • options (ProfilerOptions Object) – Options for the profiler.

  • +
+
+
Returns
+

StructuredProfiler

+
+
+
+
+diff(other_profile: StructuredProfiler, options: dict | None = None) dict
+

Find the difference between 2 Profiles and return the report.

+
+
Parameters
+
    +
  • other_profile (StructuredProfiler) – profile finding the difference with this one

  • +
  • options (dict) – options to change results of the difference

  • +
+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+property profile: list[dataprofiler.profilers.profile_builder.StructuredColProfiler]
+

Return the stored profiles for the given profiler.

+
+
Returns
+

list[StructuredColProfiler]

+
+
+
+
+
+report(report_options: Optional[dict] = None) dict
+

Return a report.

+
+
+
+classmethod load_from_dict(data, config: dict | None = None) StructuredProfiler
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

StructuredProfiler

+
+
+
+
+
+save(filepath: Optional[str] = None, save_method: str = 'pickle') None
+

Save profiler to disk.

+
+
Parameters
+
    +
  • filepath (String) – Path of file to save to

  • +
  • save_method (String) – The desired saving method (must be “pickle” or “json”)

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod load(filepath: str, load_method: str | None = None) BaseProfiler
+

Load profiler from disk.

+
+
Parameters
+
    +
  • filepath (String) – Path of file to load from

  • +
  • load_method (Optional[String]) – The desired loading method, default = None

  • +
+
+
Returns
+

Profiler being loaded, StructuredProfiler or +UnstructuredProfiler

+
+
Return type
+

BaseProfiler

+
+
+
+
+
+update_profile(data: data_readers.base_data.BaseData | pd.DataFrame | pd.Series, sample_size: int = None, min_true_samples: int = None) None
+

Update the profile for data provided.

+

User can specify the sample size to profile the data with. +Additionally, the user can specify the +minimum number of non-null samples to profile.

+
+
Parameters
+
    +
  • data (Union[data_readers.base_data.BaseData, pandas.DataFrame, +pandas.Series]) – data to be profiled

  • +
  • sample_size (int) – number of samples to profile from the data

  • +
  • min_true_samples (int) – minimum number of non-null samples to profile

  • +
+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.profilers.profile_builder.Profiler(data: Data, samples_per_update: int = None, min_true_samples: int = 0, options: ProfilerOptions = None, profiler_type: str = None)
+

Bases: object

+

For profiling data.

+

Instantiate Structured and Unstructured Profilers.

+

This is a factory class.

+
+
Parameters
+
    +
  • data (Data class object) – Data to be profiled, type allowed depends on the +profiler_type

  • +
  • samples_per_update (int) – Number of samples to use to generate profile

  • +
  • min_true_samples (int) – Min number of samples required for the profiler

  • +
  • options (ProfilerOptions Object) – Options for the profiler.

  • +
  • profiler_type (str) – Type of Profiler (“graph”/”structured”/”unstructured”)

  • +
+
+
Returns
+

Union[GraphProfiler, StructuredProfiler, UnstructuredProfiler]

+
+
+
+
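As a short sketch, the factory picks the appropriate profiler from profiler_type (or from the data reader when the type is omitted); file names here are placeholders:

    import dataprofiler as dp

    # Structured data (e.g. CSV) -> StructuredProfiler
    structured = dp.Profiler(dp.Data("records.csv"), profiler_type="structured")

    # Free text -> UnstructuredProfiler
    unstructured = dp.Profiler(dp.Data("notes.txt"), profiler_type="unstructured")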
+classmethod load(filepath: str, load_method: str | None = None) BaseProfiler
+

Load profiler from disk.

+
+
Parameters
+
    +
  • filepath (String) – Path of file to load from

  • +
  • load_method (Optional[String]) – The desired loading method, default = None

  • +
+
+
Returns
+

Profiler being loaded, StructuredProfiler or +UnstructuredProfiler

+
+
Return type
+

BaseProfiler

+
+
+
+
+
diff --git a/docs/0.12.0/html/dataprofiler.profilers.profiler_options.html b/docs/0.12.0/html/dataprofiler.profilers.profiler_options.html new file mode 100644 index 000000000..4be368809

Profiler Options

+

Specify the options when running the data profiler.

+
+
+class dataprofiler.profilers.profiler_options.BaseOption
+

Bases: Generic[BaseOptionT]

+

For configuring options.

+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
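For instance, a minimal sketch of set() on a concrete option object, assuming nested properties can be addressed with dotted keys as in the library's examples:

    from dataprofiler.profilers.profiler_options import IntOptions

    int_options = IntOptions()

    # Disable a couple of nested boolean properties in one call
    int_options.set({"min.is_enabled": False, "max.is_enabled": False})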
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.BooleanOption(is_enabled: bool = True)
+

Bases: BaseOption[BooleanOptionT]

+

For setting Boolean options.

+

Initialize Boolean option.

+
+
Variables
+

is_enabled (bool) – boolean option to enable/disable the option.

+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.HistogramAndQuantilesOption(is_enabled: bool = True, bin_count_or_method: str | int | list[str] = 'auto', num_quantiles: int = 1000)
+

Bases: BooleanOption[HistogramAndQuantilesOption]

+

For setting histogram options.

+

Initialize Options for histograms.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the option.

  • +
  • bin_count_or_method (Union[str, int, list(str)]) – bin count or the method with which to +calculate histograms

  • +
  • num_quantiles (int) – number of quantiles

  • +
+
+
+
+
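A minimal sketch of adjusting these options through ProfilerOptions, assuming the histogram options for a numeric column type are reachable under structured_options as suggested by the option hierarchy on this page:

    from dataprofiler import ProfilerOptions

    options = ProfilerOptions()

    # Use a fixed bin count and fewer quantiles for int columns
    options.structured_options.int.histogram_and_quantiles.bin_count_or_method = 20
    options.structured_options.int.histogram_and_quantiles.num_quantiles = 100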
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.ModeOption(is_enabled: bool = True, max_k_modes: int = 5)
+

Bases: BooleanOption[ModeOption]

+

For setting mode estimation options.

+

Initialize Options for mode estimation.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the option.

  • +
  • max_k_modes (int) – the max number of modes to return, if applicable

  • +
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.BaseInspectorOptions(is_enabled: bool = True)
+

Bases: BooleanOption[BaseInspectorOptionsT]

+

For setting Base options.

+

Initialize Base options for all the columns.

+
+
Variables
+

is_enabled (bool) – boolean option to enable/disable the column.

+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.NumericalOptions
+

Bases: BaseInspectorOptions[NumericalOptionsT]

+

For configuring options for the Numerical Stats Mixin.

+

Initialize Options for the Numerical Stats Mixin.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • min (BooleanOption) – boolean option to enable/disable min

  • +
  • max (BooleanOption) – boolean option to enable/disable max

  • +
  • mode (ModeOption) – option to enable/disable mode and set return count

  • +
  • median (BooleanOption) – option to enable/disable median

  • +
  • sum (BooleanOption) – boolean option to enable/disable sum

  • +
  • variance (BooleanOption) – boolean option to enable/disable variance

  • +
  • skewness (BooleanOption) – boolean option to enable/disable skewness

  • +
  • kurtosis (BooleanOption) – boolean option to enable/disable kurtosis

  • +
  • histogram_and_quantiles (BooleanOption) – boolean option to enable/disable +histogram_and_quantiles

  • +
+
+
+

  • bias_correction (BooleanOption) – boolean option to enable/disable existence of bias

  • num_zeros (BooleanOption) – boolean option to enable/disable num_zeros

  • num_negatives (BooleanOption) – boolean option to enable/disable num_negatives

  • is_numeric_stats_enabled (bool) – boolean to enable/disable all numeric stats

+
+
+
+
+
+property is_numeric_stats_enabled: bool
+

Return the state of numeric stats being enabled / disabled.

+

If any numeric stats property is enabled it will return True, +otherwise it will return False.

+
+
Returns
+

true if any numeric stats property is enabled, otherwise false

+
+
Return type

bool
+

+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Include is_enabled.

+

is_enabled: Turns on or off the column.

+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.IntOptions
+

Bases: NumericalOptions[IntOptions]

+

For configuring options for Int Column.

+

Initialize Options for the Int Column.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • min (BooleanOption) – boolean option to enable/disable min

  • +
  • max (BooleanOption) – boolean option to enable/disable max

  • +
  • mode (ModeOption) – option to enable/disable mode and set return count

  • +
  • median (BooleanOption) – option to enable/disable median

  • +
  • sum (BooleanOption) – boolean option to enable/disable sum

  • +
  • variance (BooleanOption) – boolean option to enable/disable variance

  • +
  • skewness (BooleanOption) – boolean option to enable/disable skewness

  • +
  • kurtosis (BooleanOption) – boolean option to enable/disable kurtosis

  • +
  • histogram_and_quantiles (BooleanOption) – boolean option to enable/disable +histogram_and_quantiles

  • +
+
+
+

  • bias_correction (BooleanOption) – boolean option to enable/disable existence of bias

  • num_zeros (BooleanOption) – boolean option to enable/disable num_zeros

  • num_negatives (BooleanOption) – boolean option to enable/disable num_negatives

  • is_numeric_stats_enabled (bool) – boolean to enable/disable all numeric stats

+
+
+
+
+
+property is_numeric_stats_enabled: bool
+

Return the state of numeric stats being enabled / disabled.

+

If any numeric stats property is enabled it will return True, +otherwise it will return False.

+
+
Returns
+

true if any numeric stats property is enabled, otherwise false

+
+
Return type

bool
+

+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Include is_enabled.

+

is_enabled: Turns on or off the column.

+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.PrecisionOptions(is_enabled: bool = True, sample_ratio: Optional[float] = None)
+

Bases: BooleanOption[PrecisionOptions]

+

For configuring options for precision.

+

Initialize Options for precision.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • sample_ratio (float) – float option to determine the ratio of valid float samples used in determining precision. This ratio will override any defaults.

  • +
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.FloatOptions
+

Bases: NumericalOptions[FloatOptions]

+

For configuring options for Float Column.

+

Initialize Options for the Float Column.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • min (BooleanOption) – boolean option to enable/disable min

  • +
  • max (BooleanOption) – boolean option to enable/disable max

  • +
  • mode (ModeOption) – option to enable/disable mode and set return count

  • +
  • median (BooleanOption) – option to enable/disable median

  • +
  • sum (BooleanOption) – boolean option to enable/disable sum

  • +
  • variance (BooleanOption) – boolean option to enable/disable variance

  • +
  • skewness (BooleanOption) – boolean option to enable/disable skewness

  • +
  • kurtosis (BooleanOption) – boolean option to enable/disable kurtosis

  • +
  • histogram_and_quantiles (BooleanOption) – boolean option to enable/disable +histogram_and_quantiles

  • +
+
+
+

  • bias_correction (BooleanOption) – boolean option to enable/disable existence of bias

  • num_zeros (BooleanOption) – boolean option to enable/disable num_zeros

  • num_negatives (BooleanOption) – boolean option to enable/disable num_negatives

  • is_numeric_stats_enabled (bool) – boolean to enable/disable all numeric stats

+
+
+
+
+
+property is_numeric_stats_enabled: bool
+

Return the state of numeric stats being enabled / disabled.

+

If any numeric stats property is enabled it will return True, +otherwise it will return False.

+
+
Returns
+

true if any numeric stats property is enabled, otherwise false

+
+
Return type

bool
+

+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Include is_enabled.

+

is_enabled: Turns on or off the column.

+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.TextOptions
+

Bases: NumericalOptions[TextOptions]

+

For configuring options for Text Column.

+

Initialize Options for the Text Column.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • vocab (BooleanOption) – boolean option to enable/disable vocab

  • +
  • min (BooleanOption) – boolean option to enable/disable min

  • +
  • max (BooleanOption) – boolean option to enable/disable max

  • +
  • mode (ModeOption) – option to enable/disable mode and set return count

  • +
  • median (BooleanOption) – option to enable/disable median

  • +
  • sum (BooleanOption) – boolean option to enable/disable sum

  • +
  • variance (BooleanOption) – boolean option to enable/disable variance

  • +
  • skewness (BooleanOption) – boolean option to enable/disable skewness

  • +
  • kurtosis (BooleanOption) – boolean option to enable/disable kurtosis

  • +
+
+
+

  • bias_correction (BooleanOption) – boolean option to enable/disable existence of bias

  • histogram_and_quantiles (BooleanOption) – boolean option to enable/disable histogram_and_quantiles

  • num_zeros (BooleanOption) – boolean option to enable/disable num_zeros

  • +
  • num_negatives (BooleanOption) – boolean option to enable/disable num_negatives

  • +
  • is_numeric_stats_enabled (bool) – boolean to enable/disable all numeric +stats

  • +
+
+
+
+
+property is_numeric_stats_enabled: bool
+

Return the state of numeric stats being enabled / disabled.

+

If any numeric stats property is enabled it will return True, otherwise +it will return False. Although it seems redundant, this method is needed +in order for the function below, the setter function +also called is_numeric_stats_enabled, to properly work.

+
+
Returns
+

true if any numeric stats property is enabled, otherwise false

+
+
Return type

bool
+

+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Include is_enabled.

+

is_enabled: Turns on or off the column.

+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.DateTimeOptions
+

Bases: BaseInspectorOptions[DateTimeOptions]

+

For configuring options for Datetime Column.

+

Initialize Options for the Datetime Column.

+
+
Variables
+

is_enabled (bool) – boolean option to enable/disable the column.

+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.OrderOptions
+

Bases: BaseInspectorOptions[OrderOptions]

+

For configuring options for Order Column.

+

Initialize options for the Order Column.

+
+
Variables
+

is_enabled (bool) – boolean option to enable/disable the column.

+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.CategoricalOptions(is_enabled: bool = True, top_k_categories: int | None = None, max_sample_size_to_check_stop_condition: int | None = None, stop_condition_unique_value_ratio: float | None = None, cms: bool = False, cms_confidence: float | None = 0.95, cms_relative_error: float | None = 0.01, cms_max_num_heavy_hitters: int | None = 5000)
+

Bases: BaseInspectorOptions[CategoricalOptions]

+

For configuring options for the Categorical Column.

+

Initialize options for the Categorical Column.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • top_k_categories ([None, int]) – number of categories to be displayed when called

  • +
  • max_sample_size_to_check_stop_condition ([None, int]) – The maximum sample size +before categorical stop conditions are checked

  • +
  • stop_condition_unique_value_ratio ([None, float]) – The highest ratio of unique +values to dataset size that is to be considered a categorical type

  • +
  • cms (bool) – boolean option for using count min sketch

  • +
  • cms_confidence ([None, float]) – defines the number of hashes used in CMS. +eg. confidence = 1 - failure probability, default 0.95

  • +
  • cms_relative_error ([None, float]) – defines the number of buckets used in CMS, +default 0.01

  • +
  • cms_max_num_heavy_hitters ([None, int]) – value used to define the threshold for minimum frequency required by a category to be counted

+
+
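For example, a sketch of enabling the count-min-sketch path for categorical columns via ProfilerOptions (the parameter values are illustrative):

    from dataprofiler import ProfilerOptions

    options = ProfilerOptions()

    cat_opts = options.structured_options.category
    cat_opts.cms = True                        # use count min sketch
    cat_opts.cms_confidence = 0.95             # 1 - failure probability
    cat_opts.cms_relative_error = 0.01         # controls the number of buckets
    cat_opts.cms_max_num_heavy_hitters = 1000  # minimum-frequency threshold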
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.CorrelationOptions(is_enabled: bool = False, columns: Optional[list[str]] = None)
+

Bases: BaseInspectorOptions[CorrelationOptions]

+

For configuring options for Correlation between Columns.

+

Initialize options for the Correlation between Columns.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable.

  • +
  • columns (list()) – Columns considered to calculate correlation

  • +
+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.HyperLogLogOptions(seed: int = 0, register_count: int = 15)
+

Bases: BaseOption[HyperLogLogOptions]

+

Options for an alternative method of gathering the unique row count.

+

Initialize options for the hyperloglog method of gathering unique row count.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable.

  • +
  • seed (int) – seed used to set HLL hashing function

  • +
  • register_count (int) – number of registers is equal to 2^register_count

  • +
+
+
+
+
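A minimal sketch of switching unique-row counting to HyperLogLog; the attribute path below (row_statistics.unique_count.hll) follows the option hierarchy described on this page and is an assumption:

    from dataprofiler import ProfilerOptions

    options = ProfilerOptions()

    unique = options.structured_options.row_statistics.unique_count
    unique.hashing_method = "hll"   # "full" (exact) or "hll" (approximate)
    unique.hll.seed = 0             # seed for the HLL hashing function
    unique.hll.register_count = 15  # number of registers = 2 ** register_count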
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.UniqueCountOptions(is_enabled: bool = True, hashing_method: str = 'full')
+

Bases: BooleanOption[UniqueCountOptions]

+

For configuring options for unique row count.

+

Initialize options for unique row counts.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable.

  • +
  • hashing_method (str) – property to specify row hashing method (“full” | “hll”)

  • +
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.RowStatisticsOptions(is_enabled: bool = True, unique_count: bool = True, null_count: bool = True)
+

Bases: BooleanOption[RowStatisticsOptions]

+

For configuring options for row statistics.

+

Initialize options for row statistics.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable.

  • +
  • unique_count (bool) – boolean option to enable/disable unique_count

  • +
+
+
+

  • null_count (bool) – boolean option to enable/disable null_count

+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.DataLabelerOptions
+

Bases: BaseInspectorOptions[DataLabelerOptions]

+

For configuring options for Data Labeler Column.

+

Initialize options for the Data Labeler Column.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • data_labeler_dirpath (str) – String to load data labeler

  • +
  • max_sample_size (int) – Int to decide sample size

  • data_labeler_object (BaseDataLabeler) – DataLabeler object used in profiler

  • +
+
+
+
+
+property properties: dict
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) DataLabelerOptions
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

DataLabelerOptions

+
+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.TextProfilerOptions(is_enabled: bool = True, is_case_sensitive: bool = True, stop_words: Optional[set[str]] = None, top_k_chars: Optional[int] = None, top_k_words: Optional[int] = None)
+

Bases: BaseInspectorOptions[TextProfilerOptions]

+

For configuring options for text profiler.

+

Construct the TextProfilerOption object with default values.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the option.

  • +
  • is_case_sensitive (bool) – option set for case sensitivity.

  • +
  • stop_words (Union[None, list(str)]) – option set for stop words.

  • +
  • top_k_chars (Union[None, int]) – option set for number of top common characters.

  • +
  • top_k_words (Union[None, int]) – option set for number of top common words.

  • +
  • words (BooleanOption) – option set for word update.

  • +
  • vocab (BooleanOption) – option set for vocab update.

  • +
+
+
+
+
+is_prop_enabled(prop: str) bool
+

Check to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.StructuredOptions(null_values: dict[str, re.RegexFlag | int] = None, column_null_values: dict[int, dict[str, re.RegexFlag | int]] = None, sampling_ratio: float = 0.2)
+

Bases: BaseOption[StructuredOptions]

+

For configuring options for structured profiler.

+

Construct the StructuredOptions object with default values.

+
+
Parameters
+
    +
  • null_values – null values to recognize for all columns (each null string mapped to a re flag, or 0 for an exact match).

  • column_null_values – null values to recognize for specific columns, keyed by column index.

  • +
+
+
Variables
+
    +
  • int (IntOptions) – option set for int profiling.

  • +
  • float (FloatOptions) – option set for float profiling.

  • +
  • datetime (DateTimeOptions) – option set for datetime profiling.

  • +
  • text (TextOptions) – option set for text profiling.

  • +
  • order (OrderOptions) – option set for order profiling.

  • +
  • category (CategoricalOptions) – option set for category profiling.

  • +
  • data_labeler (DataLabelerOptions) – option set for data_labeler profiling.

  • +
  • correlation (CorrelationOptions) – option set for correlation profiling.

  • +
  • chi2_homogeneity (BooleanOption()) – option set for chi2_homogeneity matrix

  • +
  • row_statistics (BooleanOption()) – option set for row statistics calculations

  • +
  • null_replication_metrics (BooleanOptions) – option set for metrics +calculation for replicating nan vals

  • +
  • null_values (Union[None, dict]) – option set for defined null values

  • +
  • sampling_ratio (Union[None, float]) – What ratio of the input data to sample. +Float value > 0 and <= 1

  • +
+
+
+
+
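For instance, a sketch of supplying custom null values, assuming the dict maps each null string to a re flag (or 0 for an exact match) as the constructor signature suggests:

    import re
    from dataprofiler import ProfilerOptions

    options = ProfilerOptions()

    # Treat empty strings and any-cased "n/a" as null for every column
    options.structured_options.null_values = {"": 0, "n/a": re.IGNORECASE}

    # Additionally treat "-1" as null only in column index 3
    options.structured_options.column_null_values = {3: {"-1": 0}}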
+property enabled_profiles: list[str]
+

Return a list of the enabled profilers for columns.

+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.UnstructuredOptions
+

Bases: BaseOption[UnstructuredOptions]

+

For configuring options for unstructured profiler.

+

Construct the UnstructuredOptions object with default values.

+
+
Variables
+
+
+
+
+
+property enabled_profiles: list[str]
+

Return a list of the enabled profilers.

+
+
+
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options: dict[str, bool]) None
+

Set all the options.

+

Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.ProfilerOptions(presets: Optional[str] = None)
+

Bases: BaseOption[ProfilerOptions]

+

For configuring options for profiler.

+

Initialize the ProfilerOptions object.

+
+
Variables
+
    +
  • structured_options (StructuredOptions) – option set for structured dataset profiling.

  • +
  • unstructured_options (UnstructuredOptions) – option set for unstructured dataset profiling.

  • +
  • presets (Optional[str]) – A pre-configured mapping of a string name to group of options: +“complete”, “data_types”, “numeric_stats_disabled”, +and “lower_memory_sketching”. Default: None

  • +
+
+
+
+
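For example, a preset can be chosen at construction time (a minimal sketch with illustrative data):

    import pandas as pd
    from dataprofiler import Profiler, ProfilerOptions

    # "numeric_stats_disabled" keeps labels and types but skips numeric statistics
    options = ProfilerOptions(presets="numeric_stats_disabled")

    data = pd.DataFrame({"a": [1, 2, 3]})
    profile = Profiler(data, options=options)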
+classmethod load_from_dict(data, config: dict | None = None) BaseOption
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config to override loading options params from dictionary

  • +
+
+
Returns
+

Options with attributes populated.

+
+
Return type
+

BaseOption

+
+
+
+
+
+property properties: dict[str, dataprofiler.profilers.profiler_options.BooleanOption]
+

Return a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+validate(raise_error: bool = True) list[str] | None
+

Validate the options do not conflict and cause errors.

+

Raises error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+set(options: dict[str, Any]) None
+

Overwrite BaseOption.set.

+

We do this because the type (unstructured/structured) may +need to be specified if the same options exist within both +self.structured_options and self.unstructured_options

+
+
Parameters
+

options (dict) – Dictionary of options to set

+
+
Return
+

None

+
+
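As a brief usage sketch (not part of the generated reference): the preset name below comes from the constructor documentation above, while the dotted structured_options prefix passed to set is an assumption based on the note that the option set may need to be specified when the same option exists under both structured and unstructured options.

import dataprofiler as dp

# Build options from one of the documented preset names.
options = dp.ProfilerOptions(presets="numeric_stats_disabled")

# Disambiguate an option that exists under both option sets by prefixing
# the owning set (assumed dotted-key convention, per the set() note above).
options.set({"structured_options.data_labeler.is_enabled": False})

# profile = dp.Profiler(data, options=options)  # hypothetical: data is a loaded dp.Data object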
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.profilers.profiler_utils.html b/docs/0.12.0/html/dataprofiler.profilers.profiler_utils.html new file mode 100644 index 000000000..b5f837e35 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.profilers.profiler_utils.html @@ -0,0 +1,826 @@ + + + + + + + + + Profiler Utils - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Profiler Utils

+

Contains functions for profilers.

+
+
+dataprofiler.profilers.profiler_utils.recursive_dict_update(d: dict, update_d: dict) dict
+

Recursively update nested dictionaries, updating d with update_d.

+
+
Parameters
+
    +
  • d – dict which gets updated with update_d

  • +
  • update_d – dict to update d with

  • +
+
+
Returns
+

updated dict

+
+
+
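A minimal sketch of the behavior described above; the merged result shown in the comment is the expected outcome of a recursive update, not output captured from the library.

from dataprofiler.profilers import profiler_utils

base = {"a": {"x": 1, "y": 2}, "b": 3}
update = {"a": {"y": 20, "z": 30}}

merged = profiler_utils.recursive_dict_update(base, update)
# merged == {"a": {"x": 1, "y": 20, "z": 30}, "b": 3} -- nested keys are merged, not replaced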
+
+
+class dataprofiler.profilers.profiler_utils.KeyDict
+

Bases: defaultdict

+

Helper class for sample_in_chunks.

+

Allows keys that are missing to become the values for that key. +From: +https://www.drmaciver.com/2018/01/lazy-fisher-yates-shuffling-for-precise-rejection-sampling/

+
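A tiny sketch of the behavior described above (a missing key becomes its own value); the return values in the comments are the expected results, assuming the class is used directly.

from dataprofiler.profilers.profiler_utils import KeyDict

kd = KeyDict()
kd[7]       # 7 -- the missing key is returned as its value
kd["abc"]   # "abc"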
+
+clear() None.  Remove all items from D.
+
+
+
+copy() a shallow copy of D.
+
+
+
+default_factory
+

Factory for default value called by __missing__().

+
+
+
+fromkeys(value=None, /)
+

Create a new dictionary with keys from iterable and values set to value.

+
+
+
+get(key, default=None, /)
+

Return the value for key if key is in the dictionary, else default.

+
+
+
+items() a set-like object providing a view on D's items
+
+
+
+keys() a set-like object providing a view on D's keys
+
+
+
+pop(k[, d]) v, remove specified key and return the corresponding value.
+

If key is not found, default is returned if given, otherwise KeyError is raised

+
+
+
+popitem()
+

Remove and return a (key, value) pair as a 2-tuple.

+

Pairs are returned in LIFO (last-in, first-out) order. +Raises KeyError if the dict is empty.

+
+
+
+setdefault(key, default=None, /)
+

Insert key with a value of default if key is not in the dictionary.

+

Return the value for key if key is in the dictionary, else default.

+
+
+
+update([E, ]**F) None.  Update D from dict/iterable E and F.
+

If E is present and has a .keys() method, then does: for k in E: D[k] = E[k] +If E is present and lacks a .keys() method, then does: for k, v in E: D[k] = v +In either case, this is followed by: for k in F: D[k] = F[k]

+
+
+
+values() an object providing a view on D's values
+
+
+
+
+dataprofiler.profilers.profiler_utils.shuffle_in_chunks(data_length: int, chunk_size: int) Generator[list[int], None, Any]
+

Create shuffled indexes in chunks.

+

This reduces the cost of having to create all indexes at once by only creating what is needed. Initial code idea from: https://www.drmaciver.com/2018/01/lazy-fisher-yates-shuffling-for-precise-rejection-sampling/

+
+
Parameters
+
    +
  • data_length – length of data to be shuffled

  • +
  • chunk_size – size of shuffled chunks

  • +
+
+
Returns
+

list of shuffled indices of chunk size

+
+
+
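A short sketch of generating shuffled index chunks; the chunks shown in the comment are illustrative only, since the ordering is random.

from dataprofiler.profilers import profiler_utils

# Lazily produce shuffled indexes for 10 items, 4 at a time.
for index_chunk in profiler_utils.shuffle_in_chunks(data_length=10, chunk_size=4):
    print(index_chunk)   # e.g. [7, 2, 9, 0], then [4, 1, 8, 3], then [6, 5]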
+
+
+dataprofiler.profilers.profiler_utils.warn_on_profile(col_profile: str, e: Exception) None
+

Return a warning if a given profile errors (tensorflow typically).

+
+
Parameters
+
    +
  • col_profile (str) – Name of the column profile

  • +
  • e (Exception) – Error message from profiler error

  • +
+
+
+
+
+
+dataprofiler.profilers.profiler_utils.partition(data: list, chunk_size: int) Generator[list, None, Any]
+

Create a generator which returns data in specified chunk size.

+
+
Parameters
+
    +
  • data (list, dataframe, etc) – the data to split into chunks

  • +
  • chunk_size (int) – size of partition to return

  • +
+
+
+
+
+
+dataprofiler.profilers.profiler_utils.auto_multiprocess_toggle(data: DataFrame, num_rows_threshold: int = 750000, num_cols_threshold: int = 20) bool
+

Automate multiprocessing toggle depending on dataset sizes.

+
+
Parameters
+
    +
  • data (pandas.DataFrame) – a dataset

  • +
  • num_rows_threshold (int) – threshold for number of rows to +use multiprocess

  • +
  • num_cols_threshold (int) – threshold for number of columns +to use multiprocess

  • +
+
+
Returns
+

recommended option.multiprocess.is_enabled value

+
+
Return type
+

bool

+
+
+
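A sketch of applying the recommendation; the structured_options.multiprocess.is_enabled attribute is assumed from the return description above.

import pandas as pd
import dataprofiler as dp
from dataprofiler.profilers import profiler_utils

data = pd.DataFrame({f"col_{i}": range(1000) for i in range(25)})

options = dp.ProfilerOptions()
# Enable multiprocessing only when the dataset exceeds the row/column thresholds.
options.structured_options.multiprocess.is_enabled = (
    profiler_utils.auto_multiprocess_toggle(data)
)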
+
+
+dataprofiler.profilers.profiler_utils.suggest_pool_size(data_size: int = None, cols: int = None) int | None
+

Suggest the pool size based on resources.

+
+
Parameters
+
    +
  • data_size (int) – size of the dataset

  • +
  • cols (int) – columns of the dataset

  • +
+
+
Return suggested_pool_size
+

suggested pool size

+
+
Rtype suggested_pool_size
+

int

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.generate_pool(max_pool_size: int = None, data_size: int = None, cols: int = None) tuple[Pool | None, int | None]
+

Generate a multiprocessing pool to allocate functions to.

+
+
Parameters
+
    +
  • max_pool_size (Union[int, None]) – Max number of processes assigned to the pool

  • +
  • data_size (int) – size of the dataset

  • +
  • cols (int) – columns of the dataset

  • +
+
+
Return pool
+

Multiprocessing pool to allocate processes to

+
+
Rtype pool
+

multiprocessing.Pool

+
+
Return cpu_count
+

Number of processes (cpu bound) to utilize

+
+
Rtype cpu_count
+

int

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.overlap(x1: int | Any, x2: int | Any, y1: int | Any, y2: int | Any) bool
+

Return True iff [x1:x2] overlaps with [y1:y2].

+
+
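Two illustrative calls; the expected results follow directly from the description above.

from dataprofiler.profilers import profiler_utils

profiler_utils.overlap(0, 5, 3, 8)   # True  -- [0:5] and [3:8] intersect
profiler_utils.overlap(0, 2, 5, 9)   # False -- the ranges are disjoint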
+
+dataprofiler.profilers.profiler_utils.add_nested_dictionaries(first_dict: dict, second_dict: dict) dict
+

Merge two dictionaries together and add values together.

+
+
Parameters
+
    +
  • first_dict (dict) – dictionary to be merged

  • +
  • second_dict (dict) – dictionary to be merged

  • +
+
+
Returns
+

merged dictionary

+
+
+
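A minimal sketch, assuming (as the name and description suggest) that values at shared keys are summed and nested dictionaries are merged recursively; the commented result is the expected outcome, not captured output.

from dataprofiler.profilers import profiler_utils

first = {"a": 1, "nested": {"x": 2}}
second = {"a": 4, "nested": {"x": 3, "y": 5}}

profiler_utils.add_nested_dictionaries(first, second)
# expected: {"a": 5, "nested": {"x": 5, "y": 5}}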
+
+
+dataprofiler.profilers.profiler_utils.biased_skew(df_series: Series) float64
+

Calculate the biased estimator for skewness of the given data.

+
+
The definition is formalized as g_1 here:

https://en.wikipedia.org/wiki/Skewness#Sample_skewness

+
+
+
+
Parameters
+

df_series (pandas Series) – data to get skewness of, assuming floats

+
+
Returns
+

biased skewness

+
+
Return type
+

np.float64

+
+
+
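For quick reference (added here for convenience, mirroring the Wikipedia definition cited above): the biased sample skewness is g_1 = m_3 / m_2^(3/2), where m_k = (1/n) * sum_i (x_i - mean)^k is the k-th sample central moment.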
+
+
+dataprofiler.profilers.profiler_utils.biased_kurt(df_series: Series) float64
+

Calculate the biased estimator for kurtosis of the given data.

+
+
The definition is formalized as g_2 here:

https://en.wikipedia.org/wiki/Kurtosis#A_natural_but_biased_estimator

+
+
+
+
Parameters
+

df_series (pandas Series) – data to get kurtosis of, assuming floats

+
+
Returns
+

biased kurtosis

+
+
Return type
+

np.float64

+
+
+
+
+
+class dataprofiler.profilers.profiler_utils.Subtractable(*args, **kwargs)
+

Bases: Protocol

+

Protocol for annotating subtractable types.

+
+
+
+dataprofiler.profilers.profiler_utils.find_diff_of_numbers(stat1: int | float | np.float64 | np.int64 | None, stat2: int | float | np.float64 | np.int64 | None) Any
+
+dataprofiler.profilers.profiler_utils.find_diff_of_numbers(stat1: T | None, stat2: T | None) Any
+

Find the difference between two stats.

+

If there is no difference, return “unchanged”. +For ints/floats, returns stat1 - stat2.

+
+
Parameters
+
    +
  • stat1 (Union[int, float, np.float64, np.int64, None]) – the first statistical input

  • +
  • stat2 (Union[int, float, np.float64, np.int64, None]) – the second statistical input

  • +
+
+
Returns
+

the difference of the stats

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.find_diff_of_strings_and_bools(stat1: str | bool | None, stat2: str | bool | None) list[str | bool | None] | str
+

Find the difference between two stats.

+

If there is no difference, return “unchanged”. +For strings and bools, return list containing [stat1, stat2].

+
+
Parameters
+
    +
  • stat1 (Union[str, bool]) – the first statistical input

  • +
  • stat2 (Union[str, bool]) – the second statistical input

  • +
+
+
Returns
+

the difference of the stats

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.find_diff_of_lists_and_sets(stat1: list | set | None, stat2: list | set | None) list[list | set | None] | str
+

Find the difference between two stats.

+

If there is no difference, return +“unchanged”. Remove duplicates and returns [unique values of stat1, +shared values, unique values of stat2].

+
+
Parameters
+
    +
  • stat1 (Union[list, set]) – the first statistical input

  • +
  • stat2 (Union[list, set]) – the second statistical input

  • +
+
+
Returns
+

the difference of the stats

+
+
+
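A hedged sketch of how these three diff helpers behave; the commented outputs follow the descriptions above, though the exact ordering of shared values may differ.

from dataprofiler.profilers import profiler_utils

profiler_utils.find_diff_of_numbers(10, 4)                 # 6
profiler_utils.find_diff_of_numbers(5, 5)                  # "unchanged"
profiler_utils.find_diff_of_strings_and_bools("a", "b")    # ["a", "b"]
profiler_utils.find_diff_of_lists_and_sets([1, 2, 3], [2, 3, 4])
# [[1], [2, 3], [4]] -- unique to stat1, shared, unique to stat2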
+
+
+dataprofiler.profilers.profiler_utils.find_diff_of_dates(stat1: datetime.datetime | None, stat2: datetime.datetime | None) list | str | None
+

Find the difference between two dates.

+

If there is no difference, return +“unchanged”. For dates, return the difference in time.

+

Because only days can be stored as negative values internally +for timedelta objects, the output for these negative values is +less readable due to the combination of signs in the default +output. This returns a readable output for timedelta that +accounts for potential negative differences.

+
+
Parameters
+
    +
  • stat1 (datetime.datetime object) – the first statistical input

  • +
  • stat2 (datetime.datetime object) – the second statistical input

  • +
+
+
Returns
+

difference in stats

+
+
Return type
+

Union[List, str]

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.find_diff_of_dicts(dict1: dict | None, dict2: dict | None) dict | str
+

Find the difference between two dicts.

+

For each key in each dict, +return “unchanged” if there’s no difference, otherwise return +the difference. Assume that if the two dictionaries share the +same key, their values are the same type.

+
+
Parameters
+
    +
  • dict1 (dict) – the first dict

  • +
  • dict2 (dict) – the second dict

  • +
+
+
Returns
+

Difference in the keys of each dict

+
+
Return type
+

dict

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.find_diff_of_matrices(matrix1: np.ndarray | None, matrix2: np.ndarray | None) np.ndarray | str | None
+

Find the difference between two matrices.

+
+
Parameters
+
    +
  • matrix1 (list(list(float))) – the first matrix

  • +
  • matrix2 (list(list(float))) – the second matrix

  • +
+
+
Returns
+

Difference in the matrix

+
+
Return type
+

list(list(float))

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.find_diff_of_dicts_with_diff_keys(dict1: dict | None, dict2: dict | None) list[dict] | str
+

Find the difference between two dicts.

+

For each key in each dict, +return “unchanged” if there’s no difference, otherwise return +the difference. Assume that if the two dictionaries share the +same key, their values are the same type.

+
+
Parameters
+
    +
  • dict1 (dict) – the first dict

  • +
  • dict2 (dict) – the second dict

  • +
+
+
Returns
+

Difference in the keys of each dict

+
+
Return type
+

list

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.get_memory_size(data: list | np.ndarray | DataFrame, unit: str = 'M') float
+

Get memory size of the input data.

+
+
Parameters
+
    +
  • data (Union[list, numpy.array, pandas.DataFrame]) – list or array of data

  • +
  • unit (string) – memory size unit (B, K, M, or G)

  • +
+
+
Returns
+

memory size of the input data

+
+
+
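A quick sketch; the exact float returned depends on the object's in-memory size.

import numpy as np
from dataprofiler.profilers import profiler_utils

arr = np.zeros((1000, 100))
profiler_utils.get_memory_size(arr, unit="M")                  # size in megabytes
profiler_utils.get_memory_size(["a", "bb", "ccc"], unit="B")   # size in bytes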
+
+
+dataprofiler.profilers.profiler_utils.method_timeit(method: Optional[Callable] = None, name: Optional[str] = None) Callable
+

Measure execution time of provided method.

+

Record time into times dictionary.

+
+
Parameters
+
    +
  • method (Callable) – method to time

  • +
  • name (str) – key argument for the times dictionary

  • +
+
+
+
+
+
+dataprofiler.profilers.profiler_utils.perform_chi_squared_test_for_homogeneity(categories1: dict, sample_size1: int, categories2: dict, sample_size2: int) dict[str, int | float | None]
+

Perform a Chi Squared test for homogeneity between two groups.

+
+
Parameters
+
    +
  • categories1 (dict) – Categories and respective counts of the first group

  • +
  • sample_size1 (int) – Number of samples in first group

  • +
  • categories2 (dict) – Categories and respective counts of the second group

  • +
  • sample_size2 (int) – Number of samples in second group

  • +
+
+
Returns
+

Results of the chi squared test

+
+
Return type
+

dict

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.chunk(lst: list, size: int) Iterator[tuple]
+

Chunk things out.

+
+
Parameters
+
    +
  • lst (list) – List to chunk

  • +
  • size (int) – Size of each chunk

  • +
+
+
Returns
+

Iterator that produces tuples of each chunk

+
+
Return type
+

Iterator[Tuple]

+
+
+
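A minimal sketch; the commented output is the expected shape based on the description above (tuples, with a shorter final chunk).

from dataprofiler.profilers import profiler_utils

list(profiler_utils.chunk([1, 2, 3, 4, 5], size=2))
# [(1, 2), (3, 4), (5,)]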
+
+
+dataprofiler.profilers.profiler_utils.merge(top_profile: profile_builder.BaseProfiler, other_profile: profile_builder.BaseProfiler = None) profile_builder.BaseProfiler
+

Merge two Profiles.

+
+
Parameters
+
    +
  • top_profile (Profile) – First profile

  • +
  • other_profile (Profile) – Second profile

  • +
+
+
Returns
+

Merge of two profile objects

+
+
Return type
+

Profile

+
+
+
+
+
+dataprofiler.profilers.profiler_utils.merge_profile_list(list_of_profiles: list[profile_builder.BaseProfiler], pool_count: int = 5) profile_builder.BaseProfiler
+

Merge list of profiles into a single profile.

+
+
Parameters
+
    +
  • list_of_profiles (list) – list of profiles to merge into a single profile

  • +
  • pool_count (int) – number of worker processes to use when merging the profiles

  • +
+
+
Returns
+

Single profile that is the merge of all profiles in the +list_of_profiles list.

+
+
Return type
+

Profile

+
+
+
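A usage sketch; the CSV file names are hypothetical placeholders.

import dataprofiler as dp
from dataprofiler.profilers import profiler_utils

profile_1 = dp.Profiler(dp.Data("part_1.csv"))   # hypothetical file
profile_2 = dp.Profiler(dp.Data("part_2.csv"))   # hypothetical file

combined = profiler_utils.merge_profile_list([profile_1, profile_2], pool_count=2)
report = combined.report()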
+
+
+dataprofiler.profilers.profiler_utils.reload_labeler_from_options_or_get_new(data_labeler_load_attr: dict, config: dict | None = None) BaseDataLabeler | None
+

If required by the load_attr load a data labeler, but reuse from config if possible.

+
+
Parameters
+
    +
  • data_labeler_load_attr (dict[string, dict]) – dictionary with attributes and values.

  • +
  • config (dict[string, dict]) – config for loading classes to reuse an existing labeler

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

DataLabelerOptions

+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.profilers.text_column_profile.html b/docs/0.12.0/html/dataprofiler.profilers.text_column_profile.html new file mode 100644 index 000000000..69d3626be --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.profilers.text_column_profile.html @@ -0,0 +1,589 @@ + + + + + + + + + Text Column Profile - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Text Column Profile

+

Text profile analysis for an individual column within structured profiling.

+
+
+class dataprofiler.profilers.text_column_profile.TextColumn(name: str | None, options: TextOptions = None)
+

Bases: NumericStatsMixin[TextColumn], BaseColumnPrimitiveTypeProfiler[TextColumn]

+

Text column profile subclass of BaseColumnProfiler.

+

Represents a column in the dataset which is a text column.

+

Initialize column base properties and itself.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (TextOptions) – Options for the Text column

  • +
+
+
+
+
+type: str | None = 'text'
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Report profile attribute of class; potentially pop val from self.profile.

+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
Returns
+

+
+
+
+
+
+diff(other_profile: TextColumn, options: Optional[dict] = None) dict
+

Find the differences for text columns.

+
+
Parameters
+

other_profile (TextColumn Profile) – profile to find the difference with

+
+
Returns
+

the text columns differences

+
+
Return type
+

dict

+
+
+
+
+
+property data_type_ratio: float | None
+

Calculate the ratio of samples which match this data type.

+

NOTE: all values can be considered string so always returns 1 in this +case.

+
+
Returns
+

ratio of data type

+
+
Return type
+

float

+
+
+
+
+
+update(df_series: Series) TextColumn
+

Update the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

updated TextColumn

+
+
Return type
+

TextColumn

+
+
+
+
+
+classmethod load_from_dict(data, config: dict | None = None)
+

Parse attribute from json dictionary into self.

+
+
Parameters
+
    +
  • data (dict[string, Any]) – dictionary with attributes and values.

  • +
  • config (Dict | None) – config for loading column profiler params from dictionary

  • +
+
+
Returns
+

Profiler with attributes populated.

+
+
Return type
+

TextColumn

+
+
+
+
+
+col_type = None
+
+
+
+static is_float(x: str) bool
+

Return True if x is float.

+

For “0.80” this function returns True +For “1.00” this function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is float or not

+
+
Return type
+

bool

+
+
+
+
+
+static is_int(x: str) bool
+

Return True if x is integer.

+

For “0.80” This function returns False +For “1.00” This function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is integer or not

+
+
Return type
+

bool

+
+
+
+
+
+property kurtosis: float | np.float64
+

Return kurtosis value.

+
+
+
+property mean: float | np.float64
+

Return mean value.

+
+
+
+property median: float
+

Estimate the median of the data.

+
+
Returns
+

the median

+
+
Return type
+

float

+
+
+
+
+
+property median_abs_deviation: float | np.float64
+

Get median absolute deviation estimated from the histogram of the data.

+
+

  • Subtract bin edges from the median value
  • Fold the histogram into positive and negative parts around zero
  • Impose the two bin edges from the two histograms
  • Calculate the counts for the two histograms with the imposed bin edges
  • Superimpose the counts from the two histograms
  • Interpolate the median absolute deviation from the superimposed counts

+
+
+
Returns
+

median absolute deviation

+
+
+
+
+
+property mode: list[float]
+

Find an estimate for the mode[s] of the data.

+
+
Returns
+

the mode(s) of the data

+
+
Return type
+

list(float)

+
+
+
+
+
+static np_type_to_type(val: Any) Any
+

Convert numpy variables to base python type variables.

+
+
Parameters
+

val (numpy type or base type) – value to check & change

+
+
Return val
+

base python type

+
+
Rtype val
+

int or float

+
+
+
+
+
+property skewness: float | np.float64
+

Return skewness value.

+
+
+
+property stddev: float | np.float64
+

Return stddev value.

+
+
+
+property variance: float | np.float64
+

Return variance.

+
+
+
+min: int | float | np.float64 | np.int64 | None
+
+
+
+max: int | float | np.float64 | np.int64 | None
+
+
+
+sum: int | float | np.float64 | np.int64
+
+
+
+max_histogram_bin: int
+
+
+
+min_histogram_bin: int
+
+
+
+histogram_bin_method_names: list[str]
+
+
+
+histogram_selection: str | None
+
+
+
+user_set_histogram_bin: int | None
+
+
+
+bias_correction: bool
+
+
+
+num_zeros: int | np.int64
+
+
+
+num_negatives: int | np.int64
+
+
+
+histogram_methods: dict
+
+
+
+quantiles: list[float] | None
+
+
+
+match_count: int
+
+
+
+name: str | None
+
+
+
+sample_size: int
+
+
+
+metadata: dict
+
+
+
+times: dict
+
+
+
+thread_safe: bool
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.profilers.unstructured_labeler_profile.html b/docs/0.12.0/html/dataprofiler.profilers.unstructured_labeler_profile.html new file mode 100644 index 000000000..8786cf040 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.profilers.unstructured_labeler_profile.html @@ -0,0 +1,349 @@ + + + + + + + + + Unstructured Labeler Profile - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Unstructured Labeler Profile

+

Profile analysis for applying labels within unstructured profiling.

+
+
+class dataprofiler.profilers.unstructured_labeler_profile.UnstructuredLabelerProfile(data_labeler_dirpath: Optional[str] = None, options: Optional[DataLabelerOptions] = None)
+

Bases: object

+

Profiles and labels unstructured data.

+

Initialize Data Label profiling for unstructured datasets.

+
+
Parameters
+
    +
  • data_labeler_dirpath (String) – Directory path to the data labeler

  • +
  • options (DataLabelerOptions) – Options for the data labeler column

  • +
+
+
+
+
+type = 'data_labeler'
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Return profile object.

+
+
Parameters
+

remove_disabled_flag (boolean) – flag to determine if disabled options +should be excluded in report.

+
+
+
+
+
+diff(other_profile: UnstructuredLabelerProfile, options: Optional[dict] = None) dict
+

Find the differences for two unstructured labeler profiles.

+
+
Parameters
+
    +
  • other_profile (UnstructuredLabelerProfile) – profile to find the difference with

  • +
  • options (dict) – options for diff output

  • +
+
+
Returns
+

the difference between entity counts/percentages

+
+
Return type
+

dict

+
+
+
+
+
+property label_encoding: list[str]
+

Return list of labels.

+
+
+
+update(df_series: Series) None
+

Update profile.

+
+
+
+property profile: dict
+

Return a profile.

+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.profilers.unstructured_text_profile.html b/docs/0.12.0/html/dataprofiler.profilers.unstructured_text_profile.html new file mode 100644 index 000000000..b5778264f --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.profilers.unstructured_text_profile.html @@ -0,0 +1,357 @@ + + + + + + + + + Unstructured Text Profile - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Unstructured Text Profile

+

For profiling unstructured text data.

+
+
+class dataprofiler.profilers.unstructured_text_profile.TextProfiler(name: str | None, options: TextProfilerOptions = None)
+

Bases: object

+

Profiles text data.

+

Initialize TextProfiler object.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (TextProfilerOptions) – Options for the Text Profiler

  • +
+
+
+
+
+type = 'text'
+
+
+
+diff(other_profile: TextProfiler, options: Optional[dict] = None) dict
+

Find the differences for two unstructured text profiles.

+
+
Parameters
+
    +
  • other_profile (TextProfiler) – profile to find the difference with

  • +
  • options (dict) – options for diff output

  • +
+
+
Returns
+

the difference between profiles

+
+
Return type
+

dict

+
+
+
+
+
+report(remove_disabled_flag: bool = False) dict
+

Report profile attribute of class; potentially pop val from self.profile.

+
+
+
+property profile: dict
+

Return the profile of the column.

+
+
Returns
+

profile of the column

+
+
Return type
+

dict

+
+
+
+
+
+update(data: Series) TextProfiler
+

Update the column profile.

+
+
Parameters
+

data (pandas.core.series.Series) – df series

+
+
Returns
+

updated TextProfiler

+
+
Return type
+

TextProfiler

+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.profilers.utils.html b/docs/0.12.0/html/dataprofiler.profilers.utils.html new file mode 100644 index 000000000..650c6c253 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.profilers.utils.html @@ -0,0 +1,266 @@ + + + + + + + + + Utils - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Utils

+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.reports.graphs.html b/docs/0.12.0/html/dataprofiler.reports.graphs.html new file mode 100644 index 000000000..a167830a9 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.reports.graphs.html @@ -0,0 +1,267 @@ + + + + + + + + + Graphs - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Graphs

+

Contains functions for generating graph data report.

+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.reports.html b/docs/0.12.0/html/dataprofiler.reports.html new file mode 100644 index 000000000..d9a520e16 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.reports.html @@ -0,0 +1,299 @@ + + + + + + + + + Reports - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Reports

+
+

Modules

+
+
+
+ +
+

Package for generating reports.

+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.reports.utils.html b/docs/0.12.0/html/dataprofiler.reports.utils.html new file mode 100644 index 000000000..7eba28691 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.reports.utils.html @@ -0,0 +1,292 @@ + + + + + + + + + Utils - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Utils

+

Contains functions for checking for installations/dependencies.

+
+
+dataprofiler.reports.utils.warn_missing_module(graph_func: str, module_name: str) None
+

Return a warning if a given graph module doesn’t exist.

+
+
Parameters
+
    +
  • graph_func (str) – Name of the graphing function

  • +
  • module_name (str) – module name that was missing

  • +
+
+
+
+
+
+dataprofiler.reports.utils.require_module(names: List[str]) Callable[[F], F]
+

Check if a set of modules exists in sys.modules prior to running function.

+

If they do not, give a user a warning and do not run the +function.

+
+
Parameters
+

names (list[str]) – list of module names to check for in sys.modules

+
+
+
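A sketch of how the decorator might be applied; the decorated function and module names are illustrative.

from dataprofiler.reports.utils import require_module

@require_module(["matplotlib", "seaborn"])
def plot_something(profiler):
    # Only runs if the named modules are already loaded (present in sys.modules);
    # otherwise a warning is issued and the body is skipped.
    ...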
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.rng_utils.html b/docs/0.12.0/html/dataprofiler.rng_utils.html new file mode 100644 index 000000000..9f1f9603d --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.rng_utils.html @@ -0,0 +1,272 @@ + + + + + + + + + Rng Utils - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Rng Utils

+

Create a random number generator using a manual seed DATAPROFILER_SEED.

+
+
+dataprofiler.rng_utils.get_random_number_generator() Generator
+

Create a random number generator using a manual seed DATAPROFILER_SEED.

+
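A short sketch, assuming (as the wording suggests) that the seed is read from a DATAPROFILER_SEED environment variable and that the returned object is a NumPy Generator.

import os
from dataprofiler import rng_utils

os.environ["DATAPROFILER_SEED"] = "0"   # assumption: seed is read from the environment
rng = rng_utils.get_random_number_generator()
rng.integers(0, 10, size=3)             # reproducible draws given the fixed seed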
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.settings.html b/docs/0.12.0/html/dataprofiler.settings.html new file mode 100644 index 000000000..f2b31d145 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.settings.html @@ -0,0 +1,267 @@ + + + + + + + + + Settings - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Settings

+

Configure settings for dataprofiler.

+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.validators.base_validators.html b/docs/0.12.0/html/dataprofiler.validators.base_validators.html new file mode 100644 index 000000000..c78a19577 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.validators.base_validators.html @@ -0,0 +1,363 @@ + + + + + + + + + Base Validators - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Validators

+

Build model for dataset by identifying column type along with its respective parameters.

+
+
+dataprofiler.validators.base_validators.is_in_range(x: str | int | float, config: dict) bool
+

Check to see if x is in the range specified by the config.

+
+
Parameters
+
    +
  • x (int/float) – number

  • +
  • config (dict) – configuration

  • +
+
+
Returns
+

bool

+
+
+
+
+
+dataprofiler.validators.base_validators.is_in_list(x: str | int | float, config: dict) bool
+

Check to see if x is in the config list.

+
+
Parameters
+
    +
  • x (string) – item

  • +
  • config (dict) – configuration

  • +
+
+
Returns
+

bool

+
+
+
+
+
+class dataprofiler.validators.base_validators.Validator
+

Bases: object

+

For validating a data set.

+

Initialize Validator object.

+
+
+validate(data: pd.DataFrame | dd.DataFrame, config: dict) None
+

Validate a data set.

+

No option for validating a partial data set.

+

Set the configuration at run time, not on instantiation of the class, so that you have the option to run multiple times with different configurations without having to reinstantiate the class.

+
+
Parameters
+
    +
  • data (DataFrame Dask/Pandas) – The data to be processed by the validator. Processing +occurs in a column-wise fashion.

  • +
  • config (dict) – configuration for how the validator should +run across the given data. Validator will only run over columns +specified in the configuration.

  • +
+
+
Example
+

This is an example of the config:

+
config = {
+        <column_name>: {
+                range: {
+                    'start': 1,
+                    'end':2
+                },
+                list: [1,2,3]
+            }
+        }
+
+
+
+
+
+
+
+get() dict
+

Get the results of the validation run.

+
+
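A usage sketch tying the pieces together; the column name and config keys mirror the example config above and are otherwise assumptions.

import pandas as pd
from dataprofiler.validators.base_validators import Validator

df = pd.DataFrame({"age": [25, 42, 7]})
config = {"age": {"range": {"start": 18, "end": 99}}}   # assumed key spelling, per the example above

validator = Validator()
validator.validate(data=df, config=config)
results = validator.get()   # dict of validation results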
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.validators.html b/docs/0.12.0/html/dataprofiler.validators.html new file mode 100644 index 000000000..73abdb7d2 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.validators.html @@ -0,0 +1,316 @@ + + + + + + + + + Validators - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Validators

+
+

Modules

+
+
+ +

Package for identifying columns.

+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/dataprofiler.version.html b/docs/0.12.0/html/dataprofiler.version.html new file mode 100644 index 000000000..e5bcc5bc4 --- /dev/null +++ b/docs/0.12.0/html/dataprofiler.version.html @@ -0,0 +1,267 @@ + + + + + + + + + Version - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Version

+

File contains the version number for the package.

+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/examples.html b/docs/0.12.0/html/examples.html new file mode 100644 index 000000000..dfb51af6f --- /dev/null +++ b/docs/0.12.0/html/examples.html @@ -0,0 +1,428 @@ + + + + + + + + + Examples - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Examples

+

These examples provide a more in-depth look into the details of the Data Profiler library.

+
+

Basics

+
+ +
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/genindex.html b/docs/0.12.0/html/genindex.html new file mode 100644 index 000000000..a52fcc470 --- /dev/null +++ b/docs/0.12.0/html/genindex.html @@ -0,0 +1,3412 @@ + + + + + + + Index - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + +
+

Index

+
A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W
+
+
+

A

+ + + +
+
+ +
+

B

+ + + +
+
+ +
+

C

+ + + +
+
+ +
+

D

+ + + +
+
+ +
+

E

+ + + +
+
+ +
+

F

+ + + +
+
+ +
+

G

+ + + +
+
+ +
+

H

+ + + +
+
+ +
+

I

+ + + +
+
+ +
+

J

+ + + +
+
+ +
+

K

+ + + +
+
+ +
+

L

+ + + +
+
+ +
+

M

+ + + +
+
+ +
+

N

+ + + +
+
+ +
+

O

+ + + +
+
+ +
+

P

+ + + +
+
+ +
+

Q

+ + + +
+
+ +
+

R

+ + + +
+
+ +
+

S

+ + + +
+
+ +
+

T

+ + + +
+
+ +
+

U

+ + + +
+
+ +
+

V

+ + + +
+
+ +
+

W

+ + + +
+
+ + +
+
+ + + + + +
+
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/graph_data_demo.html b/docs/0.12.0/html/graph_data_demo.html new file mode 100644 index 000000000..c10472164 --- /dev/null +++ b/docs/0.12.0/html/graph_data_demo.html @@ -0,0 +1,710 @@ + + + + + + + + + Graph Pipeline Demo - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Graph Pipeline Demo

+

DataProfiler can also load and profile graph datasets. Similarly to the rest of DataProfiler profilers, this is split into two components:

  • GraphData
  • GraphProfiler

+

We will demo the use of this graph pipeline.

+

First, let’s import the libraries needed for this example.

+
+
[ ]:
+
+
+
+import os
+import sys
+import pprint
+
+try:
+    sys.path.insert(0, '..')
+    import dataprofiler as dp
+except ImportError:
+    import dataprofiler as dp
+
+data_path = "../dataprofiler/tests/data"
+
+
+
+

We now input our dataset into the generic DataProfiler pipeline:

+
+
[ ]:
+
+
+
+data = dp.Data(os.path.join(data_path, "csv/graph_data_csv_identify.csv"))
+profile = dp.Profiler(data)
+
+report = profile.report()
+
+pp = pprint.PrettyPrinter(sort_dicts=False, compact=True)
+pp.pprint(report)
+
+
+
+

We notice that the Data class automatically detected the input file as graph data. The GraphData class is able to differentiate between tabular and graph csv data. After Data matches the input file as graph data, GraphData does the necessary work to load the csv data into a NetworkX Graph.

+

Profiler runs GraphProfiler when graph data is input (or when data_type="graph" is specified). The report() function outputs the profile for the user.

+
+

Profile

+

The profile skeleton looks like this:

+
profile = {
+    "num_nodes": ...,
+    "num_edges": ...,
+    "categorical_attributes": ...,
+    "continuous_attributes": ...,
+    "avg_node_degree": ...,
+    "global_max_component_size": ...,
+    "continuous_distribution": ...,
+    "categorical_distribution": ...,
+    "times": ...,
+}
+
+
+

Description of properties in profile:

  • num_nodes: number of nodes in the graph
  • num_edges: number of edges in the graph
  • categorical_attributes: list of categorical edge attributes
  • continuous_attributes: list of continuous edge attributes
  • avg_node_degree: average degree of nodes in the graph
  • global_max_component_size: size of largest global max component in the graph
  • continuous_distribution: dictionary of statistical properties for each continuous attribute
  • categorical_distribution: dictionary of statistical properties for each categorical attribute

+

The continuous_distribution and categorical_distribution dictionaries list statistical properties for each edge attribute in the graph:

+
continuous_distribution = {
+    "name": ...,
+    "scale": ...,
+    "properties": ...,
+}
+
+
+
categorical_distribution = {
+    "bin_counts": ...,
+    "bin_edges": ...,
+}
+
+
+

Description of each attribute:

  • Continuous distribution:
    • name: name of the distribution
    • scale: negative log likelihood used to scale distributions and compare them in GraphProfiler
    • properties: list of distribution props
  • Categorical distribution:
    • bin_counts: histogram bin counts
    • bin_edges: histogram bin edges

+

properties lists the following distribution properties: [optional: shape, loc, scale, mean, variance, skew, kurtosis]. The list is either 6 or 7 elements long depending on the distribution (the extra element is the shape parameter):

  • 6 length: norm, uniform, expon, logistic
  • 7 length: gamma, lognorm
    • gamma: shape=a (float)
    • lognorm: shape=s (float)

+

For more information on shape parameters a and s: https://docs.scipy.org/doc/scipy/tutorial/stats.html#shape-parameters

+
+
+

Saving and Loading a Profile

+

Below you will see an example of how a Graph Profile can be saved and loaded again.

+
+
[ ]:
+
+
+
+# The default save filepath is profile-<datetime>.pkl
+profile.save(filepath="profile.pkl")
+
+new_profile = dp.GraphProfiler.load("profile.pkl")
+new_report = new_profile.report()
+
+
+
+
+
[ ]:
+
+
+
+pp.pprint(report)
+
+
+
+
+
+

Difference in Data

+

If we wanted to ensure that this new profile was the same as the previous profile that we loaded, we could compare them using the diff functionality.

+
+
[ ]:
+
+
+
+diff = profile.diff(new_profile)
+
+
+
+
+
[ ]:
+
+
+
+pp.pprint(diff)
+
+
+
+

Another use for diff might be to provide differences between training and testing profiles, as shown in the cell below. We will use the profile above as the training profile and create a new profile to represent the testing profile.

+
+
[ ]:
+
+
+
+training_profile = profile
+
+testing_data = dp.Data(os.path.join(data_path, "csv/graph-differentiator-input-positive.csv"))
+testing_profile = dp.Profiler(testing_data)
+
+test_train_diff = training_profile.diff(testing_profile)
+
+
+
+

Below you can observe the difference between the two profiles.

+
+
[ ]:
+
+
+
+pp.pprint(test_train_diff)
+
+
+
+
+
+

Conclusion

+

We have shown the graph pipeline in the DataProfiler. It works similarly to the current DataProfiler implementation.

+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/graph_data_demo.ipynb b/docs/0.12.0/html/graph_data_demo.ipynb new file mode 100644 index 000000000..088612872 --- /dev/null +++ b/docs/0.12.0/html/graph_data_demo.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Graph Pipeline Demo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataProfiler can also load and profile graph datasets. Similarly to the rest of DataProfiler profilers, this is split into two components:\n", + "- GraphData\n", + "- GraphProfiler\n", + "\n", + "We will demo the use of this graph pipeline.\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import pprint\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now input our dataset into the generic DataProfiler pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/graph_data_csv_identify.csv\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "report = profile.report()\n", + "\n", + "pp = pprint.PrettyPrinter(sort_dicts=False, compact=True)\n", + "pp.pprint(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We notice that the `Data` class automatically detected the input file as graph data. The `GraphData` class is able to differentiate between tabular and graph csv data. After `Data` matches the input file as graph data, `GraphData` does the necessary work to load the csv data into a NetworkX Graph. \n", + "\n", + "`Profiler` runs `GraphProfiler` when graph data is input (or when `data_type=\"graph\"` is specified). The `report()` function outputs the profile for the user." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Profile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The profile skeleton looks like this:\n", + "```\n", + "profile = {\n", + " \"num_nodes\": ...,\n", + " \"num_edges\": ...,\n", + " \"categorical_attributes\": ...,\n", + " \"continuous_attributes\": ...,\n", + " \"avg_node_degree\": ...,\n", + " \"global_max_component_size\": ...,\n", + " \"continuous_distribution\": ...,\n", + " \"categorical_distribution\": ...,\n", + " \"times\": ...,\n", + "}\n", + "```\n", + "\n", + "Description of properties in profile:\n", + "- `num_nodes`: number of nodes in the graph\n", + "- `num_edges`: number of edges in the graph\n", + "- `categorical_attributes`: list of categorical edge attributes\n", + "- `continuous_attributes`: list of continuous edge attributes\n", + "- `avg_node_degree`: average degree of nodes in the graph\n", + "- `global_max_component_size`: size of largest global max component in the graph\n", + "- `continuous_distribution`: dictionary of statistical properties for each continuous attribute\n", + "- `categorical_distribution`: dictionary of statistical properties for each categorical attribute\n", + "\n", + "The `continuous_distribution` and `categorical_distribution` dictionaries list statistical properties for each edge attribute in the graph:\n", + "```\n", + "continuous_distribution = {\n", + " \"name\": ...,\n", + " \"scale\": ...,\n", + " \"properties\": ...,\n", + "}\n", + "```\n", + "```\n", + "categorical_distribution = {\n", + " \"bin_counts\": ...,\n", + " \"bin_edges\": ...,\n", + "}\n", + "```\n", + "Description of each attribute:\n", + "- Continuous distribution:\n", + " - `name`: name of the distribution\n", + " - `scale`: negative log likelihood used to scale distributions and compare them in `GraphProfiler`\n", + " - `properties`: list of distribution props\n", + "- Categorical distribution:\n", + " - `bin_counts`: histogram bin counts\n", + " - `bin_edges`: histogram bin edges\n", + "\n", + "`properties` lists the following distribution properties: [optional: shape, loc, scale, mean, variance, skew, kurtosis]. The list can be either 6 length or 7 length depending on the distribution (extra shape parameter):\n", + "- 6 length: norm, uniform, expon, logistic\n", + "- 7 length: gamma, lognorm\n", + " - gamma: shape=`a` (float)\n", + " - lognorm: shape=`s` (float)\n", + " \n", + "For more information on shape parameters `a` and `s`: https://docs.scipy.org/doc/scipy/tutorial/stats.html#shape-parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile\n", + "Below you will see an example of how a Graph Profile can be saved and loaded again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The default save filepath is profile-.pkl\n", + "profile.save(filepath=\"profile.pkl\")\n", + "\n", + "new_profile = dp.GraphProfiler.load(\"profile.pkl\")\n", + "new_report = new_profile.report()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pp.pprint(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Difference in Data\n", + "If we wanted to ensure that this new profile was the same as the previous profile that we loaded, we could compare them using the diff functionality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "diff = profile.diff(new_profile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pp.pprint(diff)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another use for diff might be to provide differences between training and testing profiles as shown in the cell below.\n", + "We will use the profile above as the training profile and create a new profile to represent the testing profile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_profile = profile\n", + "\n", + "testing_data = dp.Data(os.path.join(data_path, \"csv/graph-differentiator-input-positive.csv\"))\n", + "testing_profile = dp.Profiler(testing_data)\n", + "\n", + "test_train_diff = training_profile.diff(testing_profile)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below you can observe the difference between the two profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pp.pprint(test_train_diff)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have shown the graph pipeline in the DataProfiler. It works similarly to the current DataProfiler implementation." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/0.12.0/html/graphs.html b/docs/0.12.0/html/graphs.html new file mode 100644 index 000000000..58d8966db --- /dev/null +++ b/docs/0.12.0/html/graphs.html @@ -0,0 +1,489 @@ + + + + + + + + + Graphs - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Graphs

+
+

Graph Your Data

+

We can plot some of our data as seaborn histogram plots. Below, we demonstrate how to do so and provide examples.

+

The following plots are currently available to work directly with your profilers:

+
+
    +
  • histogram (numeric columns only)

  • +
  • missing values matrix

  • +
+
+

Below shows how to do so with examples.

+
+

What we need to import

+
from dataprofiler.reports import graphs
+
+
+

The main functions used to plot histograms are in graphs. You will also need the `dataprofiler[reports]` requirement to be installed:

+
pip install 'dataprofiler[reports]'
+
+
+
+
+

Plotting from a StructuredProfiler class

+

With a StructuredProfiler class variable, we can specify what columns we want to be plotted, and plot them into histograms.

+
graphs.plot_histograms(profiler, column_names, column_inds)
+
+
+

These are what the variables mean:

+
+
    +
  • profiler - StructuredProfiler class variable that contains the data we want

  • +
  • column_names - (Optional) The list of IntColumn or FloatColumn names we want to specifically plot. If specified, column_inds cannot be specified.

  • +
  • column_inds - (Optional) The list of IntColumn or FloatColumn indexes we want to specifically plot. If specified, column_names cannot be specified.

  • +
+
+

Additionally, we can also plot the missing values matrix for a StructuredProfiler:

+
graphs.plot_missing_values_matrix(profiler, ax, title)
+
+
+

These are what the variables mean:

+
+
    +
  • profiler - StructuredProfiler class variable that contains the data we want

  • +
  • ax - (Optional) MatPlotLib Axes to plot the matrix within.

  • +
  • title - (Optional) The title of the axes we want to define.

  • +
+
+
+
+

Plotting an individual IntColumn or FloatColumn

+

With a column’s Int or Float profile, we can plot their respective histograms.

+
graphs.plot_col_histogram(column, axes, title)
+
+
+

These are what the variables mean:

+
+
    +
  • column - The IntColumn or FloatColumn we want to plot

  • +
  • axes - (Optional) The MatPlotLib Axes to plot the histogram within.

  • +
  • title - (Optional) The title of the axes we want to define.

  • +
+
+

Additionally, we can also plot the missing values bargraph for any column profile:

+
graphs.plot_col_missing_values(profiler, ax, title)
+
+
+

These are what the variables mean:

+
+
    +
  • profiler - The StructuredColProfiler we want to plot

  • +
  • ax - (Optional) MatPlotLib Axes to plot the matrix within.

  • +
  • title - (Optional) The title of the axes we want to define.

  • +
+
+
+
+

Examples

+
+

Histograms

+
    +
  1. This example demonstrates how we can take a StructuredProfiler class and plot histograms of the specified columns.

  2. +
+
import dataprofiler as dp
+from dataprofiler.reports import graphs
+
+
+data = [[1, 'a', 1.0],
+        [2, 'b', 2.2],
+        [3, 'c', 3.5],
+        [None, 'd', 10.0]]
+profiler = dp.StructuredProfiler(data)
+
+# This will plot all IntColumn and FloatColumn as histograms (The first and last column).
+fig = graphs.plot_histograms(profiler)
+fig.show()
+
+# This will only plot the specified column, 0.
+columns_names = [0]
+fig = graphs.plot_histograms(profiler, columns_names)
+fig.show()
+
+
+First Histogram Example Image +Second Histogram Example Image +
    +
  1. This example demonstrates how we can plot a low level profiler.

  2. +
+
import pandas as pd
+
+from dataprofiler.profilers import IntColumn
+from dataprofiler.reports import graphs
+
+
+data = pd.Series([1, 2, 3], dtype=str)
+profiler = IntColumn('example')
+profiler.update(data)
+
+# Plot the axes
+ax = graphs.plot_col_histogram(profiler)
+
+# get and show the figure of the plotted histogram
+fig = ax.get_figure()
+fig.show()
+
+
+Histogram Column Only Example Image +
+
+

Missing Values Matrix

+
    +
  1. This example demonstrates how we can take a StructuredProfiler class and plot a missing values matrix.

  2. +
+
import pandas as pd
+import dataprofiler as dp
+from dataprofiler.reports import graphs
+
+
+data = pd.DataFrame(
+    [[None, '', 1.0, '1/2/2021'],
+     [3, None, 3.5, ''],
+     [1, None, 1.0, '2/5/2020'],
+     [None, 1, 10.0, '3/5/2020']],
+    columns=['integer', 'str', 'float', 'datetime'],
+    dtype=object
+)
+profiler = dp.StructuredProfiler(data)
+
+# This will plot the missing values matrix for all columns.
+fig = graphs.plot_missing_values_matrix(profiler)
+fig.show()
+
+
+Missing Values Matrix Example Image +
    +
  1. This example demonstrates how we can plot barchart of a column’s missing values.

  2. +
+
import pandas as pd
+
+from dataprofiler.profilers.profile_builder import StructuredColProfiler
+from dataprofiler.reports import graphs
+
+
+data = pd.Series([1, 2, 3, None, None, 4], name='example', dtype=str)
+profiler = StructuredColProfiler(data)
+
+# Plot the axes, can be a list of multiple columns
+ax = graphs.plot_col_missing_values([profiler])
+
+# get and show the figure of the plotted histogram
+fig = ax.get_figure()
+fig.show()
+
+
+Missing Values Column Only Example Image +
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/index.html b/docs/0.12.0/html/index.html new file mode 100644 index 000000000..a798460d6 --- /dev/null +++ b/docs/0.12.0/html/index.html @@ -0,0 +1,753 @@ + + + + + + + + + Home - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data Profiler | What’s in your data?

+
+

Purpose

+

The DataProfiler is a Python library designed to make data analysis, monitoring and sensitive data detection easy.

+

Loading Data with a single command, the library automatically formats & loads files into a DataFrame. Profiling the Data, the library identifies the schema, statistics, entities and more. Data Profiles can then be used in downstream applications or reports.

+

The Data Profiler comes with a cutting edge pre-trained deep learning model, used to efficiently identify sensitive data (or PII). If customization is needed, it’s easy to add new entities to the existing pre-trained model or insert a new pipeline for entity recognition.

+

The best part? Getting started only takes a few lines of code (Example CSV):

+
import json
+from dataprofiler import Data, Profiler
+
+data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text
+print(data.data.head(5)) # Access data directly via a compatible Pandas DataFrame
+
+profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc
+readable_report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(readable_report, indent=4))
+
+
+

To install the full package from pypi:

+
pip install DataProfiler[ml]
+
+
+

If the ML requirements are too strict (say, you don’t want to install tensorflow), you can install a slimmer package. The slimmer package disables the default sensitive data detection / entity recognition (labeler).

+

Install from pypi:

+
pip install DataProfiler
+
+
+

If you have suggestions or find a bug, please open an issue.

+

Visit the API to explore Data Profiler’s terminology.

+
+
+

What is a Data Profile?

+

In the case of this library, a data profile is a dictionary containing statistics and predictions about the underlying dataset. There are “global statistics” or global_stats, which contain dataset level data and there are “column/row level statistics” or data_stats (each column is a new key-value entry).

+

The format for a structured profile is below:

+
"global_stats": {
+    "samples_used": int,
+    "column_count": int,
+    "row_count": int,
+    "row_has_null_ratio": float,
+    "row_is_null_ratio": float,
+    "unique_row_ratio": float,
+    "duplicate_row_count": int,
+    "file_type": string,
+    "encoding": string,
+    "correlation_matrix": list[list[int]], (*)
+    "chi2_matrix": list[list[float]],
+    "profile_schema": dict[string, list[int]]
+},
+"data_stats": [
+    {
+        "column_name": string,
+        "data_type": string,
+        "data_label": string,
+        "categorical": bool,
+        "order": string,
+        "samples": list[str],
+        "statistics": {
+            "sample_size": int,
+            "null_count": int,
+            "null_types": list[string],
+            "null_types_index": dict[string, list[int]],
+            "data_type_representation": dict[string, list[string]],
+            "min": [null, float],
+            "max": [null, float],
+            "sum": float,
+            "mode": list[float],
+            "median": float,
+            "median_absolute_deviation": float,
+            "mean": float,
+            "variance": float,
+            "stddev": float,
+            "skewness": float,
+            "kurtosis": float,
+            "num_zeros": int,
+            "num_negatives": int,
+            "histogram": {
+                "bin_counts": list[int],
+                "bin_edges": list[float],
+            },
+            "quantiles": {
+                int: float
+            },
+            "vocab": list[char],
+            "avg_predictions": dict[string, float],
+            "data_label_representation": dict[string, float],
+            "categories": list[str],
+            "unique_count": int,
+            "unique_ratio": float,
+            "categorical_count": dict[string, int],
+            "gini_impurity": float,
+            "unalikeability": float,
+            "precision": {
+                'min': int,
+                'max': int,
+                'mean': float,
+                'var': float,
+                'std': float,
+                'sample_size': int,
+                'margin_of_error': float,
+                'confidence_level': float
+            },
+            "times": dict[string, float],
+            "format": string
+        },
+        "null_replication_metrics": {
+            "class_prior": list[int],
+            "class_sum": list[list[int]],
+            "class_mean": list[list[int]]
+        }
+    }
+]
+
+
+

(*) Currently the correlation matrix update is toggled off. It will be reset in a later update. Users can still use it as desired with the is_enabled option set to True.

+
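For instance, here is a minimal sketch of re-enabling the correlation computation through profiler options; the exact option path is an assumption based on the "&lt;option&gt;.is_enabled" naming convention used elsewhere in this document.

import pandas as pd
import dataprofiler as dp

# assumed option path; follows the "<option>.is_enabled" convention used in this document
profile_options = dp.ProfilerOptions()
profile_options.set({"correlation.is_enabled": True})

# toy numeric dataframe purely for illustration
my_dataframe = pd.DataFrame([[1, 2.0], [2, 2.2], [3, 3.1]], columns=["a", "b"])
profile = dp.Profiler(my_dataframe, options=profile_options)

report = profile.report(report_options={"output_format": "compact"})
# with the option enabled, the matrix should appear under global_stats per the schema above
print(report["global_stats"]["correlation_matrix"])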

The format for an unstructured profile is below:

+
"global_stats": {
+    "samples_used": int,
+    "empty_line_count": int,
+    "file_type": string,
+    "encoding": string,
+    "memory_size": float, # in MB
+},
+"data_stats": {
+    "data_label": {
+        "entity_counts": {
+            "word_level": dict[string, int],
+            "true_char_level": dict[string, int],
+            "postprocess_char_level": dict[string, int]
+        },
+        "entity_percentages": {
+            "word_level": dict[string, float],
+            "true_char_level": dict[string, float],
+            "postprocess_char_level": dict[string, float]
+        },
+        "times": dict[string, float]
+    },
+    "statistics": {
+        "vocab": list[char],
+        "vocab_count": dict[string, int],
+        "words": list[string],
+        "word_count": dict[string, int],
+        "times": dict[string, float]
+    }
+}
+
+
+

The format for a graph profile is below:

+
"num_nodes": int,
+"num_edges": int,
+"categorical_attributes": list[string],
+"continuous_attributes": list[string],
+"avg_node_degree": float,
+"global_max_component_size": int,
+"continuous_distribution": {
+    "<attribute_1>": {
+        "name": string,
+        "scale": float,
+        "properties": list[float, np.array]
+    },
+    "<attribute_2>": None,
+},
+"categorical_distribution": {
+    "<attribute_1>": None,
+    "<attribute_2>": {
+        "bin_counts": list[int],
+        "bin_edges": list[float]
+    },
+},
+"times": dict[string, float]
+
+
+
+

Supported Data Formats

+
    +
  • Any delimited file (CSV, TSV, etc.)

  • +
  • JSON object

  • +
  • Avro file

  • +
  • Parquet file

  • +
  • Text file

  • +
  • Pandas DataFrame

  • +
  • A URL that points to one of the supported file types above

  • +
+
+
+

Data Labels

+

Data Labels are determined per cell for structured data (column/row when the profiler is used) or at the character level for unstructured data.

+
    +
  • UNKNOWN

  • +
  • ADDRESS

  • +
  • BAN (bank account number, 10-18 digits)

  • +
  • CREDIT_CARD

  • +
  • EMAIL_ADDRESS

  • +
  • UUID

  • +
  • HASH_OR_KEY (md5, sha1, sha256, random hash, etc.)

  • +
  • IPV4

  • +
  • IPV6

  • +
  • MAC_ADDRESS

  • +
  • PERSON

  • +
  • PHONE_NUMBER

  • +
  • SSN

  • +
  • URL

  • +
  • US_STATE

  • +
  • DRIVERS_LICENSE

  • +
  • DATE

  • +
  • TIME

  • +
  • DATETIME

  • +
  • INTEGER

  • +
  • FLOAT

  • +
  • QUANTITY

  • +
  • ORDINAL

  • +
+
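A quick way to confirm which of these entities your installed labeler actually supports is to inspect the structured labeler's label set, as in the sketch below (requires the full DataProfiler[ml] install; the exact set depends on the model shipped with your version).

import dataprofiler as dp

# Load the default structured labeler and print its labels; the entities listed
# above should appear among them, along with their integer label mapping.
labeler = dp.DataLabeler(labeler_type='structured')
print(labeler.labels)
print(labeler.label_mapping)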
+
+
+

Get Started

+
+

Load a File

+

The profiler should automatically identify the file type and load the data into a Data Class.

+

Along with other attributes, the Data class enables structured data to be accessed via a valid Pandas DataFrame.

+
# Load a csv file, return a CSVData object
+csv_data = Data('your_file.csv')
+
+# Print the first 10 rows of the csv file
+print(csv_data.data.head(10))
+
+# Load a parquet file, return a ParquetData object
+parquet_data = Data('your_file.parquet')
+
+# Sort the data by the name column
+parquet_data.data.sort_values(by='name', inplace=True)
+
+# Print the sorted first 10 rows of the parquet data
+print(parquet_data.data.head(10))
+
+
+

If the file type is not automatically identified (rare), you can specify it explicitly; see the Data Readers section.

+
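As a brief sketch, the file type can be forced rather than auto-detected; the CSVData reader shown later in this document is the fully explicit route, and the data_type keyword on the generic Data factory is an assumption here.

from dataprofiler import Data
from dataprofiler.data_readers.csv_data import CSVData

# Fully explicit: use the CSV reader class directly (file name is a placeholder)
csv_data = CSVData("your_file.csv", options={"delimiter": ","})

# Or hint the type to the generic Data factory (data_type keyword assumed)
csv_data = Data("your_file.csv", data_type="csv")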
+
+

Profile a File

+

This example uses a CSV file, but JSON, Avro, Parquet, or text files should also work.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load file (CSV should be automatically identified)
+data = Data("your_file.csv")
+
+# Profile the dataset
+profile = Profiler(data)
+
+# Generate a report and use json to prettify.
+report  = profile.report(report_options={"output_format":"pretty"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+

Updating Profiles

+

Currently, the data profiler is equipped to update its profile in batches.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load and profile a CSV file
+data = Data("your_file.csv")
+profile = Profiler(data)
+
+# Update the profile with new data:
+new_data = Data("new_data.csv")
+profile.update_profile(new_data)
+
+# Print the report using json to prettify.
+report  = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Merging Profiles

+

If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator.

+

This also enables profiles to be determined in a distributed manner.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load a CSV file with a schema
+data1 = Data("file_a.csv")
+profile1 = Profiler(data1)
+
+# Load another CSV file with the same schema
+data2 = Data("file_b.csv")
+profile2 = Profiler(data2)
+
+profile3 = profile1 + profile2
+
+# Print the report using json to prettify.
+report  = profile3.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Profile a Pandas DataFrame

+
import pandas as pd
+import dataprofiler as dp
+import json
+
+my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]])
+profile = dp.Profiler(my_dataframe)
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+# read a specified column, in this case it is labeled 0:
+print(json.dumps(report["data_stats"][0], indent=4))
+
+
+
+
+

Unstructured Profiler

+

In addition to the structured profiler, the Data Profiler provides unstructured profiling for the TextData object or string. Unstructured profiling also works with list(string), pd.Series(string), or pd.DataFrame(string) when the profiler_type option is specified as unstructured. Below is an example of an unstructured profile with a text file.

+
import dataprofiler as dp
+import json
+my_text = dp.Data('text_file.txt')
+profile = dp.Profiler(my_text)
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+

Another example of an unstructured profile, using a pd.Series of strings, is given below:

+
import dataprofiler as dp
+import pandas as pd
+import json
+
+text_data = pd.Series(['first string', 'second string'])
+profile = dp.Profiler(text_data, profiler_type="unstructured")
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Graph Profiler

+

DataProfiler also provides the ability to profile graph data from a csv file. Below is an example of the graph profiler with a graph data csv file:

+
import dataprofiler as dp
+import pprint
+
+my_graph = dp.Data('graph_file.csv')
+profile = dp.Profiler(my_graph)
+
+# print the report using pretty print (json dump does not work on numpy array values inside dict)
+report = profile.report()
+printer = pprint.PrettyPrinter(sort_dicts=False, compact=True)
+printer.pprint(report)
+
+
+
+
+

Specifying a Filetype or Delimiter

+

Example of specifying a CSV data type, with a "," delimiter. In addition, it utilizes only the first 10,000 rows.

+
import json
+import os
+from dataprofiler import Data, Profiler
+from dataprofiler.data_readers.csv_data import CSVData
+
+# Load a CSV file, with "," as the delimiter
+data = CSVData("your_file.csv", options={"delimiter": ","})
+
+# Split the data, such that only the first 10,000 rows are used
+data = data.data[0:10000]
+
+# Read in profile and print results
+profile = Profiler(data)
+print(json.dumps(profile.report(report_options={"output_format":"pretty"}), indent=4))
+
+
+
+
+
+
+
+
+
+
+
+

Versions

+ +
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/install.html b/docs/0.12.0/html/install.html new file mode 100644 index 000000000..41dfa4ddf --- /dev/null +++ b/docs/0.12.0/html/install.html @@ -0,0 +1,393 @@ + + + + + + + + + Install - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Install

+

To install the full package from pypi:

+
pip install DataProfiler[ml]
+
+
+

If the ML requirements are too strict (say, you don’t want to install tensorflow), you can install a slimmer package. The slimmer package disables the default sensitive data detection / entity recognition (labeler).

+

Install from pypi:

+
pip install DataProfiler
+
+
+
+

Snappy Installation

+

This is required to profile parquet/avro datasets

+

MacOS (intel chip) with homebrew:

+
brew install snappy && CPPFLAGS="-I/usr/local/include -L/usr/local/lib" pip install python-snappy
+
+
+

MacOS (apple chip) with homebrew:

+
brew install snappy && CPPFLAGS="-I/opt/homebrew/include -L/opt/homebrew/lib" pip install python-snappy
+
+
+

Linux install:

+
sudo apt-get -y install libsnappy-dev
+
+
+
+
+

Build From Scratch

+

NOTE: Installation for python3

+

virtualenv install:

+
python3 -m pip install virtualenv
+
+
+

Setup virtual env:

+
python3 -m virtualenv --python=python3 venv3
+source venv3/bin/activate
+
+
+

Install requirements:

+
pip3 install -r requirements.txt
+
+
+

Install labeler dependencies:

+
pip3 install -r requirements-ml.txt
+
+
+

Install via the repo – Build setup.py and install locally:

+
python3 setup.py sdist bdist bdist_wheel
+pip3 install dist/DataProfiler*-py3-none-any.whl
+
+
+

If you see:

+
ERROR: Double requirement given:dataprofiler==X.Y.Z from dataprofiler/dist/DataProfiler-X.Y.Z-py3-none-any.whl (already in dataprofiler==X2.Y2.Z2 from dataprofiler/dist/DataProfiler-X2.Y2.Z2-py3-none-any.whl, name='dataprofiler')
+
+
+

This means that you have multiple versions of the DataProfiler distribution in the dist folder. To resolve, either remove the older one or delete the folder and rerun the steps above.

+
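One possible way to resolve it is a clean rebuild, sketched below with the same build commands used above; adjust paths to your checkout.

# remove previously built distributions, then rebuild and reinstall
rm -rf dist/
python3 setup.py sdist bdist bdist_wheel
pip3 install dist/DataProfiler*-py3-none-any.whl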

Install via github:

+
pip3 install git+https://github.com/capitalone/dataprofiler.git#egg=dataprofiler
+
+
+
+
+

Testing

+

For testing, install test requirements:

+
pip3 install -r requirements-test.txt
+
+
+

To run all unit tests, use:

+
DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py"
+
+
+

To run a single file of unit tests, use the form:

+
DATAPROFILER_SEED=0 python3 -m unittest discover -p test_profile_builder.py
+
+
+

To run a file with pytest, use:

+
DATAPROFILER_SEED=0 pytest dataprofiler/tests/data_readers/test_csv_data.py -v
+
+
+

To run an individual unit test, use the form:

+
DATAPROFILER_SEED=0 python3 -m unittest dataprofiler.tests.profilers.test_profile_builder.TestProfiler
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/labeler.html b/docs/0.12.0/html/labeler.html new file mode 100644 index 000000000..ea9bb189e --- /dev/null +++ b/docs/0.12.0/html/labeler.html @@ -0,0 +1,958 @@ + + + + + + + + + Sensitive Data Detection with the Labeler - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Sensitive Data Detection with the Labeler

+

In this example, we utilize the Labeler component of the Data Profiler to detect sensitive information in both structured and unstructured data. In addition, we show how to train the Labeler on a specific dataset with a different list of entities.

+

First, let’s dive into what the Labeler is.

+
+

What is the Labeler

+

The Labeler is a pipeline designed to make building, training, and predicting with ML models quick and easy. There are 3 major components to the Labeler: the preprocessor, the model, and the postprocessor.

+

alt text

+

Each component can be switched out individually to suit your needs. As you might expect, the preprocessor takes in raw data and prepares it for the model, the model performs the prediction or training, and the postprocessor takes prediction results and turns them into human-readable results.

+

Now let’s run some examples. Start by importing all the requirements.

+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+import pandas as pd
+
+try:
+    sys.path.insert(0, '..')
+    import dataprofiler as dp
+except ImportError:
+    import dataprofiler as dp
+
+# remove extra tf logging
+import tensorflow as tf
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+
+
+
+
+
+

Structured Data Prediction

+

We’ll use a small school dataset from the test folder for this example. First, look at the data using the Data Reader class of the Data Profiler. This dataset is from the US Department of Education, found here!

+
+
[ ]:
+
+
+
+data = dp.Data("../dataprofiler/tests/data/csv/SchoolDataSmall.csv")
+df_data = data.data
+df_data.head()
+
+
+
+

We can directly predict the labels of a structured dataset on the cell level.

+
+
[ ]:
+
+
+
+labeler = dp.DataLabeler(labeler_type='structured')
+
+# print out the labels and label mapping
+print("Labels: {}".format(labeler.labels))
+print("\n")
+print("Label Mapping: {}".format(labeler.label_mapping))
+print("\n")
+
+# make predictions and get labels for each cell going row by row
+# predict options are model dependent and the default model can show prediction confidences
+predictions = labeler.predict(data, predict_options={"show_confidences": True})
+
+# display prediction results
+print("Predictions: {}".format(predictions['pred']))
+print("\n")
+
+# display confidence results
+print("Confidences: {}".format(predictions['conf']))
+
+
+
+

The profiler uses the Labeler to perform column-by-column predictions. The data contains 11 columns, each of which has a data label. Next, we will use the Labeler of the Data Profiler to predict the label for each column in this tabular dataset. Since we are only going to demo the labeling functionality, other options of the Data Profiler are disabled to keep this quick.

+
+
[ ]:
+
+
+
+# helper functions for printing results
+
+def get_structured_results(results):
+    """Helper function to get data labels for each column."""
+    columns = []
+    predictions = []
+    samples = []
+    for col in results['data_stats']:
+        columns.append(col['column_name'])
+        predictions.append(col['data_label'])
+        samples.append(col['samples'])
+
+    df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions, 'Sample': samples})
+    return df_results
+
+def get_unstructured_results(data, results):
+    """Helper function to get data labels for each labeled piece of text."""
+    labeled_data = []
+    for pred in results['pred'][0]:
+        labeled_data.append([data[0][pred[0]:pred[1]], pred[2]])
+    label_df = pd.DataFrame(labeled_data, columns=['Text', 'Labels'])
+    return label_df
+
+
+pd.set_option('display.width', 100)
+
+
+
+
+
[ ]:
+
+
+
+# set options to only run the labeler
+profile_options = dp.ProfilerOptions()
+profile_options.set({"structured_options.text.is_enabled": False,
+                     "int.is_enabled": False,
+                     "float.is_enabled": False,
+                     "order.is_enabled": False,
+                     "category.is_enabled": False,
+                     "chi2_homogeneity.is_enabled": False,
+                     "datetime.is_enabled": False,})
+
+profile = dp.Profiler(data, options=profile_options)
+
+results = profile.report()
+print(get_structured_results(results))
+
+
+
+

In this example, the results show that the Data Profiler is able to detect integers, URLs, addresses, and floats appropriately. Unknown is typically a string of text, which is appropriate for those columns.

+
+
+

Unstructured Data Prediction

+

Besides structured data, the Labeler detects sensitive information in unstructured text. We use a sample spam email from the Enron email dataset for this demo. As above, we start by investigating the content of the given email sample.

+
+
[ ]:
+
+
+
+# load data
+data = "Message-ID: <11111111.1111111111111.JavaMail.evans@thyme>\n" + \
+        "Date: Fri, 10 Aug 2005 11:31:37 -0700 (PDT)\n" + \
+        "From: w..smith@company.com\n" + \
+        "To: john.smith@company.com\n" + \
+        "Subject: RE: ABC\n" + \
+        "Mime-Version: 1.0\n" + \
+        "Content-Type: text/plain; charset=us-ascii\n" + \
+        "Content-Transfer-Encoding: 7bit\n" + \
+        "X-From: Smith, Mary W. </O=ENRON/OU=NA/CN=RECIPIENTS/CN=SSMITH>\n" + \
+        "X-To: Smith, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JSMITH>\n" + \
+        "X-cc: \n" + \
+        "X-bcc: \n" + \
+        "X-Folder: \SSMITH (Non-Privileged)\Sent Items\n" + \
+        "X-Origin: Smith-S\n" + \
+        "X-FileName: SSMITH (Non-Privileged).pst\n\n" + \
+        "All I ever saw was the e-mail from the office.\n\n" + \
+        "Mary\n\n" + \
+        "-----Original Message-----\n" + \
+        "From:   Smith, John  \n" + \
+        "Sent:   Friday, August 10, 2005 13:07 PM\n" + \
+        "To:     Smith, Mary W.\n" + \
+        "Subject:        ABC\n\n" + \
+        "Have you heard any more regarding the ABC sale? I guess that means that " + \
+        "it's no big deal here, but you think they would have send something.\n\n\n" + \
+        "John Smith\n" + \
+        "123-456-7890\n"
+
+# convert string data to list to feed into the labeler
+data = [data]
+
+
+
+

By default, the Labeler predicts the results at the character level for unstructured text.

+
+
[ ]:
+
+
+
+labeler = dp.DataLabeler(labeler_type='unstructured')
+
+# make predictions and get labels per character
+predictions = labeler.predict(data)
+
+# display results
+print(predictions['pred'])
+
+
+
+

In addition to the character-level results, the Labeler provides results at the word level following the standard NER (Named Entity Recognition) format, e.g., as utilized by spaCy.

+
+
[ ]:
+
+
+
+# convert prediction to word format and ner format
+# Set the output to the NER format (start position, end position, label)
+labeler.set_params(
+    { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } }
+)
+
+# make predictions and get labels per character
+predictions = labeler.predict(data)
+
+# display results
+print('\n')
+print('=======================Prediction======================\n')
+for pred in predictions['pred'][0]:
+    print('{}: {}'.format(data[0][pred[0]: pred[1]], pred[2]))
+    print('--------------------------------------------------------')
+
+
+
+

Here, the Labeler is able to identify sensitive information such as datetime, email address, person names, and phone number in an email sample.

+
+
+

Train the Labeler from Scratch

+

The Labeler can be trained from scratch with a new list of labels. Below, we show an example of training the Labeler on a dataset with labels given as the columns of that dataset. For brevity’s sake, let’s only train a few epochs with a subset of a dataset.

+
+
[ ]:
+
+
+
+data = dp.Data("../dataprofiler/tests/data/csv/SchoolDataSmall.csv")
+df = data.data[["OPEID6", "INSTURL", "SEARCH_STRING"]]
+df.head()
+
+# split data to training and test set
+split_ratio = 0.2
+df = df.sample(frac=1).reset_index(drop=True)
+data_train = df[:int((1 - split_ratio) * len(df))]
+data_test = df[int((1 - split_ratio) * len(df)):]
+
+# train a new labeler with column names as labels
+if not os.path.exists('data_labeler_saved'):
+    os.makedirs('data_labeler_saved')
+
+labeler = dp.train_structured_labeler(
+    data=data_train,
+    save_dirpath="data_labeler_saved",
+    epochs=10,
+    default_label="OPEID6"
+)
+
+
+
+
+

The trained Labeler is then used by the Data Profiler to provide the prediction on the new dataset.

+
+
[ ]:
+
+
+
+# predict with the labeler object
+profile_options.set({'structured_options.data_labeler.data_labeler_object': labeler})
+profile = dp.Profiler(data_test, options=profile_options)
+
+# get the prediction from the data profiler
+results = profile.report()
+print(get_structured_results(results))
+
+
+
+

Another way to use the trained Labeler is through the directory path of the saved labeler.

+
+
[ ]:
+
+
+
+# predict with the labeler loaded from path
+profile_options.set({'structured_options.data_labeler.data_labeler_dirpath': 'data_labeler_saved'})
+profile = dp.Profiler(data_test, options=profile_options)
+
+# get the prediction from the data profiler
+results = profile.report()
+print(get_structured_results(results))
+
+
+
+
+
+

Transfer Learning a Labeler

+

Instead of training a model from scratch, we can also use transfer learning to improve the model and/or extend the labels. Again for brevity’s sake, let’s only train a few epochs with a small dataset at the cost of accuracy.

+
+
[ ]:
+
+
+
+data = dp.Data("../dataprofiler/tests/data/csv/SchoolDataSmall.csv")
+df_data = data.data[["OPEID6", "INSTURL", "SEARCH_STRING"]]
+
+
+# prep data
+df_data = df_data.reset_index(drop=True).melt()
+df_data.columns = [1, 0]  # labels=1, values=0 in that order
+df_data = df_data.astype(str)
+new_labels = df_data[1].unique().tolist()
+
+# load structured Labeler w/ trainable set to True
+labeler = dp.DataLabeler(labeler_type='structured', trainable=True)
+
+# Reconstruct the model to add each new label
+for label in new_labels:
+    labeler.add_label(label)
+
+# this will use transfer learning to retrain the labeler on your new
+# dataset and labels.
+# Setting labels with a list of labels or label mapping will overwrite the existing labels with new ones
+# Setting the reset_weights parameter to false allows transfer learning to occur
+model_results = labeler.fit(x=df_data[0], y=df_data[1], validation_split=0.2,
+                                 epochs=10, labels=None, reset_weights=False)
+
+
+
+

Let’s display the training results of the last epoch:

+
+
[ ]:
+
+
+
+print("{:16s}  Precision  Recall  F1-score  Support".format(""))
+for item in model_results[-1][2]:
+    print("{:16s}  {:4.3f}      {:4.3f}   {:4.3f}     {:7.0f}".format(item,
+                                                                      model_results[-1][2][item]["precision"],
+                                                                      model_results[-1][2][item]["recall"],
+                                                                      model_results[-1][2][item]["f1-score"],
+                                                                      model_results[-1][2][item]["support"]))
+
+
+
+

It is now trained to detect additional labels! The model results here show the training accuracy for all labels. Since only new labels existed in the dataset, only the new labels are given accuracy scores. Keep in mind this is a small dataset for brevity’s sake and that real training would involve more samples and better results.

+
+
+

Saving and Loading a Labeler

+

The Labeler can easily be saved or loaded with one simple line.

+
+
[ ]:
+
+
+
+# Ensure save directory exists
+if not os.path.exists('my_labeler'):
+    os.makedirs('my_labeler')
+
+# Saving the labeler
+labeler.save_to_disk("my_labeler")
+
+# Loading the labeler
+labeler = dp.DataLabeler(labeler_type='structured', dirpath="my_labeler")
+
+
+
+
+
+

Building a Labeler from the Ground Up

+

As mentioned earlier, the labeler is composed of three components, and each of the components can be created and interchanged in the labeler pipeline.

+
+
[ ]:
+
+
+
+import random
+from dataprofiler.labelers.character_level_cnn_model import \
+    CharacterLevelCnnModel
+from dataprofiler.labelers.data_processing import \
+    StructCharPreprocessor, StructCharPostprocessor
+
+model = CharacterLevelCnnModel({"PAD":0, "UNKNOWN":1, "Test_Label":2})
+preprocessor = StructCharPreprocessor()
+postprocessor = StructCharPostprocessor()
+
+labeler = dp.DataLabeler(labeler_type='structured')
+labeler.set_preprocessor(preprocessor)
+labeler.set_model(model)
+labeler.set_postprocessor(postprocessor)
+
+# check for basic compatibility between the processors and the model
+labeler.check_pipeline()
+
+# Optionally set the parameters
+parameters={
+    'preprocessor':{
+        'max_length': 100,
+    },
+    'model':{
+        'max_length': 100,
+    },
+    'postprocessor':{
+        'random_state': random.Random(1)
+    }
+}
+labeler.set_params(parameters)
+
+labeler.help()
+
+
+
+

The components can each be created if you inherit the BaseModel and BaseProcessor for the model and processors, respectively. More info on coding your own components can be found in the Labeler section of the documentation. In summary, the Data Profiler open source library can be used to scan for sensitive information in both structured and unstructured data with different file types. It supports multiple input and output formats at the word and character levels. Users can also train the labeler on their own datasets.

+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/labeler.ipynb b/docs/0.12.0/html/labeler.ipynb new file mode 100644 index 000000000..af31b68c5 --- /dev/null +++ b/docs/0.12.0/html/labeler.ipynb @@ -0,0 +1,650 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "spoken-reunion", + "metadata": {}, + "source": [ + "# Sensitive Data Detection with the Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "interesting-bidder", + "metadata": {}, + "source": [ + "In this example, we utilize the Labeler component of the Data Profiler to detect the sensitive information for both structured and unstructured data. In addition, we show how to train the Labeler on some specific dataset with different list of entities.\n", + "\n", + "First, let's dive into what the Labeler is." + ] + }, + { + "cell_type": "markdown", + "id": "1965b83b", + "metadata": {}, + "source": [ + "## What is the Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "388c643f", + "metadata": {}, + "source": [ + "The Labeler is a pipeline designed to make building, training, and predictions with ML models quick and easy. There are 3 major components to the Labeler: the preprocessor, the model, and the postprocessor." + ] + }, + { + "cell_type": "markdown", + "id": "e5d0aeb4", + "metadata": {}, + "source": [ + "![alt text](DL-Flowchart.png \"Title\")" + ] + }, + { + "cell_type": "markdown", + "id": "550323c7", + "metadata": {}, + "source": [ + "Each component can be switched out individually to suit your needs. As you might expect, the preprocessor takes in raw data and prepares it for the model, the model performs the prediction or training, and the postprocessor takes prediction results and turns them into human-readable results. \n", + "\n", + "Now let's run some examples. Start by importing all the requirements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "scientific-stevens", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "\n", + "# remove extra tf loggin\n", + "import tensorflow as tf\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "5125b215", + "metadata": {}, + "source": [ + "## Structured Data Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "wicked-devon", + "metadata": {}, + "source": [ + "We'll use the aws honeypot dataset in the test folder for this example. First, look at the data using the Data Reader class of the Data Profiler. This dataset is from the US department of educations, [found here!](https://data.ed.gov/dataset/college-scorecard-all-data-files-through-6-2020/resources?resource=823ac095-bdfc-41b0-b508-4e8fc3110082)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adjusted-native", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df_data = data.data\n", + "df_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ab6ccf8a", + "metadata": {}, + "source": [ + "We can directly predict the labels of a structured dataset on the cell level." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19529af4", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='structured')\n", + "\n", + "# print out the labels and label mapping\n", + "print(\"Labels: {}\".format(labeler.labels)) \n", + "print(\"\\n\")\n", + "print(\"Label Mapping: {}\".format(labeler.label_mapping))\n", + "print(\"\\n\")\n", + "\n", + "# make predictions and get labels for each cell going row by row\n", + "# predict options are model dependent and the default model can show prediction confidences\n", + "predictions = labeler.predict(data, predict_options={\"show_confidences\": True})\n", + "\n", + "# display prediction results\n", + "print(\"Predictions: {}\".format(predictions['pred']))\n", + "print(\"\\n\")\n", + "\n", + "# display confidence results\n", + "print(\"Confidences: {}\".format(predictions['conf']))" + ] + }, + { + "cell_type": "markdown", + "id": "2af72e2c", + "metadata": {}, + "source": [ + "The profiler uses the Labeler to perform column by column predictions. The data contains 11 columns, each of which has data label. Next, we will use the Labeler of the Data Profiler to predict the label for each column in this tabular dataset. Since we are only going to demo the labeling functionality, other options of the Data Profiler are disabled to keep this quick." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6cb9d7e-149a-4cfe-86f8-76c47c57aeea", + "metadata": {}, + "outputs": [], + "source": [ + "# helper functions for printing results\n", + "\n", + "def get_structured_results(results):\n", + " \"\"\"Helper function to get data labels for each column.\"\"\"\n", + " columns = []\n", + " predictions = []\n", + " samples = []\n", + " for col in results['data_stats']:\n", + " columns.append(col['column_name'])\n", + " predictions.append(col['data_label'])\n", + " samples.append(col['samples'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions, 'Sample': samples})\n", + " return df_results\n", + "\n", + "def get_unstructured_results(data, results):\n", + " \"\"\"Helper function to get data labels for each labeled piece of text.\"\"\"\n", + " labeled_data = []\n", + " for pred in results['pred'][0]:\n", + " labeled_data.append([data[0][pred[0]:pred[1]], pred[2]])\n", + " label_df = pd.DataFrame(labeled_data, columns=['Text', 'Labels'])\n", + " return label_df\n", + " \n", + "\n", + "pd.set_option('display.width', 100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "secret-million", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# set options to only run the labeler\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"structured_options.text.is_enabled\": False, \n", + " \"int.is_enabled\": False, \n", + " \"float.is_enabled\": False, \n", + " \"order.is_enabled\": False, \n", + " \"category.is_enabled\": False, \n", + " \"chi2_homogeneity.is_enabled\": False,\n", + " \"datetime.is_enabled\": False,})\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "\n", + "results = profile.report() \n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "fatty-louisville", + "metadata": {}, + "source": [ + "In this example, the results show that the Data Profiler is able to detect integers, URLs, address, and floats appropriately. 
Unknown is typically strings of text, which is appropriate for those columns." + ] + }, + { + "cell_type": "markdown", + "id": "unavailable-diploma", + "metadata": {}, + "source": [ + "## Unstructured Data Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "metallic-coaching", + "metadata": {}, + "source": [ + "Besides structured data, the Labeler detects the sensitive information on the unstructured text. We use a sample of spam email in Enron email dataset for this demo. As above, we start investigating the content of the given email sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unauthorized-lounge", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# load data\n", + "data = \"Message-ID: <11111111.1111111111111.JavaMail.evans@thyme>\\n\" + \\\n", + " \"Date: Fri, 10 Aug 2005 11:31:37 -0700 (PDT)\\n\" + \\\n", + " \"From: w..smith@company.com\\n\" + \\\n", + " \"To: john.smith@company.com\\n\" + \\\n", + " \"Subject: RE: ABC\\n\" + \\\n", + " \"Mime-Version: 1.0\\n\" + \\\n", + " \"Content-Type: text/plain; charset=us-ascii\\n\" + \\\n", + " \"Content-Transfer-Encoding: 7bit\\n\" + \\\n", + " \"X-From: Smith, Mary W. \\n\" + \\\n", + " \"X-To: Smith, John \\n\" + \\\n", + " \"X-cc: \\n\" + \\\n", + " \"X-bcc: \\n\" + \\\n", + " \"X-Folder: \\SSMITH (Non-Privileged)\\Sent Items\\n\" + \\\n", + " \"X-Origin: Smith-S\\n\" + \\\n", + " \"X-FileName: SSMITH (Non-Privileged).pst\\n\\n\" + \\\n", + " \"All I ever saw was the e-mail from the office.\\n\\n\" + \\\n", + " \"Mary\\n\\n\" + \\\n", + " \"-----Original Message-----\\n\" + \\\n", + " \"From: Smith, John \\n\" + \\\n", + " \"Sent: Friday, August 10, 2005 13:07 PM\\n\" + \\\n", + " \"To: Smith, Mary W.\\n\" + \\\n", + " \"Subject: ABC\\n\\n\" + \\\n", + " \"Have you heard any more regarding the ABC sale? I guess that means that \" + \\\n", + " \"it's no big deal here, but you think they would have send something.\\n\\n\\n\" + \\\n", + " \"John Smith\\n\" + \\\n", + " \"123-456-7890\\n\"\n", + "\n", + "# convert string data to list to feed into the labeler\n", + "data = [data]" + ] + }, + { + "cell_type": "markdown", + "id": "concerned-segment", + "metadata": {}, + "source": [ + "By default, the Labeler predicts the results at the character level for unstructured text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "junior-acrobat", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='unstructured')\n", + "\n", + "# make predictions and get labels per character\n", + "predictions = labeler.predict(data)\n", + "\n", + "# display results\n", + "print(predictions['pred'])" + ] + }, + { + "cell_type": "markdown", + "id": "individual-diabetes", + "metadata": {}, + "source": [ + "In addition to the character-level result, the Labeler provides the results at the word level following the standard NER (Named Entity Recognition), e.g., utilized by spaCy. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "optical-universe", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# convert prediction to word format and ner format\n", + "# Set the output to the NER format (start position, end position, label)\n", + "labeler.set_params(\n", + " { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } } \n", + ")\n", + "\n", + "# make predictions and get labels per character\n", + "predictions = labeler.predict(data)\n", + "\n", + "# display results\n", + "print('\\n')\n", + "print('=======================Prediction======================\\n')\n", + "for pred in predictions['pred'][0]:\n", + " print('{}: {}'.format(data[0][pred[0]: pred[1]], pred[2]))\n", + " print('--------------------------------------------------------')" + ] + }, + { + "cell_type": "markdown", + "id": "behavioral-tourism", + "metadata": {}, + "source": [ + "Here, the Labeler is able to identify sensitive information such as datetime, email address, person names, and phone number in an email sample. " + ] + }, + { + "cell_type": "markdown", + "id": "nasty-disney", + "metadata": {}, + "source": [ + "## Train the Labeler from Scratch" + ] + }, + { + "cell_type": "markdown", + "id": "destroyed-twist", + "metadata": {}, + "source": [ + "The Labeler can be trained from scratch with a new list of labels. Below, we show an example of training the Labeler on a dataset with labels given as the columns of that dataset. For brevity's sake, let's only train a few epochs with a subset of a dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "utility-evaluation", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df = data.data[[\"OPEID6\", \"INSTURL\", \"SEARCH_STRING\"]]\n", + "df.head()\n", + "\n", + "# split data to training and test set\n", + "split_ratio = 0.2\n", + "df = df.sample(frac=1).reset_index(drop=True)\n", + "data_train = df[:int((1 - split_ratio) * len(df))]\n", + "data_test = df[int((1 - split_ratio) * len(df)):]\n", + "\n", + "# train a new labeler with column names as labels\n", + "if not os.path.exists('data_labeler_saved'):\n", + " os.makedirs('data_labeler_saved')\n", + "\n", + "labeler = dp.train_structured_labeler(\n", + " data=data_train,\n", + " save_dirpath=\"data_labeler_saved\",\n", + " epochs=10,\n", + " default_label=\"OPEID6\"\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "utility-torture", + "metadata": {}, + "source": [ + "The trained Labeler is then used by the Data Profiler to provide the prediction on the new dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "answering-panel", + "metadata": {}, + "outputs": [], + "source": [ + "# predict with the labeler object\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': labeler})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "polish-stand", + "metadata": {}, + "source": [ + "Another way to use the trained Labeler is through the directory path of the saved labeler." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "industrial-characterization", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# predict with the labeler loaded from path\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_dirpath': 'data_labeler_saved'})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "2acedba0", + "metadata": {}, + "source": [ + "## Transfer Learning a Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "2f15fb1f", + "metadata": {}, + "source": [ + "Instead of training a model from scratch, we can also transfer learn to improve the model and/or extend the labels. Again for brevity's sake, let's only train a few epochs with a small dataset at the cost of accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0104c374", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df_data = data.data[[\"OPEID6\", \"INSTURL\", \"SEARCH_STRING\"]]\n", + "\n", + "\n", + "# prep data\n", + "df_data = df_data.reset_index(drop=True).melt()\n", + "df_data.columns = [1, 0] # labels=1, values=0 in that order\n", + "df_data = df_data.astype(str)\n", + "new_labels = df_data[1].unique().tolist()\n", + "\n", + "# load structured Labeler w/ trainable set to True\n", + "labeler = dp.DataLabeler(labeler_type='structured', trainable=True)\n", + "\n", + "# Reconstruct the model to add each new label\n", + "for label in new_labels:\n", + " labeler.add_label(label)\n", + "\n", + "# this will use transfer learning to retrain the labeler on your new\n", + "# dataset and labels.\n", + "# Setting labels with a list of labels or label mapping will overwrite the existing labels with new ones\n", + "# Setting the reset_weights parameter to false allows transfer learning to occur\n", + "model_results = labeler.fit(x=df_data[0], y=df_data[1], validation_split=0.2, \n", + " epochs=10, labels=None, reset_weights=False)" + ] + }, + { + "cell_type": "markdown", + "id": "ae78745f", + "metadata": {}, + "source": [ + "Let's display the training results of the last epoch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b764aa8c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"{:16s} Precision Recall F1-score Support\".format(\"\"))\n", + "for item in model_results[-1][2]:\n", + " print(\"{:16s} {:4.3f} {:4.3f} {:4.3f} {:7.0f}\".format(item,\n", + " model_results[-1][2][item][\"precision\"],\n", + " model_results[-1][2][item][\"recall\"],\n", + " model_results[-1][2][item][\"f1-score\"],\n", + " model_results[-1][2][item][\"support\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "44009522", + "metadata": {}, + "source": [ + "It is now trained to detect additional labels! The model results here show all the labels training accuracy. Since only new labels existed in the dataset, only the new labels are given accuracy scores. Keep in mind this is a small dataset for brevity's sake and that real training would involve more samples and better results." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e110ee1c", + "metadata": {}, + "source": [ + "## Saving and Loading a Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "c484d193", + "metadata": {}, + "source": [ + "The Labeler can easily be saved or loaded with one simple line." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d8684fa", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure save directory exists\n", + "if not os.path.exists('my_labeler'):\n", + " os.makedirs('my_labeler')\n", + "\n", + "# Saving the labeler\n", + "labeler.save_to_disk(\"my_labeler\")\n", + "\n", + "# Loading the labeler\n", + "labeler = dp.DataLabeler(labeler_type='structured', dirpath=\"my_labeler\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d36dec8", + "metadata": {}, + "source": [ + "## Building a Labeler from the Ground Up" + ] + }, + { + "cell_type": "markdown", + "id": "59346d2b", + "metadata": {}, + "source": [ + "As mentioned earlier, the labeler is comprised of three components, and each of the compenents can be created and interchanged in the the labeler pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6506ef97", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "from dataprofiler.labelers.character_level_cnn_model import \\\n", + " CharacterLevelCnnModel\n", + "from dataprofiler.labelers.data_processing import \\\n", + " StructCharPreprocessor, StructCharPostprocessor\n", + "\n", + "model = CharacterLevelCnnModel({\"PAD\":0, \"UNKNOWN\":1, \"Test_Label\":2})\n", + "preprocessor = StructCharPreprocessor()\n", + "postprocessor = StructCharPostprocessor()\n", + "\n", + "labeler = dp.DataLabeler(labeler_type='structured')\n", + "labeler.set_preprocessor(preprocessor)\n", + "labeler.set_model(model)\n", + "labeler.set_postprocessor(postprocessor)\n", + "\n", + "# check for basic compatibility between the processors and the model\n", + "labeler.check_pipeline()\n", + "\n", + "# Optionally set the parameters\n", + "parameters={\n", + " 'preprocessor':{\n", + " 'max_length': 100,\n", + " },\n", + " 'model':{\n", + " 'max_length': 100,\n", + " },\n", + " 'postprocessor':{\n", + " 'random_state': random.Random(1)\n", + " }\n", + "} \n", + "labeler.set_params(parameters)\n", + "\n", + "labeler.help()" + ] + }, + { + "cell_type": "markdown", + "id": "5f020d7f", + "metadata": {}, + "source": [ + "The components can each be created if you inherit the BaseModel and BaseProcessor for the model and processors, respectively. More info can be found about coding your own components in the Labeler section of the [documentation]( https://capitalone.github.io/dataprofiler). In summary, the Data Profiler open source library can be used to scan sensitive information in both structured and unstructured data with different file types. It supports multiple input formats and output formats at word and character levels. Users can also train the labeler on their own datasets." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/merge_profile_list.html b/docs/0.12.0/html/merge_profile_list.html new file mode 100644 index 000000000..792ff54c0 --- /dev/null +++ b/docs/0.12.0/html/merge_profile_list.html @@ -0,0 +1,639 @@ + + + + + + + + + Merge List of Profiles - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Merge List of Profiles

+

This is an example of a new utility in the dataprofiler for distributed merging of profile objects. It assumes the user provides a list of profile objects to the utility function, which merges all the profiles together.

+
+

Imports

+

Let’s start by importing the necessary packages…

+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+
+import pandas as pd
+import tensorflow as tf
+
+try:
+    sys.path.insert(0, '..')
+    import dataprofiler as dp
+    from dataprofiler.profilers.profiler_utils import merge_profile_list
+except ImportError:
+    import dataprofiler as dp
+    from dataprofiler.profilers.profiler_utils import merge_profile_list
+
+# remove extra tf logging
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+
+
+
+
+
+

Setup the Data and Profiler

+

This section shows the basic example of the Data Profiler.

+
    +
  1. Instantiate a Pandas dataframe with dummy data

  2. +
  3. Pass the dataframe to the Profiler and instantiate two separate profilers in a list

  4. +
+
+
[ ]:
+
+
+
+d = {'col1': [1, 2], 'col2': [3, 4]}
+df = pd.DataFrame(data=d)
+
+list_of_profiles = [dp.Profiler(df), dp.Profiler(df)]
+
+
+
+

Take a look at the list of profiles…

+
+
[ ]:
+
+
+
+list_of_profiles
+
+
+
+
+
+

Run Merge on List of Profiles

+

Now let’s merge the list of profiles into a single_profile

+
+
[ ]:
+
+
+
+single_profile = merge_profile_list(list_of_profiles=list_of_profiles)
+
+
+
+

And check out the .report on the single profile:

+
+
[ ]:
+
+
+
+single_profile.report()
+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/merge_profile_list.ipynb b/docs/0.12.0/html/merge_profile_list.ipynb new file mode 100644 index 000000000..7a6d8005a --- /dev/null +++ b/docs/0.12.0/html/merge_profile_list.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "60af5256", + "metadata": {}, + "source": [ + "# Merge List of Profiles\n", + "\n", + "This is an example of a new utils in the dataprofiler for distributed merging of profile objects. This assumes the user is providing a list of profile objects to the utils function for merging all the profiles together." + ] + }, + { + "cell_type": "markdown", + "id": "7eee37ff", + "metadata": {}, + "source": [ + "## Imports\n", + "\n", + "Let's start by importing the necessary packages..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0d27009", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + " from dataprofiler.profilers.profiler_utils import merge_profile_list\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + " from dataprofiler.profilers.profiler_utils import merge_profile_list\n", + "\n", + "# remove extra tf loggin\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "b4369e64", + "metadata": {}, + "source": [ + "## Setup the Data and Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "410c3c4d", + "metadata": {}, + "source": [ + "This section shows the basic example of the Data Profiler. \n", + "\n", + "1. Instantiate a Pandas dataframe with dummy data\n", + "2. Pass the dataframe to the `Profiler` and instantiate two separate profilers in a list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3567c82", + "metadata": {}, + "outputs": [], + "source": [ + "d = {'col1': [1, 2], 'col2': [3, 4]}\n", + "df = pd.DataFrame(data=d)\n", + "\n", + "list_of_profiles = [dp.Profiler(df), dp.Profiler(df)]" + ] + }, + { + "cell_type": "markdown", + "id": "350502eb", + "metadata": {}, + "source": [ + "Take a look at the list of profiles... 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b649db32", + "metadata": {}, + "outputs": [], + "source": [ + "list_of_profiles" + ] + }, + { + "cell_type": "markdown", + "id": "4ed4fc12", + "metadata": {}, + "source": [ + "## Run Merge on List of Profiles\n", + "\n", + "Now let's merge the list of profiles into a `single_profile`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a636047", + "metadata": {}, + "outputs": [], + "source": [ + "single_profile = merge_profile_list(list_of_profiles=list_of_profiles)" + ] + }, + { + "cell_type": "markdown", + "id": "0aa88720", + "metadata": {}, + "source": [ + "And check out the `.report` on the single profile:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34059c21", + "metadata": {}, + "outputs": [], + "source": [ + "single_profile.report()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dataprofiler", + "language": "python", + "name": "dataprofiler" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/modules.html b/docs/0.12.0/html/modules.html new file mode 100644 index 000000000..331db6797 --- /dev/null +++ b/docs/0.12.0/html/modules.html @@ -0,0 +1,304 @@ + + + + + + + + + dataprofiler - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+ + +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/objects.inv b/docs/0.12.0/html/objects.inv new file mode 100644 index 000000000..216ab0a1c Binary files /dev/null and b/docs/0.12.0/html/objects.inv differ diff --git a/docs/0.12.0/html/overview.html b/docs/0.12.0/html/overview.html new file mode 100644 index 000000000..16a6fe278 --- /dev/null +++ b/docs/0.12.0/html/overview.html @@ -0,0 +1,850 @@ + + + + + + + + + Data Profiler - What’s in your data? - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Data Profiler - What’s in your data?

+

This introductory jupyter notebook demonstrates the basic usage of the Data Profiler. The library is designed to easily detect sensitive data and gather statistics on your datasets with just several lines of code. The Data Profiler can handle several different data types including: CSV (or any delimited file), JSON, Parquet, AVRO, and text. Additionally, there are a plethora of options to customize your profile. This library also has the ability to update profiles from multiple batches of large datasets, or merge multiple profiles. In particular, this example covers the following:

+
    +
  • Basic usage of the Data Profiler

  • +
  • The data reader class

  • +
  • Profiler options

  • +
  • Updating profiles and merging profiles

  • +
+

First, let’s import the libraries needed for this example.

+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+import pandas as pd
+import matplotlib.pyplot as plt
+sys.path.insert(0, '..')
+import dataprofiler as dp
+
+data_path = "../dataprofiler/tests/data"
+
+
+
+
+

Basic Usage of the Data Profiler

+

This section shows the basic example of the Data Profiler. A CSV dataset is read using the data reader, then the Data object is given to the Data Profiler to detect sensitive data and obtain the statistics.

+
+
[ ]:
+
+
+
+# use data reader to read input data
+data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
+print(data.data.head())
+
+# run data profiler and get the report
+profile = dp.Profiler(data)
+report  = profile.report(report_options={"output_format":"compact"})
+
+# print the report
+print(json.dumps(report, indent=4))
+
+
+
+

The report includes global_stats and data_stats for the given dataset. The former contains overall properties of the data such as the number of rows/columns, null ratio, and duplicate ratio, while the latter contains specific properties and statistics for each column such as the detected data label, min, max, mean, variance, etc. In this example, the compact format of the report is used to shorten the full list of the results. To get more results related to detailed predictions at the entity level from the Data Labeler component, or histogram results, the pretty format should be used.

+
+
+

Data reader class

+

DataProfiler can detect multiple file types including CSV (or any delimited file), JSON, Parquet, AVRO, and text. The example below shows that it successfully detects data types from multiple categories regardless of the file extensions.

+
+
[ ]:
+
+
+
+# use data reader to read input data with different file types
+csv_files = [
+    "csv/aws_honeypot_marx_geo.csv",
+    "csv/all-strings-skip-header-author.csv", # csv files with the author/description on the first line
+    "csv/sparse-first-and-last-column-empty-first-row.txt", # csv file with the .txt extension
+]
+json_files = [
+    "json/complex_nested.json",
+    "json/honeypot_intentially_mislabeled_file.csv", # json file with the .csv extension
+]
+parquet_files = [
+    "parquet/nation.dict.parquet",
+    "parquet/nation.plain.intentionally_mislabled_file.csv", # parquet file with the .csv extension
+]
+avro_files = [
+    "avro/userdata1.avro",
+    "avro/userdata1_intentionally_mislabled_file.json", # avro file with the .json extension
+]
+text_files = [
+    "txt/discussion_reddit.txt",
+]
+
+all_files = {
+    "csv": csv_files,
+    "json": json_files,
+    "parquet": parquet_files,
+    "avro": avro_files,
+    "text": text_files
+}
+
+for file_type in all_files:
+    print(file_type)
+    for file in all_files[file_type]:
+        data = dp.Data(os.path.join(data_path, file))
+        print("{:<85} {:<15}".format(file, data.data_type))
+    print("\n")
+
+
+
+

The Data class detects the file type and uses one of the following classes: CSVData, JSONData, ParquetData, AVROData, TextData. Users can call these specific classes directly if desired. For example, below we provide a collection of data with different types, each of them is processed by the corresponding data class.

+
+
[ ]:
+
+
+
+# use individual data reader classes
+from dataprofiler.data_readers.csv_data import CSVData
+from dataprofiler.data_readers.json_data import JSONData
+from dataprofiler.data_readers.parquet_data import ParquetData
+from dataprofiler.data_readers.avro_data import AVROData
+from dataprofiler.data_readers.text_data import TextData
+
+csv_files = "csv/aws_honeypot_marx_geo.csv"
+json_files = "json/complex_nested.json"
+parquet_files = "parquet/nation.dict.parquet"
+avro_files = "avro/userdata1.avro"
+text_files = "txt/discussion_reddit.txt"
+
+all_files = {
+    "csv": [csv_files, CSVData],
+    "json": [json_files, JSONData],
+    "parquet": [parquet_files, ParquetData],
+    "avro": [avro_files, AVROData],
+    "text": [text_files, TextData],
+}
+
+for file_type in all_files:
+    file, data_reader = all_files[file_type]
+    data = data_reader(os.path.join(data_path, file))
+    print("File name {}\n".format(file))
+    if file_type == "text":
+        print(data.data[0][:1000]) # print the first 1000 characters
+    else:
+        print(data.data)
+    print('===============================================================================')
+
+
+
+

In addition to reading the input data from multiple file types, the Data Profiler also accepts input data as a dataframe.

+
+
[ ]:
+
+
+
+# run data profiler and get the report
+my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=["col_int", "col_float"])
+profile = dp.Profiler(my_dataframe)
+report  = profile.report(report_options={"output_format":"compact"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Structured Profiler vs. Unstructured Profiler

+

The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify the profile type as well. Here is an example of the profiler explicitly calling the structured profile and the unstructured profile.

+
+
[ ]:
+
+
+
+# Using the structured profiler
+data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
+profile = dp.Profiler(data, profiler_type='structured')
+
+report = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+# Using the unstructured profiler
+my_dataframe = pd.DataFrame([["Sample1"],["Sample2"],["Sample3"]], columns=["Text_Samples"])
+profile = dp.Profiler(my_dataframe, profiler_type='unstructured')
+
+report  = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Profiler options

+

The Data Profiler can enable/disable statistics and modify features through profiler options. For example, if users only want the statistical information, they may turn off the Data Labeler functionality. Below, let’s remove the histogram, median-related statistics, and the data labeler component while running the Data Profiler.

+
+
[ ]:
+
+
+
+profile_options = dp.ProfilerOptions()
+profile_options.set({"histogram_and_quantiles.is_enabled": False,
+                     "median_abs_deviation.is_enabled": False,
+                     "median.is_enabled": False,
+                     "mode.is_enabled": False,
+                     "data_labeler.is_enabled": False,})
+
+profile = dp.Profiler(my_dataframe, options=profile_options)
+report  = profile.report(report_options={"output_format":"pretty"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+

Besides toggling features on and off, other options like the data labeler sample size or histogram bin method can be directly set and validated as shown here:

+
+
[ ]:
+
+
+
+profile_options = dp.ProfilerOptions()
+profile_options.structured_options.data_labeler.sample_size = 1
+profile_options.structured_options.int.histogram_and_quantiles.bin_count_or_method = "rice"
+# An error will be raised if the options are set incorrectly.
+profile_options.validate()
+
+profile = dp.Profiler(my_dataframe, options=profile_options)
+report  = profile.report(report_options={"output_format":"pretty"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Update profiles

+

One of the interesting features of the Data Profiler is the ability to update profiles from batches of data, which allows for data streaming usage. In this section, the original dataset is separated into two batches of equal size, and the profile is then updated with each batch sequentially.

+

After the update, we expect the resulting profile to give the same statistics as a profile computed from the full dataset. We will verify that through some properties in global_stats of the profiles, including column_count, row_count, row_is_null_ratio, and duplicate_row_count.

+
+
[ ]:
+
+
+
+# read the input data and divide it into two equal halves
+data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
+df = data.data
+df1 = df.iloc[:int(len(df)/2)]
+df2 = df.iloc[int(len(df)/2):]
+
+# Update the profile with the first half
+profile = dp.Profiler(df1)
+
+# Update the profile with the second half
+profile.update_profile(df2)
+
+# Update profile with the full dataset
+profile_full = dp.Profiler(df)
+
+report  = profile.report(report_options={"output_format":"compact"})
+report_full  = profile_full.report(report_options={"output_format":"compact"})
+
+# print the report
+print(json.dumps(report, indent=4))
+print(json.dumps(report_full, indent=4))
+
+
+
+

You can see that the profiles are exactly the same whether they are broken into several updates or not.
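As a quick check, here is a minimal sketch (assuming the report and report_full dictionaries from the cell above) that compares the global_stats properties mentioned earlier:

# compare selected global_stats between the batch-updated profile and the full profile
for key in ["column_count", "row_count", "row_is_null_ratio", "duplicate_row_count"]:
    print(key, report["global_stats"][key], report_full["global_stats"][key])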

+
+
+

Merge profiles

+

In addition to the profile update, Data Profiler provides merging functionality which allows users to combine profiles computed in multiple locations. This enables Data Profiler to be used in a distributed computing environment. Below, we assume that the two aforementioned halves of the original dataset come from two different machines. Each half is then profiled with the Data Profiler on its own machine, and the resulting profiles are merged.

+

As with the profile update, we expect the merged profile to give the same statistics as a profile computed from the full dataset.

+
+
[ ]:
+
+
+
+# Update the profile with the first half
+profile1 = dp.Profiler(df1)
+
+# Update the profile with the second half
+profile2 = dp.Profiler(df2)
+
+# merge profiles
+profile_merge = profile1 + profile2
+
+# check results of the merged profile
+report_merge  = profile_merge.report(report_options={"output_format":"compact"})
+
+# print the report
+print(json.dumps(report_merge, indent=4))
+print(json.dumps(report_full, indent=4))
+
+
+
+

You can see that the profiles are exactly the same!

+
+
+

Conclusion

+

We have walked through some basic examples of Data Profiler usage, with different input data types and profiling options. We also worked with the update and merge functionality of the Data Profiler, which makes it applicable to data streaming and distributed environments. Interested users can try different datasets and functionalities as desired.

+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/overview.ipynb b/docs/0.12.0/html/overview.ipynb new file mode 100644 index 000000000..d5e77abe4 --- /dev/null +++ b/docs/0.12.0/html/overview.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc2826d9", + "metadata": {}, + "source": [ + "# Data Profiler - What's in your data?" + ] + }, + { + "cell_type": "markdown", + "id": "b997522b", + "metadata": {}, + "source": [ + "This introductory jupyter notebook demonstrates the basic usages of the Data Profiler. The library is designed to easily detect sensitive data and gather statistics on your datasets with just several lines of code. The Data Profiler can handle several different data types including: CSV (or any delimited file), JSON, Parquet, AVRO, and text. Additionally, there are a plethora of options to customize your profile. This library also has the ability to update profiles from multiple batches of large datasets, or merge multiple profiles. In particular, this example covers the followings:\n", + "\n", + "- Basic usage of the Data Profiler\n", + "- The data reader class\n", + "- Profiler options\n", + "- Updating profiles and merging profiles\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef404c84", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "markdown", + "id": "f51971e3", + "metadata": {}, + "source": [ + "## Basic Usage of the Data Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "639e66d3", + "metadata": {}, + "source": [ + "This section shows the basic example of the Data Profiler. A CSV dataset is read using the data reader, then the Data object is given to the Data Profiler to detect sensitive data and obtain the statistics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5379c45c", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# use data reader to read input data\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "print(data.data.head())\n", + "\n", + "# run data profiler and get the report\n", + "profile = dp.Profiler(data)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "57fe2827", + "metadata": {}, + "source": [ + "The report includes `global_stats` and `data_stats` for the given dataset. The former contains overall properties of the data such as number of rows/columns, null ratio, duplicate ratio, while the latter contains specific properties and statistics for each column such as detected data label, min, max, mean, variance, etc. In this example, the `compact` format of the report is used to shorten the full list of the results. To get more results related to detailed predictions at the entity level from the Data Labeler component or histogram results, the format `pretty` should be used." 
+ ] + }, + { + "cell_type": "markdown", + "id": "74027cfd", + "metadata": {}, + "source": [ + "## Data reader class" + ] + }, + { + "cell_type": "markdown", + "id": "41364888", + "metadata": {}, + "source": [ + "DataProfiler can detect multiple file types including CSV (or any delimited file), JSON, Parquet, AVRO, and text. The example below shows that it successfully detects data types from multiple categories regardless of the file extensions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "823829f4", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data with different file types\n", + "csv_files = [\n", + " \"csv/aws_honeypot_marx_geo.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "json_files = [\n", + " \"json/complex_nested.json\",\n", + " \"json/honeypot_intentially_mislabeled_file.csv\", # json file with the .csv extension\n", + "]\n", + "parquet_files = [\n", + " \"parquet/nation.dict.parquet\",\n", + " \"parquet/nation.plain.intentionally_mislabled_file.csv\", # parquet file with the .csv extension\n", + "]\n", + "avro_files = [\n", + " \"avro/userdata1.avro\",\n", + " \"avro/userdata1_intentionally_mislabled_file.json\", # avro file with the .json extension\n", + "]\n", + "text_files = [\n", + " \"txt/discussion_reddit.txt\",\n", + "]\n", + "\n", + "all_files = {\n", + " \"csv\": csv_files,\n", + " \"json\": json_files,\n", + " \"parquet\": parquet_files,\n", + " \"avro\": avro_files,\n", + " \"text\": text_files\n", + "}\n", + "\n", + "for file_type in all_files:\n", + " print(file_type)\n", + " for file in all_files[file_type]:\n", + " data = dp.Data(os.path.join(data_path, file))\n", + " print(\"{:<85} {:<15}\".format(file, data.data_type))\n", + " print(\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f9d7e02", + "metadata": {}, + "source": [ + "The `Data` class detects the file type and uses one of the following classes: `CSVData`, `JSONData`, `ParquetData`, `AVROData`, `TextData`. Users can call these specific classes directly if desired. For example, below we provide a collection of data with different types, each of them is processed by the corresponding data class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "831e68a3", + "metadata": {}, + "outputs": [], + "source": [ + "# use individual data reader classes\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "from dataprofiler.data_readers.json_data import JSONData\n", + "from dataprofiler.data_readers.parquet_data import ParquetData\n", + "from dataprofiler.data_readers.avro_data import AVROData\n", + "from dataprofiler.data_readers.text_data import TextData\n", + "\n", + "csv_files = \"csv/aws_honeypot_marx_geo.csv\"\n", + "json_files = \"json/complex_nested.json\"\n", + "parquet_files = \"parquet/nation.dict.parquet\"\n", + "avro_files = \"avro/userdata1.avro\"\n", + "text_files = \"txt/discussion_reddit.txt\"\n", + "\n", + "all_files = {\n", + " \"csv\": [csv_files, CSVData],\n", + " \"json\": [json_files, JSONData],\n", + " \"parquet\": [parquet_files, ParquetData],\n", + " \"avro\": [avro_files, AVROData],\n", + " \"text\": [text_files, TextData],\n", + "}\n", + "\n", + "for file_type in all_files:\n", + " file, data_reader = all_files[file_type]\n", + " data = data_reader(os.path.join(data_path, file))\n", + " print(\"File name {}\\n\".format(file))\n", + " if file_type == \"text\":\n", + " print(data.data[0][:1000]) # print the first 1000 characters\n", + " else:\n", + " print(data.data)\n", + " print('===============================================================================')" + ] + }, + { + "cell_type": "markdown", + "id": "572df0a8", + "metadata": {}, + "source": [ + "In addition to reading the input data from multiple file types, the Data Profiler allows the input data as a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df87ab83", + "metadata": {}, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=[\"col_int\", \"col_float\"])\n", + "profile = dp.Profiler(my_dataframe)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "84a06312", + "metadata": {}, + "source": [ + "## Structured Profiler vs. Unstructured Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "4c0ea925", + "metadata": {}, + "source": [ + "The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify profile type as well. Here is an example of the the profiler explicitly calling the structured profile and the unstructured profile." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f4565d8", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Using the structured profiler\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data, profiler_type='structured')\n", + "\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))\n", + "\n", + "# Using the unstructured profiler\n", + "my_dataframe = pd.DataFrame([[\"Sample1\"],[\"Sample2\"],[\"Sample3\"]], columns=[\"Text_Samples\"])\n", + "profile = dp.Profiler(my_dataframe, profiler_type='unstructured')\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b16648ba", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "8b0cc8ad", + "metadata": {}, + "source": [ + "The Data Profiler can enable/disable statistics and modify features through profiler options. For example, if the users only want the statistics information, they may turn off the Data Labeler functionality. Below, let's remove the histogram and data labeler component while running Data Profiler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbac3a2c", + "metadata": {}, + "outputs": [], + "source": [ + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"histogram_and_quantiles.is_enabled\": False,\n", + " \"median_abs_deviation.is_enabled\": False,\n", + " \"median.is_enabled\": False,\n", + " \"mode.is_enabled\": False,\n", + " \"data_labeler.is_enabled\": False,})\n", + "\n", + "profile = dp.Profiler(my_dataframe, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "590ca50b", + "metadata": {}, + "source": [ + "Besides toggling on and off features, other options like the data labeler sample size or histogram bin method can be directly set and validated as shown here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ed21bc1", + "metadata": {}, + "outputs": [], + "source": [ + "profile_options = dp.ProfilerOptions()\n", + "profile_options.structured_options.data_labeler.sample_size = 1\n", + "profile_options.structured_options.int.histogram_and_quantiles.bin_count_or_method = \"rice\"\n", + "# An error will raise if the options are set incorrectly.\n", + "profile_options.validate()\n", + "\n", + "profile = dp.Profiler(my_dataframe, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "9f690616", + "metadata": {}, + "source": [ + "## Update profiles" + ] + }, + { + "cell_type": "markdown", + "id": "965f8c85", + "metadata": {}, + "source": [ + "One of the interesting features of the Data Profiler is the ability to update profiles from batches of data, which allows for data streaming usage. In this section, the original dataset is separated into two batches with equal size. Each batch is then updated with Data Profiler sequentially. 
\n", + "\n", + "After the update, we expect the resulted profiles give the same statistics as the profiles updated from the full dataset. We will verify that through some properties in `global_stats` of the profiles including `column_count`, `row_count`, `row_is_null_ratio`, `duplicate_row_count`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34ac4346", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# read the input data and devide it into two equal halves\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "df = data.data\n", + "df1 = df.iloc[:int(len(df)/2)]\n", + "df2 = df.iloc[int(len(df)/2):]\n", + "\n", + "# Update the profile with the first half\n", + "profile = dp.Profiler(df1)\n", + "\n", + "# Update the profile with the second half\n", + "profile.update_profile(df2)\n", + "\n", + "# Update profile with the full dataset\n", + "profile_full = dp.Profiler(df)\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "report_full = profile_full.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report, indent=4))\n", + "print(json.dumps(report_full, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b41ee2bf", + "metadata": {}, + "source": [ + "You can see that the profiles are exactly the same whether they are broken into several updates or not." + ] + }, + { + "cell_type": "markdown", + "id": "c547f051", + "metadata": {}, + "source": [ + "## Merge profiles" + ] + }, + { + "cell_type": "markdown", + "id": "a5292962", + "metadata": {}, + "source": [ + "In addition to the profile update, Data Profiler provides the merging functionality which allows users to combine the profiles updated from multiple locations. This enables Data Profiler to be used in a distributed computing environment. Below, we assume that the two aforementioned halves of the original dataset come from two different machines. Each of them is then updated with the Data Profiler on the same machine, then the resulted profiles are merged.\n", + "\n", + "As with the profile update, we expect the merged profiles give the same statistics as the profiles updated from the full dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a565b8d1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Update the profile with the first half\n", + "profile1 = dp.Profiler(df1)\n", + "\n", + "# Update the profile with the second half\n", + "profile2 = dp.Profiler(df2)\n", + "\n", + "# merge profiles\n", + "profile_merge = profile1 + profile2\n", + "\n", + "# check results of the merged profile\n", + "report_merge = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report_merge, indent=4))\n", + "print(json.dumps(report_full, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b77fac3f", + "metadata": {}, + "source": [ + "You can see that the profiles are exactly the same!" + ] + }, + { + "cell_type": "markdown", + "id": "c644ee42", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "We have walked through some basic examples of Data Profiler usage, with different input data types and profiling options. We also work with update and merging functionality of the Data Profiler, which make it applicable for data streaming and distributed environment. 
Interested users can try with different datasets and functionalities as desired." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/popmon_dp_loader_example.html b/docs/0.12.0/html/popmon_dp_loader_example.html new file mode 100644 index 000000000..ca308ac4b --- /dev/null +++ b/docs/0.12.0/html/popmon_dp_loader_example.html @@ -0,0 +1,805 @@ + + + + + + + + + Dataloader with Popmon Reports - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Dataloader with Popmon Reports

+

This demo covers the usage of popmon with the dataloader from the dataprofiler.

+

This demo covers the following:

+
- How to install popmon
+- Comparison of the dynamic dataloader from dataprofiler to the
+    standard dataloader used in pandas
+- Popmon's usage example using both dataloaders
+- Dataprofiler's examples using both dataloaders
+- Usage of the pm_stability_report function (popmon reports)
+
+
+
+

How to Install Popmon

+

To install popmon you can use the command below:

+

pip3 install popmon

+

From here, we can import the libraries needed for this demo.

+
+
[ ]:
+
+
+
+import os
+import sys
+try:
+    sys.path.insert(0, '..')
+    import dataprofiler as dp
+except ImportError:
+    import dataprofiler as dp
+import pandas as pd
+import popmon  # noqa
+
+
+
+
+
+

Comparison of Dataloaders

+

First, we have the original pandas dataloading, which works for specific file types. This is good if the data format is known ahead of time, but is less useful for more dynamic cases.

+
+
[ ]:
+
+
+
+def popmon_dataloader(path, time_index):
+    # Load pm dataframe (Can only read csvs unless reader option is changed)
+    if time_index is not None:
+        pm_data = pd.read_csv(path, parse_dates=[time_index])
+    else:
+        time_index = True
+        pm_data = pd.read_csv(path)
+    return pm_data
+
+
+
+

Next, we have the dataprofiler’s dataloader. This allows for the dynamic loading of different data formats, which is super useful when the data format is not known ahead of time. This is intended to be an improvement on the standard dataloader used in pandas.

+
+
[ ]:
+
+
+
+def dp_dataloader(path):
+    # Dataloader from dataprofiler used
+    dp_data = dp.Data(path)
+
+    # Profiler used to ensure proper label for datetime even
+    # when null values exist
+    profiler_options = dp.ProfilerOptions()
+    profiler_options.set({'*.is_enabled': False,  # Runs first disabling all options in profiler
+                          '*.datetime.is_enabled': True})
+    profile = dp.Profiler(dp_data, options=profiler_options)
+
+    # convert any time/datetime types from strings to actual datatime type
+    for ind, col in enumerate(dp_data.data.columns):
+        if profile.profile[ind].profile.get('data_type') == 'datetime':
+            dp_data.data[col] = pd.to_datetime(dp_data.data[col])
+
+    return dp_data.data
+
+
+
+
+
+

Popmon’s usage example using both dataloaders

+

Next, we’ll download a dataset from the resources component

+
+
[ ]:
+
+
+
+import gzip
+import shutil
+popmon_tutorial_data = popmon.resources.data("flight_delays.csv.gz")
+with gzip.open(popmon_tutorial_data, 'rb') as f_in:
+    with open('./flight_delays.csv', 'wb') as f_out:
+        shutil.copyfileobj(f_in, f_out)
+
+
+
+

Finally we read in the data with popmon and print the report to a file

+
+
[ ]:
+
+
+
+# Default csv from popmon example
+path = "./flight_delays.csv"
+time_index = "DATE"
+report_output_dir = "./popmon_output/flight_delays_full"
+if not os.path.exists(report_output_dir):
+    os.makedirs(report_output_dir)
+
+
+
+
+
+
[ ]:
+
+
+
+pm_data = popmon_dataloader(path, time_index)
+
+report_pm_loader = pm_data.pm_stability_report(
+    time_axis=time_index,
+    time_width="1w",
+    time_offset="2015-07-02",
+    extended_report=False,
+    pull_rules={"*_pull": [10, 7, -7, -10]},
+)
+
+# Save popmon reports
+report_pm_loader.to_file(os.path.join(report_output_dir, "popmon_loader_report.html"))
+print("Report printed at:", os.path.join(report_output_dir, "popmon_loader_report.html"))
+
+
+
+

We then do the same for the dataprofiler loader

+
+
[ ]:
+
+
+
+dp_dataframe = dp_dataloader(path)
+# Generate pm report using dp dataloader
+report_dp_loader = dp_dataframe.pm_stability_report(
+    time_axis=time_index,
+    time_width="1w",
+    time_offset="2015-07-02",
+    extended_report=False,
+    pull_rules={"*_pull": [10, 7, -7, -10]},
+)
+
+# Save popmon reports
+report_dp_loader.to_file(os.path.join(report_output_dir, "dataprofiler_loader_report.html"))
+print("Report printed at:", os.path.join(report_output_dir, "dataprofiler_loader_report.html"))
+
+
+
+
+
+

Examples of data

+

Next, we’ll use some data from the test files of the data profiler to compare the dynamic loading of the dataprofiler’s data loader to that of the standard pandas approach.

+
+
+

Dataprofiler’s examples using both dataloaders

+

To execute this properly, simply choose one of the 3 examples below and then run the report generation that follows.

+
+
[ ]:
+
+
+
+# Default csv from popmon example (mini version)
+path = "../dataprofiler/tests/data/csv/flight_delays.csv"
+time_index = "DATE"
+report_output_dir = "./popmon_output/flight_delays_mini"
+
+
+
+
+
[ ]:
+
+
+
+# Random csv from dataprofiler tests
+path = "../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv"
+time_index = "datetime"
+report_output_dir = "./popmon_output/aws_honeypot_marx_geo"
+
+
+
+
+
[ ]:
+
+
+
+# Random json file from dataprofiler tests
+path = "../dataprofiler/tests/data/json/math.json"
+
+time_index = "data.9"
+report_output_dir = "./popmon_output/math"
+
+
+
+

Run the block below to create an output directory for your popmon reports.

+
+
[ ]:
+
+
+
+if not os.path.exists(report_output_dir):
+    os.makedirs(report_output_dir)
+dp_dataframe = dp_dataloader(path)
+
+
+
+
+
+

Report comparison

+

We generate reports below using different sets of data from the dataprofiler and pandas, using the dataprofiler’s dataloader and popmon’s report generator.

+

The dataprofiler’s dataloader can seamlessly switch between data formats and generate reports with the exact same code in place.

+
+
[ ]:
+
+
+
+# Generate pm report using dp dataloader
+report_dp_loader = dp_dataframe.pm_stability_report(
+    time_axis=time_index,
+    time_width="1w",
+    time_offset="2015-07-02",
+    extended_report=False,
+    pull_rules={"*_pull": [10, 7, -7, -10]},
+)
+
+
+
+

If the dataloaders are valid, you can see the reports and compare them at the output directory specified in the printout below each report generation block (the two code blocks below).

+
+
[ ]:
+
+
+
+# Save dp reports
+report_dp_loader.to_file(os.path.join(report_output_dir, "dataprofiler_loader_report.html"))
+print("Report printed at:", os.path.join(report_output_dir, "dataprofiler_loader_report.html"))
+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/popmon_dp_loader_example.ipynb b/docs/0.12.0/html/popmon_dp_loader_example.ipynb new file mode 100644 index 000000000..3ddb267da --- /dev/null +++ b/docs/0.12.0/html/popmon_dp_loader_example.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7f0cceea", + "metadata": {}, + "source": [ + "# Dataloader with Popmon Reports" + ] + }, + { + "cell_type": "markdown", + "id": "9e79d9c5", + "metadata": {}, + "source": [ + "This demo is to cover the usage of popmon with the dataloader from the dataprofiler\n", + "\n", + "This demo covers the followings:\n", + "\n", + " - How to install popmon\n", + " - Comparison of the dynamic dataloader from dataprofiler to the \n", + " standard dataloader used in pandas\n", + " - Popmon's usage example using both dataloaders\n", + " - Dataprofiler's examples using both dataloaders\n", + " - Usage of the pm_stability_report function (popmon reports)\n" + ] + }, + { + "cell_type": "markdown", + "id": "aec2198a", + "metadata": {}, + "source": [ + "## How to Install Popmon\n", + "To install popmon you can use the command below:" + ] + }, + { + "cell_type": "markdown", + "id": "4383ed2a", + "metadata": {}, + "source": [ + "`pip3 install popmon`\n" + ] + }, + { + "cell_type": "markdown", + "id": "91dedc34", + "metadata": {}, + "source": [ + "From here, we can import the libararies needed for this demo." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2adec556", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "import pandas as pd\n", + "import popmon # noqa" + ] + }, + { + "cell_type": "markdown", + "id": "2ed532ec", + "metadata": {}, + "source": [ + "## Comparison of Dataloaders" + ] + }, + { + "cell_type": "markdown", + "id": "cccbf4cd", + "metadata": {}, + "source": [ + "First, we have the original pandas dataloading which works for specific file types. \n", + "This is good for if the data format is known ahead of time but is less useful for more dynamic cases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96e9ff89", + "metadata": {}, + "outputs": [], + "source": [ + "def popmon_dataloader(path, time_index):\n", + " # Load pm dataframe (Can only read csvs unless reader option is changed)\n", + " if not time_index is None:\n", + " pm_data = pd.read_csv(path, parse_dates=[time_index])\n", + " else:\n", + " time_index = True\n", + " pm_data = pd.read_csv(path)\n", + " return pm_data" + ] + }, + { + "cell_type": "markdown", + "id": "16dfbe10", + "metadata": {}, + "source": [ + "Next, we have the dataprofiler's dataloader. This allows for the dynamic loading of different data formats which is super useful when the data format is not know ahead of time.\n", + "This is intended to be an improvement on the dataloader standardly used in pandas." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07481259", + "metadata": {}, + "outputs": [], + "source": [ + "def dp_dataloader(path):\n", + " # Datalaoder from dataprofiler used\n", + " dp_data = dp.Data(path)\n", + " \n", + " # Profiler used to ensure proper label for datetime even \n", + " # when null values exist\n", + " profiler_options = dp.ProfilerOptions()\n", + " profiler_options.set({'*.is_enabled': False, # Runs first disabling all options in profiler\n", + " '*.datetime.is_enabled': True})\n", + " profile = dp.Profiler(dp_data, options=profiler_options)\n", + "\n", + " # convert any time/datetime types from strings to actual datatime type\n", + " for ind, col in enumerate(dp_data.data.columns):\n", + " if profile.profile[ind].profile.get('data_type') == 'datetime':\n", + " dp_data.data[col] = pd.to_datetime(dp_data.data[col])\n", + "\n", + " return dp_data.data" + ] + }, + { + "cell_type": "markdown", + "id": "69a8ea9b", + "metadata": {}, + "source": [ + "## Popmon's usage example using both dataloaders" + ] + }, + { + "cell_type": "markdown", + "id": "ff914ca7", + "metadata": {}, + "source": [ + "Next, we'll download a dataset from the resources component" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bff33da8", + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "import shutil\n", + "popmon_tutorial_data = popmon.resources.data(\"flight_delays.csv.gz\")\n", + "with gzip.open(popmon_tutorial_data, 'rb') as f_in:\n", + " with open('./flight_delays.csv', 'wb') as f_out:\n", + " shutil.copyfileobj(f_in, f_out)" + ] + }, + { + "cell_type": "markdown", + "id": "19222c4a", + "metadata": {}, + "source": [ + "Finally we read in the data with popmon and print the report to a file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0090a2f3", + "metadata": {}, + "outputs": [], + "source": [ + "# Default csv from popmon example\n", + "path = \"./flight_delays.csv\"\n", + "time_index = \"DATE\"\n", + "report_output_dir = \"./popmon_output/flight_delays_full\"\n", + "if not os.path.exists(report_output_dir):\n", + " os.makedirs(report_output_dir)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0abcd9b", + "metadata": {}, + "outputs": [], + "source": [ + "pm_data = popmon_dataloader(path, time_index)\n", + "\n", + "report_pm_loader = pm_data.pm_stability_report(\n", + " time_axis=time_index,\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " pull_rules={\"*_pull\": [10, 7, -7, -10]},\n", + ")\n", + "\n", + "# Save popmon reports\n", + "report_pm_loader.to_file(os.path.join(report_output_dir, \"popmon_loader_report.html\"))\n", + "print(\"Report printed at:\", os.path.join(report_output_dir, \"popmon_loader_report.html\"))" + ] + }, + { + "cell_type": "markdown", + "id": "2303b5cf", + "metadata": {}, + "source": [ + "We then do the same for the dataprofiler loader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2854383", + "metadata": {}, + "outputs": [], + "source": [ + "dp_dataframe = dp_dataloader(path)\n", + "# Generate pm report using dp dataloader\n", + "report_dp_loader = dp_dataframe.pm_stability_report(\n", + " time_axis=time_index,\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " pull_rules={\"*_pull\": [10, 7, -7, -10]},\n", + ")\n", + "\n", + "# Save popmon reports\n", + "report_dp_loader.to_file(os.path.join(report_output_dir, 
\"dataprofiler_loader_report.html\"))\n", + "print(\"Report printed at:\", os.path.join(report_output_dir, \"dataprofiler_loader_report.html\"))" + ] + }, + { + "cell_type": "markdown", + "id": "8cc4e5f3", + "metadata": {}, + "source": [ + "## Examples of data\n", + "Next, We'll use some data from the test files of the data profiler to compare the dynamic loading of the dataprofiler's data loader to that of the standard pandas approach. \n" + ] + }, + { + "cell_type": "markdown", + "id": "352eaeea", + "metadata": {}, + "source": [ + "## Dataprofiler's examples using both dataloaders" + ] + }, + { + "cell_type": "markdown", + "id": "e99af913", + "metadata": {}, + "source": [ + "To execute this properly, simply choose one of the 3 examples below and then run the report generation below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80eb601d", + "metadata": {}, + "outputs": [], + "source": [ + "# Default csv from popmon example (mini version)\n", + "path = \"../dataprofiler/tests/data/csv/flight_delays.csv\"\n", + "time_index = \"DATE\"\n", + "report_output_dir = \"./popmon_output/flight_delays_mini\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c127288", + "metadata": {}, + "outputs": [], + "source": [ + "# Random csv from dataprofiler tests\n", + "path = \"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\"\n", + "time_index = \"datetime\"\n", + "report_output_dir = \"./popmon_output/aws_honeypot_marx_geo\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cd5c385", + "metadata": {}, + "outputs": [], + "source": [ + "# Random json file from dataprofiler tests\n", + "path = \"../dataprofiler/tests/data/json/math.json\"\n", + "\n", + "time_index = \"data.9\"\n", + "report_output_dir = \"./popmon_output/math\"" + ] + }, + { + "cell_type": "markdown", + "id": "ec860cb7", + "metadata": {}, + "source": [ + "Run the block below to create an output directory for your popmon reports." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf21835c", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(report_output_dir):\n", + " os.makedirs(report_output_dir)\n", + "dp_dataframe = dp_dataloader(path)" + ] + }, + { + "cell_type": "markdown", + "id": "479975a5", + "metadata": {}, + "source": [ + "## Report comparison" + ] + }, + { + "cell_type": "markdown", + "id": "02a355e7", + "metadata": {}, + "source": [ + "We generate reports using different sets of data from the dataprofiler and pandas below using dataprofiler's dataloader and popmons report generator\n" + ] + }, + { + "cell_type": "markdown", + "id": "6ce69145", + "metadata": {}, + "source": [ + "The dataprofiler's dataloader can seemlessly switch between data formats and generate reports with the exact same code in place." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0dcb405", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Generate pm report using dp dataloader\n", + "report_dp_loader = dp_dataframe.pm_stability_report(\n", + " time_axis=time_index,\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " pull_rules={\"*_pull\": [10, 7, -7, -10]},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9eb0035c", + "metadata": {}, + "source": [ + "If the dataloaders are valid, you can see the reports and compare them at the output directory specified in the printout below each report generation block (the two code blocks below)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efe7d8d6", + "metadata": {}, + "outputs": [], + "source": [ + "# Save dp reports\n", + "report_dp_loader.to_file(os.path.join(report_output_dir, \"dataprofiler_loader_report.html\"))\n", + "print(\"Report printed at:\", os.path.join(report_output_dir, \"dataprofiler_loader_report.html\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/profiler.html b/docs/0.12.0/html/profiler.html new file mode 100644 index 000000000..e520acdc0 --- /dev/null +++ b/docs/0.12.0/html/profiler.html @@ -0,0 +1,1414 @@ + + + + + + + + + Profiler - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Profiler

+
+

Profile Your Data

+

Profiling your data is easy. Just use the data reader, send the data to the profiler, and print out the report.

+
import json
+from dataprofiler import Data, Profiler
+
+data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text
+
+profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc
+
+readable_report = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(readable_report, indent=4))
+
+
+

If the data is structured, the profile will return global statistics as well as column-by-column statistics. The full list of statistics is provided on the intro page.
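For example, a minimal sketch (reusing the readable_report dictionary from the snippet above) of pulling out the dataset-level statistics and the statistics of the first column:

# dataset-level statistics
print(json.dumps(readable_report["global_stats"], indent=4))

# column-by-column statistics, here for the first column
print(json.dumps(readable_report["data_stats"][0], indent=4))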

+
+

Load a File

+

The profiler should automatically identify the file type and load the data into a Data Class.

+

Along with other attributes, the Data class enables structured data to be accessed via a valid Pandas DataFrame.

+
# Load a csv file, return a CSVData object
+csv_data = Data('your_file.csv')
+
+# Print the first 10 rows of the csv file
+print(csv_data.data.head(10))
+
+# Load a parquet file, return a ParquetData object
+parquet_data = Data('your_file.parquet')
+
+# Sort the data by the name column
+parquet_data.data.sort_values(by='name', inplace=True)
+
+# Print the sorted first 10 rows of the parquet data
+print(parquet_data.data.head(10))
+
+
+

If the file type is not automatically identified (rare), you can specify it explicitly; see the Data Readers section.
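As a short sketch (assuming the data_type keyword described in the Data Readers section, and a hypothetical file path), the reader can be told the file type explicitly:

from dataprofiler import Data

# hypothetical path: a CSV file that was saved with a .txt extension
data = Data("your_file.txt", data_type="csv")
print(data.data.head())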

+
+
+

Profile a File

+

This example uses a CSV file, but JSON, AVRO, Parquet, or text files should also work.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load file (CSV should be automatically identified)
+data = Data("your_file.csv")
+
+# Profile the dataset
+profile = Profiler(data)
+
+# Generate a report and use json to prettify.
+report  = profile.report(report_options={"output_format": "pretty"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+

Updating Profiles

+

Currently, the data profiler is equipped to update its profile in batches.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load and profile a CSV file
+data = Data("your_file.csv")
+profile = Profiler(data)
+
+# Update the profile with new data:
+new_data = Data("new_data.csv")
+profile.update_profile(new_data)
+
+# Print the report using json to prettify.
+report  = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Merging Profiles

+

If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator.

+

This also enables profiles to be determined in a distributed manner.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load a CSV file with a schema
+data1 = Data("file_a.csv")
+profile1 = Profiler(data1)
+
+# Load another CSV file with the same schema
+data2 = Data("file_b.csv")
+profile2 = Profiler(data2)
+
+profile3 = profile1 + profile2
+
+# Print the report using json to prettify.
+report  = profile3.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Profile Differences

+

Profile differences take two profiles and find the differences between them. Create the difference report like this:

+
from dataprofiler import Data, Profiler
+
+# Load a CSV file
+data1 = Data("file_a.csv")
+profile1 = Profiler(data1)
+
+# Load another CSV file
+data2 = Data("file_b.csv")
+profile2 = Profiler(data2)
+
+diff_report = profile1.diff(profile2)
+print(diff_report)
+
+
+

The .diff() operation is available between two profiles, although the output differs depending on the type of profile being differenced. For example, for numerical column profiles (e.g. integers and floats), valuable calculations that .diff() returns include the t-test, chi2-test, and psi (Population Stability Index) for understanding distributional changes.
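As a sketch (assuming the diff_report dictionary from the example above and a numerical first column), these statistics can be read from each column's entry in the difference report:

# distributional-change statistics for the first column of the difference report
col_diff = diff_report["data_stats"][0]["statistics"]
print(col_diff.get("psi"))
print(col_diff.get("t-test"))
print(col_diff.get("chi2-test"))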

+

The difference report contains a dictionary that mirrors the profile report. Each data type has its own difference:

+
    +
  • Int/Float - One profile subtracts the value from the other.

  • +
  • String - The strings will be shown in a list:

    +
      +
    • [profile1 str, profile2 str]

    • +
    +
  • +
  • List - A list of 3 will be returned showing the unique values of each profile and the shared values:

    +
      +
    • [profile 1 unique values, shared values, profile 2 unique values]

    • +
    +
  • +
  • Dict - Some dictionaries with varied keys will also return a list of three in the format:

    +
      +
    • [profile 1 unique key-values, shared key differences, profile 2 unique key-values]

    • +
    +
  • +
+

Otherwise, when no differences occur:

+
    +
  • Any Type No Differences - A string will report: “unchanged”.

  • +
+

Below is the structured difference report:

+
{
+    'global_stats': {
+        'file_type': [str, str],
+        'encoding': [str, str],
+        'samples_used': int,
+        'column_count': int,
+        'row_count': int,
+        'row_has_null_ratio': float,
+        'row_is_null_ratio': float,
+        'unique_row_ratio': float,
+        'duplicate_row_count': int,
+        'correlation_matrix': list[list[float]],
+        'chi2_matrix': list[list[float]],
+        'profile_schema': list[dict[str, int]]
+    },
+    'data_stats': [{
+        'column_name': str,
+        'data_type': [str, str],
+        'data_label': [list[str], list[str], list[str]],
+        'categorical': [str, str],
+        'order': [str, str],
+        'statistics': {
+            'min': float,
+            'max': float,
+            'sum': float,
+            'mean': float,
+            'median': float,
+            'mode': [list[float], list[float], list[float]],
+            'median_absolute_deviation': float,
+            'variance': float,
+            'stddev': float,
+            't-test': {
+                't-statistic': float,
+                'conservative': {'deg_of_free': int,
+                                 'p-value': float},
+                'welch': {'deg_of_free': float,
+                          'p-value': float}},
+            'psi': float,
+            "chi2-test": {
+                "chi2-statistic": float,
+                "deg_of_free": int,
+                "p-value": float
+            },
+            'unique_count': int,
+            'unique_ratio': float,
+            'categories': [list[str], list[str], list[str]],
+            'gini_impurity': float,
+            'unalikeability': float,
+            'categorical_count': [dict[str, int], dict[str, int], dict[str, int]],
+            'avg_predictions': [dict[str, float]],
+            'label_representation': [dict[str, float]],
+            'sample_size': int,
+            'null_count': int,
+            'null_types': [list[str], list[str], list[str]],
+            'null_types_index': [dict[str, int], dict[str, int], dict[str, int]],
+            'data_type_representation': [dict[str, float]]
+        },
+        "null_replication_metrics": {
+            "class_prior": list[int],
+            "class_sum": list[list[int]],
+            "class_mean": list[list[int]]
+        }
+    }]
+}
+
+
+

Below is the unstructured difference report:

+
{
+    'global_stats': {
+        'file_type': [str, str],
+        'encoding': [str, str],
+        'samples_used': int,
+        'empty_line_count': int,
+        'memory_size': float
+    },
+    'data_stats': {
+        'data_label': {
+            'entity_counts': {
+                'word_level': dict[str, int],
+                'true_char_level': dict[str, int],
+                'postprocess_char_level': dict[str, int]
+            },
+            'entity_percentages': {
+                'word_level': dict[str, float],
+                'true_char_level': dict[str, float],
+                'postprocess_char_level': dict[str, float]
+            }
+        },
+        'statistics': {
+            'vocab': [list[str], list[str], list[str]],
+            'vocab_count': [dict[str, int], dict[str, int], dict[str, int]],
+            'words': [list[str], list[str], list[str]],
+            'word_count': [dict[str, int], dict[str, int], dict[str, int]]
+        }
+    }
+}
+
+
+
+
+

Saving and Loading a Profile

+

The profiles can easily be saved and loaded as shown below:

+

NOTE: Json saving and loading only supports Structured Profiles currently.

+

There are two save/load methods:

+
    +
  • Pickle save/load

    +
      +
    • Save a profile as a .pkl file.

    • +
    • Load a .pkl file as a profile object.

    • +
    +
  • +
+
import json
+from dataprofiler import Data, Profiler
+
+# Load a CSV file, with "," as the delimiter
+data = Data("your_file.csv")
+
+# Read data into profile
+profile = Profiler(data)
+
+# save structured profile to pkl file
+profile.save(filepath="my_profile.pkl")
+
+# load pkl file to structured profile
+loaded_pkl_profile = Profiler.load(filepath="my_profile.pkl")
+
+print(json.dumps(loaded_pkl_profile.report(report_options={"output_format": "compact"}),
+                                       indent=4))
+
+
+
    +
  • Json save/load

    +
      +
    • Save a profile as a human-readable .json file.

    • +
    • Load a .json file as a profile object.

    • +
    +
  • +
+
import json
+from dataprofiler import Data, Profiler
+
+# Load a CSV file, with "," as the delimiter
+data = Data("your_file.csv")
+
+# Read data into profile
+profile = Profiler(data)
+
+# save structured profile to json file
+profile.save(filepath="my_profile.json", save_method="json")
+
+# load json file to structured profile
+loaded_json_profile = Profiler.load(filepath="my_profile.json", load_method="json")
+
+print(json.dumps(loaded_json_profile.report(report_options={"output_format": "compact"}),
+                                       indent=4))
+
+
+
+
+

Structured vs Unstructured Profiles

+

When using the profiler, the data profiler will automatically infer whether to create the structured profile or the unstructured profile. However, you can be explicit as shown below:

+
import json
+from dataprofiler import Data, Profiler
+
+# Creating a structured profile
+data1 = Data("normal_csv_file.csv")
+structured_profile = Profiler(data1, profiler_type="structured")
+
+structured_report = structured_profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(structured_report, indent=4))
+
+# Creating an unstructured profile
+data2 = Data("normal_text_file.txt")
+unstructured_profile = Profiler(data2, profiler_type="unstructured")
+
+unstructured_report = unstructured_profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(unstructured_report, indent=4))
+
+
+
+
+

Setting the Sample Size

+

There are two ways to set the sample size in a profile: samples_per_update and min_true_samples. samples_per_update takes an integer as the exact number of samples that will be used, while min_true_samples sets the minimum number of samples that are not null. For example:

+
from dataprofiler import Profiler
+
+sample_array = [1.0, None, 2.0]
+profile = Profiler(sample_array, samples_per_update=2)
+
+
+

The first two samples (1.0 and None) are used for the statistical analysis.

+

In contrast, if we also set min_true_samples to 2, then the Data Reader will continue to read until the minimum number of true samples is found for the given column. For example:

+
from dataprofiler import Profiler
+
+sample_array = [1.0, None, 2.0]
+profile = Profiler(sample_array, samples_per_update=2, min_true_samples=2)
+
+
+

This will use all samples in the statistical analysis until the number of “true” (non-null) values is reached. Both the min_true_samples and samples_per_update conditions must be met. In this case, the profile will grab the first two samples (1.0 and None) to satisfy samples_per_update, and then it will grab the first two valid samples (1.0 and 2.0) to satisfy min_true_samples.

+
+
+

Profile a Pandas DataFrame

+
import pandas as pd
+import dataprofiler as dp
+import json
+
+my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]])
+profile = dp.Profiler(my_dataframe)
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+# read a specified column, in this case it is labeled 0:
+print(json.dumps(report["data stats"][0], indent=4))
+
+
+
+
+

Specifying a Filetype or Delimiter

+

Example of specifying a CSV data type, with a "," delimiter. In addition, it utilizes only the first 10,000 rows.

+
import json
+from dataprofiler import Data, Profiler
+from dataprofiler.data_readers.csv_data import CSVData
+
+# Load a CSV file, with "," as the delimiter
+data = CSVData("your_file.csv", options={"delimiter": ","})
+
+# Split the data, such that only the first 10,000 rows are used
+data = data.data[0:10000]
+
+# Read in profile and print results
+profile = Profiler(data)
+print(json.dumps(profile.report(report_options={"output_format": "pretty"}), indent=4))
+
+
+
+
+

Setting Profiler Seed

+

Example of specifying a seed for reproducibility.

+
import dataprofiler as dp
+
+# Set seed to non-negative integer value or None
+dp.set_seed(0)
+
+
+
+
+
+

Profile Statistic Descriptions

+
+

Structured Profile

+

global_stats:

+
    +
  • samples_used - number of input data samples used to generate this profile

  • +
  • column_count - the number of columns contained in the input dataset

  • +
  • row_count - the number of rows contained in the input dataset

  • +
  • row_has_null_ratio - the proportion of rows that contain at least one null value to the total number of rows

  • +
  • row_is_null_ratio - the proportion of rows that are fully comprised of null values (null rows) to the total number of rows

  • +
  • unique_row_ratio - the proportion of distinct rows in the input dataset to the total number of rows

  • +
  • duplicate_row_count - the number of rows that occur more than once in the input dataset

  • +
  • file_type - the format of the file containing the input dataset (ex: .csv)

  • +
  • encoding - the encoding of the file containing the input dataset (ex: UTF-8)

  • +
  • correlation_matrix - matrix of shape column_count x column_count containing the correlation coefficients between each column in the dataset

  • +
  • chi2_matrix - matrix of shape column_count x column_count containing the chi-square statistics between each column in the dataset

  • +
  • +
    profile_schema - a description of the format of the input dataset labeling each column and its index in the dataset
      +
    • string - the label of the column in question and its index in the profile schema

    • +
    +
    +
    +
  • +
  • times - the duration of time it took to generate the global statistics for this dataset in milliseconds

  • +
+

data_stats:

+
    +
  • column_name - the label/title of this column in the input dataset

  • +
  • data_type - the primitive python data type that is contained within this column

  • +
  • data_label - the label/entity of the data in this column as determined by the Labeler component

  • +
  • categorical - ‘true’ if this column contains categorical data

  • +
  • order - the way in which the data in this column is ordered, if any, otherwise “random”

  • +
  • samples - a small subset of data entries from this column

  • +
  • +
    statistics - statistical information on the column
      +
    • sample_size - number of input data samples used to generate this profile

    • +
    • null_count - the number of null entries in the sample

    • +
    • null_types - a list of the different null types present within this sample

    • +
    • null_types_index - a dict containing each null type and a respective list of the indices at which it is present within this sample

    • +
    • data_type_representation - the percentage of samples used identifying as each data_type

    • +
    • min - minimum value in the sample

    • +
    • max - maximum value in the sample

    • +
    • mode - mode of the entries in the sample

    • +
    • median - median of the entries in the sample

    • +
    • median_absolute_deviation - the median absolute deviation of the entries in the sample

    • +
    • sum - the total of all sampled values from the column

    • +
    • mean - the average of all entries in the sample

    • +
    • variance - the variance of all entries in the sample

    • +
    • stddev - the standard deviation of all entries in the sample

    • +
    • skewness - the statistical skewness of all entries in the sample

    • +
    • kurtosis - the statistical kurtosis of all entries in the sample

    • +
    • num_zeros - the number of entries in this sample that have the value 0

    • +
    • num_negatives - the number of entries in this sample that have a value less than 0

    • +
    • +
      histogram - contains histogram relevant information
        +
      • bin_counts - the number of entries within each bin

      • +
      • bin_edges - the thresholds of each bin

      • +
      +
      +
      +
    • +
    • quantiles - the value at each percentile in the order they are listed based on the entries in the sample

    • +
    • vocab - a list of the characters used within the entries in this sample

    • +
    • avg_predictions - average of the data label prediction confidences across all data points sampled

    • +
    • categories - a list of each distinct category within the sample if categorical = ‘true’

    • +
    • unique_count - the number of distinct entries in the sample

    • +
    • unique_ratio - the proportion of the number of distinct entries in the sample to the total number of entries in the sample

    • +
    • categorical_count - number of entries sampled for each category if categorical = ‘true’

    • +
    • gini_impurity - measure of how often a randomly chosen element from the set would be incorrectly labeled if it was randomly labeled according to the distribution of labels in the subset

    • +
    • unalikeability - a value denoting how frequently entries differ from one another within the sample

    • +
    • precision - a dict of statistics with respect to the number of digits in a number for each sample

    • +
    • times - the duration of time it took to generate this sample’s statistics in milliseconds

    • +
    • format - list of possible datetime formats

    • +
    +
    +
    +
  • +
  • +
    null_replication_metrics - statistics of data partitioned based on whether column value is null (index 1 of lists referenced by dict keys) or not (index 0)
      +
    • class_prior - a list containing probability of a column value being null and not null

    • +
    • class_sum - a list containing sum of all other rows based on whether column value is null or not

    • +
    • class_mean - a list containing mean of all other rows based on whether column value is null or not

    • +
    +
    +
    +
  • +
+
+
+
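Each entry of data_stats corresponds to one column and follows the layout above. Below is a minimal sketch of iterating over it; the CSV path is a placeholder, and numeric-only fields such as min, max, and histogram appear only for numeric columns.

import dataprofiler as dp

# Placeholder path; substitute your own dataset
data = dp.Data("your_file.csv")
profile = dp.Profiler(data)
report = profile.report(report_options={"output_format": "serializable"})

# data_stats is a list with one entry per column, in column order
for column in report["data_stats"]:
    stats = column["statistics"]
    print(column["column_name"], column["data_type"], column["data_label"])
    print("  null_count:", stats["null_count"],
          "unique_ratio:", stats.get("unique_ratio"))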

Unstructured Profile

+

global_stats:

+
    +
  • samples_used - number of input data samples used to generate this profile

  • +
  • empty_line_count - the number of empty lines in the input data

  • +
  • file_type - the file type of the input data (ex: .txt)

  • +
  • encoding - file encoding of the input data file (ex: UTF-8)

  • +
  • memory_size - size of the input data in MB

  • +
  • times - duration of time it took to generate this profile in milliseconds

  • +
+

data_stats:

+
    +
  • +
    data_label - labels and statistics on the labels of the input data
      +
    • +
      entity_counts - the number of times a specific label or entity appears inside the input data
        +
      • word_level - the number of words counted within each label or entity

      • +
      • true_char_level - the number of characters counted within each label or entity as determined by the model

      • +
      • postprocess_char_level - the number of characters counted within each label or entity as determined by the postprocessor

      • +
      +
      +
      +
    • +
    • +
      entity_percentages - the percentages of each label or entity within the input data
        +
      • word_level - the percentage of words in the input data that are contained within each label or entity

      • +
      • true_char_level - the percentage of characters in the input data that are contained within each label or entity as determined by the model

      • +
      • postprocess_char_level - the percentage of characters in the input data that are contained within each label or entity as determined by the postprocessor

      • +
      +
      +
      +
    • +
    • times - the duration of time it took for the data labeler to predict on the data

    • +
    +
    +
    +
  • +
  • +
    statistics - statistics of the input data
      +
    • vocab - a list of each character in the input data

    • +
    • vocab_count - the number of occurrences of each distinct character in the input data

    • +
    • words - a list of each word in the input data

    • +
    • word_count - the number of occurrences of each distinct word in the input data

    • +
    • times - the duration of time it took to generate the vocab and words statistics in milliseconds

    • +
    +
    +
    +
  • +
+
+
+
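A minimal sketch of producing and reading an unstructured profile follows; the text file path is a placeholder and the key layout is assumed to follow the field descriptions above.

from pprint import pprint

import dataprofiler as dp

# Placeholder path; substitute your own text file
data = dp.Data("your_file.txt")
profile = dp.Profiler(data)  # profile type is inferred as unstructured for text data
report = profile.report(report_options={"output_format": "pretty"})

pprint(report["global_stats"])                            # empty_line_count, file_type, encoding, ...
pprint(report["data_stats"]["statistics"]["word_count"])  # occurrences of each distinct word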

Graph Profile

+
    +
  • num_nodes - number of nodes in the graph

  • +
  • num_edges - number of edges in the graph

  • +
  • categorical_attributes - list of categorical edge attributes

  • +
  • continuous_attributes - list of continuous edge attributes

  • +
  • avg_node_degree - average degree of nodes in the graph

  • +
  • global_max_component_size: size of the global max component

  • +
+

continuous_distribution:

+
    +
  • +
    <attribute_N>: name of N-th edge attribute in list of attributes
      +
    • name - name of distribution for attribute

    • +
    • scale - negative log likelihood used to scale and compare distributions

    • +
    • +
      properties - list of statistical properties describing the distribution
        +
      • [shape (optional), loc, scale, mean, variance, skew, kurtosis]

      • +
      +
      +
      +
    • +
    +
    +
    +
  • +
+

categorical_distribution:

+
    +
  • +
    <attribute_N>: name of N-th edge attribute in list of attributes
      +
    • bin_counts: counts in each bin of the distribution histogram

    • +
    • bin_edges: edges of each bin of the distribution histogram

    • +
    +
    +
    +
  • +
  • times - duration of time it took to generate this profile in milliseconds

  • +
+
+
+
+
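A minimal sketch of generating a graph profile, assuming an edge-list CSV that the Data class detects as graph data; the path is a placeholder and the report keys are assumed to match the field names above.

from pprint import pprint

import dataprofiler as dp

# Placeholder path; substitute a CSV of edges with source/destination columns
data = dp.Data("your_graph.csv")
profile = dp.Profiler(data)  # a graph profile is generated for graph data
report = profile.report()

print(report["num_nodes"], report["num_edges"])
pprint(report["continuous_distribution"])
pprint(report["categorical_distribution"])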

Profile Options

+

The data profiler accepts several options to toggle features on and off. The column option groups (int, float, datetime, text, order, category, and data labeler options) can each be enabled or disabled. By default, all options are toggled on. Below is an example of how to alter these options. Options shared by structured and unstructured profiles must be specified as structured or unstructured when setting (i.e. data labeler options).

+
import json
+from dataprofiler import Data, Profiler, ProfilerOptions
+
+# Load and profile a CSV file
+data = Data("your_file.csv")
+profile_options = ProfilerOptions()
+
+# All of these are different examples of adjusting the profile options
+
+# Options can be toggled directly like this:
+profile_options.structured_options.text.is_enabled = False
+profile_options.structured_options.text.vocab.is_enabled = True
+profile_options.structured_options.int.variance.is_enabled = True
+profile_options.structured_options.data_labeler.data_labeler_dirpath = \
+    "Wheres/My/Datalabeler"
+profile_options.structured_options.data_labeler.is_enabled = False
+
+# A dictionary can be sent in to set the properties for all the options
+profile_options.set({"structured_options.data_labeler.is_enabled": False, "min.is_enabled": False})
+
+# Specific columns can be set/disabled/enabled in the same way
+profile_options.structured_options.text.set({"max.is_enabled":True,
+                                         "variance.is_enabled": True})
+
+# numeric stats can be turned off/on entirely
+profile_options.set({"is_numeric_stats_enabled": False})
+profile_options.set({"int.is_numeric_stats_enabled": False})
+
+profile = Profiler(data, options=profile_options)
+
+# Print the report using json to prettify.
+report  = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+

Below is a breakdown of all the options.

+
    +
  • ProfilerOptions - The top-level options class that contains options for the Profiler class

    +
      +
    • presets - A pre-configured mapping of a string name to group of options:

      +
        +
      • default is None

      • +
      • “complete”

      • +
      +
      options = ProfilerOptions(presets="complete")
      +
      +
      +
        +
      • “data_types”

      • +
      +
      options = ProfilerOptions(presets="data_types")
      +
      +
      +
        +
      • “numeric_stats_disabled”

      • +
      +
      options = ProfilerOptions(presets="numeric_stats_disabled")
      +
      +
      +
        +
      • “lower_memory_sketching”

      • +
      +
      options = ProfilerOptions(presets="lower_memory_sketching")
      +
      +
      +
    • +
    • structured_options - Options responsible for all structured data

      +
        +
      • multiprocess - Option to enable multiprocessing. If on, multiprocessing is toggled on if the dataset contains more than 750,000 rows or more than 20 columns. +Automatically selects the optimal number of pooling processes to utilize based on system constraints when toggled on.

        +
          +
        • is_enabled - (Boolean) Enables or disables multiprocessing

        • +
        +
      • +
      • sampling_ratio - A percentage, as a decimal, ranging from greater than 0 to less than or equal to 1 indicating how much input data to sample. Default value set to 0.2.

      • +
      • int - Options for the integer columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the integer operations

        • +
        • min - Finds minimum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables min

          • +
          +
        • +
        • max - Finds maximum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables max

          • +
          +
        • +
        • mode - Finds mode(s) in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables mode

          • +
          • top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes.

          • +
          +
        • +
        • median - Finds median value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables median

          • +
          +
        • +
        • sum - Finds sum of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables sum

          • +
          +
        • +
        • variance - Finds variance of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables variance

          • +
          +
        • +
        • skewness - Finds skewness of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables skewness

          • +
          +
        • +
        • kurtosis - Finds kurtosis of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables kurtosis

          • +
          +
        • +
        • median_abs_deviation - Finds median absolute deviation of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables median absolute deviation

          • +
          +
        • +
        • num_zeros - Finds the count of zeros in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_zeros

          • +
          +
        • +
        • num_negatives - Finds the count of negative numbers in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_negatives

          • +
          +
        • +
        • bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations

          +
            +
          • is_enabled - (Boolean) Enables or disables bias correction

          • +
          +
        • +
        • histogram_and_quantiles - Generates a histogram and quantiles +from the column values

          +
            +
          • bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. +If left unspecified (None) the optimal method will be chosen by attempting all methods. +If multiple specified (list) the optimal method will be chosen by attempting the provided ones. +methods: ‘auto’, ‘fd’, ‘doane’, ‘scott’, ‘rice’, ‘sturges’, ‘sqrt’ +Note: ‘auto’ is used to choose optimally between ‘fd’ and ‘sturges’

          • +
          • num_quantiles - (Int) Number of quantiles to bin the data. +Default value is set to 1,000 quantiles.

          • +
          • is_enabled - (Boolean) Enables or disables histogram and quantiles

          • +
          +
        • +
        +
      • +
      • float - Options for the float columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the float operations

        • +
        • precision - Finds the precision (significant figures) within the column

          +
            +
          • is_enabled - (Boolean) Enables or disables precision

          • +
          +
        • +
        • sample_ratio - (Float) A ratio between 0 and 1 specifying how much of the data (identified as floats) to utilize as samples when determining precision

        • +
        • min - Finds minimum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables min

          • +
          +
        • +
        • max - Finds maximum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables max

          • +
          +
        • +
        • mode - Finds mode(s) in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables mode

          • +
          • top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes.

          • +
          +
        • +
        • median - Finds median value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables median

          • +
          +
        • +
        • sum - Finds sum of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables sum

          • +
          +
        • +
        • variance - Finds variance of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables variance

          • +
          +
        • +
        • skewness - Finds skewness of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables skewness

          • +
          +
        • +
        • kurtosis - Finds kurtosis of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables kurtosis

          • +
          +
        • +
        • median_abs_deviation - Finds median absolute deviation of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables median absolute deviation

          • +
          +
        • +
        • is_numeric_stats_enabled - (Boolean) enable or disable all numeric stats

        • +
        • num_zeros - Finds the count of zeros in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_zeros

          • +
          +
        • +
        • num_negatives - Finds the count of negative numbers in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_negatives

          • +
          +
        • +
        • bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations

          +
            +
          • is_enabled - (Boolean) Enables or disables bias correction

          • +
          +
        • +
        • histogram_and_quantiles - Generates a histogram and quantiles +from the column values

          +
            +
          • bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. +If left unspecified (None) the optimal method will be chosen by attempting all methods. +If multiple specified (list) the optimal method will be chosen by attempting the provided ones. +methods: ‘auto’, ‘fd’, ‘doane’, ‘scott’, ‘rice’, ‘sturges’, ‘sqrt’ +Note: ‘auto’ is used to choose optimally between ‘fd’ and ‘sturges’

          • +
          • num_quantiles - (Int) Number of quantiles to bin the data. +Default value is set to 1,000 quantiles.

          • +
          • is_enabled - (Boolean) Enables or disables histogram and quantiles

          • +
          +
        • +
        +
      • +
      • text - Options for the text columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the text operations

        • +
        • vocab - Finds all the unique characters used in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables vocab

          • +
          +
        • +
        • min - Finds minimum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables min

          • +
          +
        • +
        • max - Finds maximum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables max

          • +
          +
        • +
        • mode - Finds mode(s) in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables mode

          • +
          • top_k_modes - (Int) Sets the number of modes to return if multiple exist. Default returns max 5 modes.

          • +
          +
        • +
        • median - Finds median value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables median

          • +
          +
        • +
        • sum - Finds sum of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables sum

          • +
          +
        • +
        • variance - Finds variance of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables variance

          • +
          +
        • +
        • skewness - Finds skewness of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables skewness

          • +
          +
        • +
        • kurtosis - Finds kurtosis of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables kurtosis

          • +
          +
        • +
        • median_abs_deviation - Finds median absolute deviation of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables median absolute deviation

          • +
          +
        • +
        • bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations

          +
            +
          • is_enabled - (Boolean) Enables or disables bias correction

          • +
          +
        • +
        • is_numeric_stats_enabled - (Boolean) enable or disable all numeric stats

        • +
        • num_zeros - Finds the count of zeros in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_zeros

          • +
          +
        • +
        • num_negatives - Finds the count of negative numbers in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_negatives

          • +
          +
        • +
        • histogram_and_quantiles - Generates a histogram and quantiles +from the column values

          +
            +
          • bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. +If left unspecified (None) the optimal method will be chosen by attempting all methods. +If multiple specified (list) the optimal method will be chosen by attempting the provided ones. +methods: ‘auto’, ‘fd’, ‘doane’, ‘scott’, ‘rice’, ‘sturges’, ‘sqrt’ +Note: ‘auto’ is used to choose optimally between ‘fd’ and ‘sturges’

          • +
          • num_quantiles - (Int) Number of quantiles to bin the data. +Default value is set to 1,000 quantiles.

          • +
          • is_enabled - (Boolean) Enables or disables histogram and quantiles

          • +
          +
        • +
        +
      • +
      • datetime - Options for the datetime columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the datetime operations

        • +
        +
      • +
      • order - Options for the order columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the order operations

        • +
        +
      • +
      • category - Options for the category columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the category operations

        • +
        • top_k_categories - (int) Number of categories to be displayed when reporting

        • +
        • max_sample_size_to_check_stop_condition - (int) The maximum sample size before categorical stop conditions are checked

        • +
        • stop_condition_unique_value_ratio - (float) The highest ratio of unique values to dataset size that is to be considered a categorical type

        • +
        • cms - (Boolean) Enables or Disables the use of count min sketch / heavy hitters for approximate frequency counts

        • +
        • cms_confidence - (float) The confidence level used to derive the number of hashes in CMS, default 0.95

        • +
        • cms_relative_error - (float) The relative error used to derive the number of buckets in CMS, default 0.01

        • +
        • cms_max_num_heavy_hitters - (int) The value used to define the threshold for minimum frequency required by a category to be counted

        • +
        +
      • +
      • data_labeler - Options for the data labeler columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the data labeler operations

        • +
        • data_labeler_dirpath - (String) Directory path to data labeler

        • +
        • data_labeler_object - (BaseDataLabeler) Datalabeler to replace +the default labeler

        • +
        • max_sample_size - (Int) The max number of samples for the data +labeler

        • +
        +
      • +
      • correlation - Option set for correlation profiling

        +
          +
        • is_enabled - (Boolean) Enables or disables performing correlation profiling

        • +
        • columns - Columns considered to calculate correlation

        • +
        +
      • +
      • row_statistics - (Boolean) Option to enable/disable row statistics calculations

        +
          +
        • unique_count - (UniqueCountOptions) Option to enable/disable unique row count calculations

          +
            +
          • is_enabled - (Bool) Enables or disables options for unique row count

          • +
          • hashing_method - (String) Property to specify row hashing method (“full” | “hll”)

          • +
          • hll - (HyperLogLogOptions) Options for an alternative method of estimating unique row count (activated when hll is the selected hashing_method); see the sketch after this breakdown

            +
              +
            • seed - (Int) Used to set HLL hashing function seed

            • +
            • register_count - (Int) Number of registers is equal to 2^register_count

            • +
            +
          • +
          +
        • +
        • null_count - (Boolean) Option to enable/disable functionalities for row_has_null_ratio and row_is_null_ratio

        • +
        +
      • +
      • chi2_homogeneity - Options for the chi-squared test matrix

        +
          +
        • is_enabled - (Boolean) Enables or disables performing chi-squared tests for homogeneity between the categorical columns of the dataset.

        • +
        +
      • +
      • null_replication_metrics - Options for calculating null replication metrics

        +
          +
        • is_enabled - (Boolean) Enables or disables calculation of null replication metrics

        • +
        +
      • +
      +
    • +
    • unstructured_options - Options responsible for all unstructured data

      +
        +
      • text - Options for the text profile

        +
          +
        • is_case_sensitive - (Boolean) Specify whether the profile is case sensitive

        • +
        • stop_words - (List of Strings) List of stop words to be removed when profiling

        • +
        • top_k_chars - (Int) Number of top characters to be retrieved when profiling

        • +
        • top_k_words - (Int) Number of top words to be retrieved when profiling

        • +
        • vocab - Options for vocab count

          +
            +
          • is_enabled - (Boolean) Enables or disables the vocab stats

          • +
          +
        • +
        • words - Options for word count

          +
            +
          • is_enabled - (Boolean) Enables or disables the word stats

          • +
          +
        • +
        +
      • +
      • data_labeler - Options for the data labeler

        +
          +
        • is_enabled - (Boolean) Enables or disables the data labeler operations

        • +
        • data_labeler_dirpath - (String) Directory path to data labeler

        • +
        • data_labeler_object - (BaseDataLabeler) Datalabeler to replace +the default labeler

        • +
        • max_sample_size - (Int) The max number of samples for the data +labeler

        • +
        +
      • +
      +
    • +
    +
  • +
+
+
+
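As an illustration of the nested options in the breakdown above, a few of them might be set directly as attributes, mirroring the earlier examples. The exact attribute paths below are inferred from the breakdown and should be treated as assumptions.

import dataprofiler as dp

profile_options = dp.ProfilerOptions()

# Row statistics: estimate unique row counts with HyperLogLog instead of full hashing
profile_options.structured_options.row_statistics.unique_count.hashing_method = "hll"
profile_options.structured_options.row_statistics.unique_count.hll.register_count = 15

# Categorical columns: use count min sketch for approximate frequency counts
profile_options.structured_options.category.cms = True
profile_options.structured_options.category.cms_confidence = 0.95
profile_options.structured_options.category.cms_relative_error = 0.01
profile_options.structured_options.category.cms_max_num_heavy_hitters = 100

# Unstructured text: case-insensitive profiling with custom stop words
profile_options.unstructured_options.text.is_case_sensitive = False
profile_options.unstructured_options.text.stop_words = ["a", "an", "the"]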

Statistical Dependency on Order of Updates

+

Some profile features/statistics are dependent on the order in which the profiler +is updated with new data.

+
+

Order Profile

+

The order profiler utilizes the last value in the previous data batch to ensure +the subsequent dataset is above/below/equal to that value when predicting +non-random order.

+

For instance, a dataset to be predicted as ascending would require the following batch data update to be ascending and its first value greater than or equal to the last value of the previous batch of data.

+

Ex. of ascending:

+
batch_1 = [0, 1, 2]
+batch_2 = [3, 4, 5]
+
+
+

Ex. of random:

+
batch_1 = [0, 1, 2]
+batch_2 = [1, 2, 3] # notice how the first value is less than the last value in the previous batch
+
+
+
+
+
+
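Below is a runnable sketch of the ascending case using small DataFrames; the column name is arbitrary and the order value is read from the report.

import pandas as pd

import dataprofiler as dp

batch_1 = pd.DataFrame({"col": [0, 1, 2]})
batch_2 = pd.DataFrame({"col": [3, 4, 5]})

profile = dp.Profiler(batch_1)
profile.update_profile(batch_2)  # first value of batch_2 >= last value of batch_1

report = profile.report(report_options={"output_format": "compact"})
print(report["data_stats"][0]["order"])  # expected: "ascending" for these batches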

Reporting Structure

+

For every profile, we can provide a report and customize it with a couple of optional parameters:

+
    +
  • output_format (string)

    +
      +
    • This allows the user to decide the output format of the report.

      +
        +
      • Options are one of [pretty, compact, serializable, flat]:

        +
          +
        • Pretty: floats are rounded to four decimal places, and lists are shortened.

        • +
        • Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.

        • +
        • Serializable: Output is json serializable and not prettified

        • +
        • Flat: Nested output is returned as a flattened dictionary

        • +
        +
      • +
      +
    • +
    +
  • +
  • num_quantile_groups (int)

    +
      +
    • You can decide the number of quantile groups, with a minimum of one and a maximum of 1,000.

    • +
    +
  • +
+
report  = profile.report(report_options={"output_format": "pretty"})
+report  = profile.report(report_options={"output_format": "compact"})
+report  = profile.report(report_options={"output_format": "serializable"})
+report  = profile.report(report_options={"output_format": "flat"})
+
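The two parameters can also be combined; this sketch assumes an existing profile and requests four quantile groups in the pretty output.

report = profile.report(report_options={"output_format": "pretty",
                                        "num_quantile_groups": 4})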
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/profiler_example.html b/docs/0.12.0/html/profiler_example.html new file mode 100644 index 000000000..676f55e5c --- /dev/null +++ b/docs/0.12.0/html/profiler_example.html @@ -0,0 +1,950 @@ + + + + + + + + + Structured Profilers - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Structured Profilers

+

Data profiling - is the process of examining a dataset and collecting statistical or informational summaries about said dataset.

+

The Profiler class inside the DataProfiler is designed to generate data profiles by ingesting either a Data class or a Pandas DataFrame.

+

Currently, the Data class supports loading the following file formats:

+
    +
  • Any delimited (CSV, TSV, etc.)

  • +
  • JSON object

  • +
  • Avro

  • +
  • Parquet

  • +
  • Text files

  • +
  • Pandas Series/Dataframe

  • +
+

Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value (JSON) store as well as dataset wide information, such as the number of nulls, duplicates, etc.

+

This example will look specifically at the structured data types for structured profiling.

+
+

Reporting

+

One of the primary purposes of the Profiler is to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.

+

In terms of reporting, there are multiple reporting options:

+
    +
  • Pretty: Floats are rounded to four decimal places, and lists are shortened.

  • +
  • Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.

  • +
  • Serializable: Output is json serializable and not prettified

  • +
  • Flat: Nested Output is returned as a flattened dictionary

  • +
+

The Pretty and Compact reports are the two most commonly used reports and include global_stats and data_stats for the given dataset. global_stats contains overall properties of the data such as number of rows/columns, null ratio, duplicate ratio. data_stats contains specific properties and statistics for each column, such as min, max, mean, variance, etc.

+

For structured profiles, the report looks like this:

+
"global_stats": {
+    "samples_used": int,
+    "column_count": int,
+    "row_count": int,
+    "row_has_null_ratio": float,
+    "row_is_null_ratio": float,
+    "unique_row_ratio": float,
+    "duplicate_row_count": int,
+    "file_type": string,
+    "encoding": string,
+},
+"data_stats": [
+    {
+        "column_name": string,
+        "data_type": string,
+        "data_label": string,
+        "categorical": bool,
+        "order": string,
+        "samples": list(str),
+        "statistics": {
+            "sample_size": int,
+            "null_count": int,
+            "null_types": list(string),
+            "null_types_index": {
+                string: list(int)
+            },
+            "data_type_representation": [string, list(string)],
+            "min": [null, float],
+            "max": [null, float],
+            "mean": float,
+            "variance": float,
+            "stddev": float,
+            "histogram": {
+                "bin_counts": list(int),
+                "bin_edges": list(float),
+            },
+            "quantiles": {
+                int: float
+            }
+            "vocab": list(char),
+            "avg_predictions": dict(float),
+            "data_label_representation": dict(float),
+            "categories": list(str),
+            "unique_count": int,
+            "unique_ratio": float,
+            "precision": {
+                'min': int,
+                'max': int,
+                'mean': float,
+                'var': float,
+                'std': float,
+                'sample_size': int,
+                'margin_of_error': float,
+                'confidence_level': float
+            },
+            "times": dict(float),
+            "format": string
+        }
+    }
+]
+
+
+

In the example, the compact format of the report is used to shorten the full list of the results.

+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+
+try:
+    sys.path.insert(0, '..')
+    import dataprofiler as dp
+except ImportError:
+    import dataprofiler as dp
+
+data_path = "../dataprofiler/tests/data"
+
+# remove extra tf logging
+import tensorflow as tf
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+
+
+
+
+
[ ]:
+
+
+
+data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
+profile = dp.Profiler(data)
+
+# Compact - A high level view, good for quick reviews
+report  = profile.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+

It should be noted that, in addition to reading input data from multiple file types, DataProfiler also accepts input data as a dataframe. To get more detailed results, such as entity-level predictions from the DataLabeler component or histogram results, the pretty format should be used.

+
+
[ ]:
+
+
+
+# run data profiler and get the report
+import pandas as pd
+my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=["col_int", "col_float"])
+profile = dp.Profiler(my_dataframe)
+
+report  = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Profiler Type

+

The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify the profile type as well. Here is an example of the profiler explicitly calling the structured profile.

+
+
[ ]:
+
+
+
+data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
+profile = dp.Profiler(data, profiler_type='structured')
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Profiler options

+

The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the ProfilerOptions class.

+

For example, if a user doesn’t require histogram information, they may want to turn off the histogram functionality. Similarly, if a user is looking for more accurate labeling, they can increase the number of samples used to label.

+

Below, let’s remove the histogram and increase the number of samples to the labeler component (1,000 samples).

+

A full list of options is available in the Profiler section of the DataProfiler documentation.

+
+
[ ]:
+
+
+
+data = dp.Data(os.path.join(data_path, "csv/diamonds.csv"))
+
+profile_options = dp.ProfilerOptions()
+
+# Setting multiple options via set
+profile_options.set({ "histogram.is_enabled": False, "int.is_enabled": False})
+
+# Set options via directly setting them
+profile_options.structured_options.data_labeler.max_sample_size = 1000
+
+profile = dp.Profiler(data, options=profile_options)
+report  = profile.report(report_options={"output_format":"compact"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Updating Profiles

+

Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. To update appropriately, the schema (columns / keys) must match.

+
+
[ ]:
+
+
+
+# Load and profile a CSV file
+data = dp.Data(os.path.join(data_path, "csv/sparse-first-and-last-column-header-and-author.txt"))
+profile = dp.Profiler(data)
+
+# Update the profile with new data:
+new_data = dp.Data(os.path.join(data_path, "csv/sparse-first-and-last-column-skip-header.txt"))
+# new_data = dp.Data(os.path.join(data_path, "iris-utf-16.csv")) # will error due to schema mismatch
+profile.update_profile(new_data)
+
+# Take a peek at the data
+print(data.data)
+print(new_data.data)
+
+# Report the compact version of the profile
+report  = profile.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Merging Profiles

+

Merging profiles is an alternative method for updating profiles. In particular, multiple profiles can be generated separately, then added together with a simple + command: profile3 = profile1 + profile2

+
+
[ ]:
+
+
+
+# Load a CSV file with a schema
+data1 = dp.Data(os.path.join(data_path, "csv/sparse-first-and-last-column-header-and-author.txt"))
+profile1 = dp.Profiler(data1)
+
+# Load another CSV file with the same schema
+data2 = dp.Data(os.path.join(data_path, "csv/sparse-first-and-last-column-skip-header.txt"))
+profile2 = dp.Profiler(data2)
+
+# Merge the profiles
+profile3 = profile1 + profile2
+
+# Report the compact version of the profile
+report  = profile3.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+

As you can see, the update_profile function and the + operator function similarly. The reason the + operator is important is that it’s possible to save and load profiles, which we cover next.

+
+
+

Differences in Data

+

Profile differences can be computed for both structured and unstructured datasets.

+

Such reports can provide details on the differences between training and validation data like in this pseudo example:

+
profiler_training = dp.Profiler(training_data)
+profiler_testing = dp.Profiler(testing_data)
+
+validation_report = profiler_training.diff(profiler_testing)
+
+
+
+
[ ]:
+
+
+
+from pprint import pprint
+
+# structured differences example
+data_split_differences = profile1.diff(profile2)
+pprint(data_split_differences)
+
+
+
+
+
+

Graphing a Profile

+

We’ve also added the ability to generate visual reports from a profile.

+

The following plots are currently available to work directly with your profilers:

+
    +
  • missing values matrix

  • +
  • histogram (numeric columns only)

  • +
+
+
[ ]:
+
+
+
+import matplotlib.pyplot as plt
+
+
+# get the data
+data_folder = "../dataprofiler/tests/data"
+data = dp.Data(os.path.join(data_folder, "csv/aws_honeypot_marx_geo.csv"))
+
+# profile the data
+profile = dp.Profiler(data)
+
+
+
+
+
[ ]:
+
+
+
+# generate a missing values matrix
+fig = plt.figure(figsize=(8, 6), dpi=100)
+fig = dp.graphs.plot_missing_values_matrix(profile, ax=fig.gca(), title="Missing Values Matrix")
+
+
+
+
+
[ ]:
+
+
+
+# generate histogram of all int/float columns
+fig = dp.graphs.plot_histograms(profile)
+fig.set_size_inches(8, 6)
+fig.set_dpi(100)
+
+
+
+
+
+

Saving and Loading a Profile

+

Not only can the Profiler create and update profiles, it’s also possible to save, load, and then manipulate profiles.

+
+
[ ]:
+
+
+
+# Load data
+data = dp.Data(os.path.join(data_path, "csv/names-col.txt"))
+
+# Generate a profile
+profile = dp.Profiler(data)
+
+# Save a profile to disk for later (saves as pickle file)
+profile.save(filepath="my_profile.pkl")
+
+# Load a profile from disk
+loaded_profile = dp.Profiler.load("my_profile.pkl")
+
+# Report the compact version of the profile
+report = profile.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+

With the ability to save and load profiles, profiles can be generated via multiple machines then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more.

+
+
[ ]:
+
+
+
+# Load multiple files via the Data class
+filenames = ["csv/sparse-first-and-last-column-header-and-author.txt",
+             "csv/sparse-first-and-last-column-skip-header.txt"]
+data_objects = []
+for filename in filenames:
+    data_objects.append(dp.Data(os.path.join(data_path, filename)))
+
+
+# Generate and save profiles
+for i in range(len(data_objects)):
+    profile = dp.Profiler(data_objects[i])
+    profile.save(filepath="data-"+str(i)+".pkl")
+
+
+# Load profiles and add them together
+profile = None
+for i in range(len(data_objects)):
+    if profile is None:
+        profile = dp.Profiler.load("data-"+str(i)+".pkl")
+    else:
+        profile += dp.Profiler.load("data-"+str(i)+".pkl")
+
+
+# Report the compact version of the profile
+report = profile.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
[ ]:
+
+
+
+
+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/profiler_example.ipynb b/docs/0.12.0/html/profiler_example.ipynb new file mode 100644 index 000000000..b6a4409c9 --- /dev/null +++ b/docs/0.12.0/html/profiler_example.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37ca393", + "metadata": {}, + "source": [ + "# Structured Profilers" + ] + }, + { + "cell_type": "markdown", + "id": "ff9bd095", + "metadata": {}, + "source": [ + "**Data profiling** - *is the process of examining a dataset and collecting statistical or informational summaries about said dataset.*\n", + "\n", + "The Profiler class inside the DataProfiler is designed to generate *data profiles* via the Profiler class, which ingests either a Data class or a Pandas DataFrame. \n", + "\n", + "Currently, the Data class supports loading the following file formats:\n", + "\n", + "* Any delimited (CSV, TSV, etc.)\n", + "* JSON object\n", + "* Avro\n", + "* Parquet\n", + "* Text files\n", + "* Pandas Series/Dataframe\n", + "\n", + "Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value (JSON) store as well as dataset wide information, such as the number of nulls, duplicates, etc.\n", + "\n", + "This example will look at specifically the structured data types for structured profiling. " + ] + }, + { + "cell_type": "markdown", + "id": "de58b9c4", + "metadata": {}, + "source": [ + "## Reporting" + ] + }, + { + "cell_type": "markdown", + "id": "8001185a", + "metadata": {}, + "source": [ + "One of the primary purposes of the Profiler are to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.\n", + "\n", + "In terms of reporting, there are multiple reporting options:\n", + "\n", + "* **Pretty**: Floats are rounded to four decimal places, and lists are shortened.\n", + "* **Compact**: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.\n", + "* **Serializable**: Output is json serializable and not prettified\n", + "* **Flat**: Nested Output is returned as a flattened dictionary\n", + "\n", + "The **Pretty** and **Compact** reports are the two most commonly used reports and includes `global_stats` and `data_stats` for the given dataset. `global_stats` contains overall properties of the data such as number of rows/columns, null ratio, duplicate ratio. 
`data_stats` contains specific properties and statistics for each column file such as min, max, mean, variance, etc.\n", + "\n", + "For structured profiles, the report looks like this:\n", + "\n", + "```\n", + "\"global_stats\": {\n", + " \"samples_used\": int,\n", + " \"column_count\": int,\n", + " \"row_count\": int,\n", + " \"row_has_null_ratio\": float,\n", + " \"row_is_null_ratio\": float, \n", + " \"unique_row_ratio\": float,\n", + " \"duplicate_row_count\": int,\n", + " \"file_type\": string,\n", + " \"encoding\": string,\n", + "},\n", + "\"data_stats\": [\n", + " {\n", + " \"column_name\": string,\n", + " \"data_type\": string,\n", + " \"data_label\": string,\n", + " \"categorical\": bool,\n", + " \"order\": string,\n", + " \"samples\": list(str),\n", + " \"statistics\": {\n", + " \"sample_size\": int,\n", + " \"null_count\": int,\n", + " \"null_types\": list(string),\n", + " \"null_types_index\": {\n", + " string: list(int)\n", + " },\n", + " \"data_type_representation\": [string, list(string)],\n", + " \"min\": [null, float],\n", + " \"max\": [null, float],\n", + " \"mean\": float,\n", + " \"variance\": float,\n", + " \"stddev\": float,\n", + " \"histogram\": { \n", + " \"bin_counts\": list(int),\n", + " \"bin_edges\": list(float),\n", + " },\n", + " \"quantiles\": {\n", + " int: float\n", + " }\n", + " \"vocab\": list(char),\n", + " \"avg_predictions\": dict(float), \n", + " \"data_label_representation\": dict(float),\n", + " \"categories\": list(str),\n", + " \"unique_count\": int,\n", + " \"unique_ratio\": float,\n", + " \"precision\": {\n", + " 'min': int,\n", + " 'max': int,\n", + " 'mean': float,\n", + " 'var': float,\n", + " 'std': float,\n", + " 'sample_size': int,\n", + " 'margin_of_error': float,\n", + " 'confidence_level': float\t\t\n", + " },\n", + " \"times\": dict(float),\n", + " \"format\": string\n", + " }\n", + " }\n", + "]\n", + "```\n", + "\n", + "In the example, the `compact` format of the report is used to shorten the full list of the results. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcb5447", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"\n", + "\n", + "# remove extra tf loggin\n", + "import tensorflow as tf\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7fc2df6", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Compact - A high level view, good for quick reviews\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec39d2", + "metadata": {}, + "source": [ + "It should be noted, in addition to reading the input data from multiple file types, DataProfiler allows the input data as a dataframe. To get more results related to detailed predictions at the entity level from the DataLabeler component or histogram results, the format `pretty` should be used. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29737f25", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "import pandas as pd\n", + "my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=[\"col_int\", \"col_float\"])\n", + "profile = dp.Profiler(my_dataframe)\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "241f6e3e", + "metadata": {}, + "source": [ + "## Profiler Type" + ] + }, + { + "cell_type": "markdown", + "id": "5b20879b", + "metadata": {}, + "source": [ + "The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify profile type as well. Here is an example of the the profiler explicitly calling the structured profile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc44eb47", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data, profiler_type='structured')\n", + "\n", + "# print the report using json to prettify.\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "fe02ad64", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "40804cc9", + "metadata": {}, + "source": [ + "The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the `ProfilerOptions` class.\n", + "\n", + "For example, if a user doesn't require histogram information they may desire to turn off the histogram functionality. Simialrly, if a user is looking for a more accurate labeling, they can increase the samples used to label.\n", + "\n", + "Below, let's remove the histogram and increase the number of samples to the labeler component (1,000 samples). \n", + "\n", + "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler/profile_options.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d25d899", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/diamonds.csv\"))\n", + "\n", + "profile_options = dp.ProfilerOptions()\n", + "\n", + "# Setting multiple options via set\n", + "profile_options.set({ \"histogram.is_enabled\": False, \"int.is_enabled\": False})\n", + "\n", + "# Set options via directly setting them\n", + "profile_options.structured_options.data_labeler.max_sample_size = 1000\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "2052415a", + "metadata": {}, + "source": [ + "## Updating Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "7e02f746", + "metadata": {}, + "source": [ + "Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. To update appropriately, the schema (columns / keys) must match appropriately." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab8022f", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Load and profile a CSV file\n", + "data = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-header-and-author.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Update the profile with new data:\n", + "new_data = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-skip-header.txt\"))\n", + "# new_data = dp.Data(os.path.join(data_path, \"iris-utf-16.csv\")) # will error due to schema mismatch\n", + "profile.update_profile(new_data)\n", + "\n", + "# Take a peek at the data\n", + "print(data.data)\n", + "print(new_data.data)\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "66ec6dc5", + "metadata": {}, + "source": [ + "## Merging Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "e2265fe9", + "metadata": {}, + "source": [ + "Merging profiles are an alternative method for updating profiles. Particularly, multiple profiles can be generated seperately, then added together with a simple `+` command: `profile3 = profile1 + profile2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc68ca07", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Load a CSV file with a schema\n", + "data1 = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-header-and-author.txt\"))\n", + "profile1 = dp.Profiler(data1)\n", + "\n", + "# Load another CSV file with the same schema\n", + "data2 = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-skip-header.txt\"))\n", + "profile2 = dp.Profiler(data2)\n", + "\n", + "# Merge the profiles\n", + "profile3 = profile1 + profile2\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile3.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "7ea07dc6", + "metadata": {}, + "source": [ + "As you can see, the `update_profile` function and the `+` operator function similarly. The reason the `+` operator is important is that it's possible to *save and load profiles*, which we cover next." + ] + }, + { + "cell_type": "markdown", + "id": "375ff25c-b189-436a-b07d-5e7f13cc6e03", + "metadata": {}, + "source": [ + "## Differences in Data\n", + "Can be applied to both structured and unstructured datasets. 
\n", + "\n", + "Such reports can provide details on the differences between training and validation data like in this pseudo example:\n", + "```python\n", + "profiler_training = dp.Profiler(training_data)\n", + "profiler_testing = dp.Profiler(testing_data)\n", + "\n", + "validation_report = profiler_training.diff(profiler_testing)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65360a03-e3ff-4f3c-9963-412298fdb284", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "# structured differences example\n", + "data_split_differences = profile1.diff(profile2)\n", + "pprint(data_split_differences)" + ] + }, + { + "cell_type": "markdown", + "id": "2ae471ff-852f-400a-9bee-5c9fef96f10a", + "metadata": {}, + "source": [ + "## Graphing a Profile\n", + "\n", + "We've also added the ability to generating visual reports from a profile.\n", + "\n", + "The following plots are currently available to work directly with your profilers:\n", + "\n", + " * missing values matrix\n", + " * histogram (numeric columns only)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "734b588d-ac9a-409c-8eb5-b1a0aede8c63", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "# get the data\n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "data = dp.Data(os.path.join(data_folder, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "\n", + "# profile the data\n", + "profile = dp.Profiler(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4e70204-fa30-43c2-9556-e84c19f82d32", + "metadata": {}, + "outputs": [], + "source": [ + "# generate a missing values matrix\n", + "fig = plt.figure(figsize=(8, 6), dpi=100)\n", + "fig = dp.graphs.plot_missing_values_matrix(profile, ax=fig.gca(), title=\"Missing Values Matrix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d734d355-e542-4245-a1e9-66521e333c2d", + "metadata": {}, + "outputs": [], + "source": [ + "# generate histogram of all int/float columns\n", + "fig = dp.graphs.plot_histograms(profile)\n", + "fig.set_size_inches(8, 6)\n", + "fig.set_dpi(100)" + ] + }, + { + "cell_type": "markdown", + "id": "30868000", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "f2858072", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load then manipulate profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad9ca57", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_path, \"csv/names-col.txt\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "8f9859c2", + "metadata": {}, + "source": [ + "With the ability to save and load profiles, profiles can be generated via multiple machines then merged. 
Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3571f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a multiple files via the Data class\n", + "filenames = [\"csv/sparse-first-and-last-column-header-and-author.txt\",\n", + " \"csv/sparse-first-and-last-column-skip-header.txt\"]\n", + "data_objects = []\n", + "for filename in filenames:\n", + " data_objects.append(dp.Data(os.path.join(data_path, filename)))\n", + "\n", + "\n", + "# Generate and save profiles\n", + "for i in range(len(data_objects)):\n", + " profile = dp.Profiler(data_objects[i])\n", + " profile.save(filepath=\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Load profiles and add them together\n", + "profile = None\n", + "for i in range(len(data_objects)):\n", + " if profile is None:\n", + " profile = dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + " else:\n", + " profile += dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4690068a-8fc3-4bd5-8649-63d0f34fa91d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/py-modindex.html b/docs/0.12.0/html/py-modindex.html new file mode 100644 index 000000000..3a8cc5920 --- /dev/null +++ b/docs/0.12.0/html/py-modindex.html @@ -0,0 +1,618 @@ + + + + + + + Python Module Index - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + +
+

Python Module Index

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 
d
+ dataprofiler +
    + dataprofiler.data_readers +
    + dataprofiler.data_readers.avro_data +
    + dataprofiler.data_readers.base_data +
    + dataprofiler.data_readers.csv_data +
    + dataprofiler.data_readers.data +
    + dataprofiler.data_readers.data_utils +
    + dataprofiler.data_readers.filepath_or_buffer +
    + dataprofiler.data_readers.graph_data +
    + dataprofiler.data_readers.json_data +
    + dataprofiler.data_readers.parquet_data +
    + dataprofiler.data_readers.structured_mixins +
    + dataprofiler.data_readers.text_data +
    + dataprofiler.dp_logging +
    + dataprofiler.labelers +
    + dataprofiler.labelers.base_data_labeler +
    + dataprofiler.labelers.base_model +
    + dataprofiler.labelers.char_load_tf_model +
    + dataprofiler.labelers.character_level_cnn_model +
    + dataprofiler.labelers.classification_report_utils +
    + dataprofiler.labelers.column_name_model +
    + dataprofiler.labelers.data_labelers +
    + dataprofiler.labelers.data_processing +
    + dataprofiler.labelers.labeler_utils +
    + dataprofiler.labelers.regex_model +
    + dataprofiler.labelers.utils +
    + dataprofiler.plugins +
    + dataprofiler.plugins.decorators +
    + dataprofiler.profilers +
    + dataprofiler.profilers.base_column_profilers +
    + dataprofiler.profilers.categorical_column_profile +
    + dataprofiler.profilers.column_profile_compilers +
    + dataprofiler.profilers.data_labeler_column_profile +
    + dataprofiler.profilers.datetime_column_profile +
    + dataprofiler.profilers.float_column_profile +
    + dataprofiler.profilers.graph_profiler +
    + dataprofiler.profilers.helpers +
    + dataprofiler.profilers.helpers.report_helpers +
    + dataprofiler.profilers.histogram_utils +
    + dataprofiler.profilers.int_column_profile +
    + dataprofiler.profilers.json_decoder +
    + dataprofiler.profilers.json_encoder +
    + dataprofiler.profilers.numerical_column_stats +
    + dataprofiler.profilers.order_column_profile +
    + dataprofiler.profilers.profile_builder +
    + dataprofiler.profilers.profiler_options +
    + dataprofiler.profilers.profiler_utils +
    + dataprofiler.profilers.text_column_profile +
    + dataprofiler.profilers.unstructured_labeler_profile +
    + dataprofiler.profilers.unstructured_text_profile +
    + dataprofiler.reports +
    + dataprofiler.reports.graphs +
    + dataprofiler.reports.utils +
    + dataprofiler.rng_utils +
    + dataprofiler.settings +
    + dataprofiler.validators +
    + dataprofiler.validators.base_validators +
    + dataprofiler.version +
+ +
+
+ + + + + +
+
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/regex_labeler_from_scratch.html b/docs/0.12.0/html/regex_labeler_from_scratch.html new file mode 100644 index 000000000..1d5768b44 --- /dev/null +++ b/docs/0.12.0/html/regex_labeler_from_scratch.html @@ -0,0 +1,859 @@ + + + + + + + + + Building a Regex Data Labeler w/ your own Regex - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Building a Regex Data Labeler w/ your own Regex

+

This notebook teaches how to use the existing / create your own regex labeler as well as utilize it for structured data profiling.

+
    +
  1. Loading and utilizing the pre-existing regex data labeler

  2. +
  3. Replacing the existing regex rules with your own.

  4. +
  5. Utilizing a regex data labeler inside of the structured profiler

  6. +
+

First, let’s import the libraries needed for this example.

+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+from pprint import pprint
+
+import pandas as pd
+
+try:
+    import dataprofiler as dp
+except ImportError:
+    sys.path.insert(0, '../..')
+    import dataprofiler as dp
+
+
+
+
+

Loading and using the pre-existing regex data labeler

+

We can easily import the existing regex labeler via the load_from_library command from the dp.DataLabeler. This allows us to import models other than the default structured / unstructured labelers which exist in the library.

+
+
[ ]:
+
+
+
+data_labeler = dp.DataLabeler.load_from_library('regex_model')
+data_labeler.model.help()
+
+
+
+
+
[ ]:
+
+
+
+pprint(data_labeler.label_mapping)
+
+
+
+
+
[ ]:
+
+
+
+pprint(data_labeler.model._parameters['regex_patterns'])
+
+
+
+
+

Predicting with the regex labeler

+

In the prediction below, the default settings will split the predictions by default as its aggregation function. In other words, given the string ‘123 Fake St.’, the first character would receive a vote for integer and a vote for address, giving each a 50% probability. This is because these regex functions are defined individually and a post-prediction aggregation function must be used to combine the results.

+
+
[ ]:
+
+
+
+# evaluate a prediction using the default parameters
+data_labeler.predict(['123 Fake St.'])
+
+
+
+
+
+
+

Replacing the regex rules in the existing labeler

+

We can achieve this by: 1. Setting the label mapping to the new labels 2. Setting the model parameters which include: regex_patterns, default_label, ignore_case, and encapsulators

+

where regex_patterns is a dict of lists or regex for each label, default_label is the expected default label for the regex, ignore_case tells the model to ignore case during its detection, and encapsulators are generic regex statements placed before (start) and after (end) each regex. Currently, this is used by the default model to capture labels that are within a cell rather than matching the entire cell. (e.g. ’ 123 ’ will still capture 123 as digits).

+

Below, we created 4 labels where other is the default_label. Additionally, we enabled case sensitivity such that upper and lower case letters would be detected separately.

+
+
[ ]:
+
+
+
+data_labeler.set_labels({'other': 0, 'digits':1, 'lowercase_char': 2, 'uppercase_chars': 3})
+data_labeler.model.set_params(
+    regex_patterns={
+        'digits': [r'[+-]?[0-9]+'],
+        'lowercase_char': [r'[a-z]+'],
+        'uppercase_chars': [r'[A-Z]+'],
+    },
+    default_label='other',
+    ignore_case=False,
+)
+data_labeler.label_mapping
+
+
+
+
+
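As an aside, the code above leaves the encapsulators parameter at the library’s default. Purely as an illustration, assuming set_params accepts it like the other model parameters listed earlier and that it takes a dict with start and end entries as described above, an override might look like the following (the specific patterns are placeholders, not the library’s actual defaults):
+# Hedged sketch only: the 'start'/'end' patterns are illustrative placeholders,
+# not the library's actual default encapsulators.
+data_labeler.model.set_params(
+    encapsulators={
+        'start': r'(?<![\w.])',  # assumed: no word character (or '.') immediately before a match
+        'end': r'(?![\w.])',     # assumed: no word character (or '.') immediately after a match
+    },
+)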

Predicting with the new regex labels

+

Here we notice the output of the predictions gives us a prediction per character for each regex. Note how by default it is matching subtext due to the encapsulators: 123 was found to be digits, FAKE was found to be upper case, and the whitespaces and St. were other due to no single regex being correct.

+
+
[ ]:
+
+
+
+data_labeler.predict(['123 FAKE St.'])
+
+
+
+

Below we turn off case sensitivity and see how the aggregation function splits the votes for characters between the lowercase and uppercase chars.

+
+
[ ]:
+
+
+
+data_labeler.model.set_params(ignore_case=True)
+data_labeler.predict(['123 FAKE St.'])
+
+
+
+

For the rest of this notebook, we will just use a single regex search which will capture both upper and lower case chars.

+
+
[ ]:
+
+
+
+data_labeler.set_labels({'other': 0, 'digits':1, 'chars': 2})
+data_labeler.model.set_params(
+    regex_patterns={
+        'digits': [r'[+-]?[0-9]+'],
+        'chars': [r'[a-zA-Z]+'],
+    },
+    default_label='other',
+    ignore_case=False,
+)
+data_labeler.label_mapping
+
+
+
+
+
[ ]:
+
+
+
+data_labeler.predict(['123 FAKE St.'])
+
+
+
+
+
+

Adjusting postprocessor properties

+

Below we can look at the possible postprocessor parameters to adjust the aggregation function to the desired output. The previous outputs by default used the split aggregation function, however, below we will show the random aggregation function which will randomly choose a label if multiple labels have a vote for a given character.

+

data_labeler.postprocessor.help()

+
+
[ ]:
+
+
+
+data_labeler.postprocessor.set_params(aggregation_func='random')
+data_labeler.predict(['123 FAKE St.'], predict_options=dict(show_confidences=True))
+
+
+
+
+
+
+

Integrating the new Regex labeler into Structured Profiling

+

While the labeler can be used alone, it is also possible to integrate the labeler into the StructuredProfiler with a slight change to its postprocessor. The StructuredProfiler requires a labeler which outputs the confidence of each label for a given cell being processed. To convert the output of the RegexPostProcessor into said format, we will use the StructRegexPostProcessor. We can create the postprocessor and set the data_labeler’s postprocessor to this value.

+
+
[ ]:
+
+
+
+from dataprofiler.labelers.data_processing import StructRegexPostProcessor
+
+postprocesor = StructRegexPostProcessor()
+data_labeler.set_postprocessor(postprocesor)
+
+
+
+

Below we will see the output is now one vote per sample.

+
+
[ ]:
+
+
+
+data_labeler.predict(['123 FAKE St.', '123', 'FAKE'], predict_options=dict(show_confidences=True))
+
+
+
+
+

Setting the StructuredProfiler’s DataLabeler

+

We can create a ProfilerOptions object and set the structured options to have the new data_labeler as its value. We then run the StructuredProfiler with the specified options.

+
+
[ ]:
+
+
+
+# create and set the option for the regex data labeler to be used at profile time
+profile_options = dp.ProfilerOptions()
+profile_options.set({'structured_options.data_labeler.data_labeler_object': data_labeler})
+
+# profile the dataset using the suggested regex data labeler
+data = pd.DataFrame(
+    [['123 FAKE St.', 123, 'this'],
+     [123           ,  -9, 'IS'],
+     ['...'         , +80, 'A'],
+     ['123'         , 202, 'raNDom'],
+     ['test'        ,  -1, 'TEST']],
+    dtype=object)
+profiler = dp.Profiler(data, options=profile_options)
+
+
+
+

Below we see the first column is given 3 labels as it received multiple votes for said column. However, it was confident on the second and third columns, which is why it only specified digits and chars respectively.

+
+
[ ]:
+
+
+
+pprint(profiler.report(
+    dict(output_format='compact',
+         omit_keys=['data_stats.*.statistics',
+                    'data_stats.*.categorical',
+                    'data_stats.*.order',
+                    'global_stats'])))
+
+
+
+
+
+
+

Saving the Data Labeler for future use

+
+
[ ]:
+
+
+
+if not os.path.isdir('my_new_regex_labeler'):
+    os.mkdir('my_new_regex_labeler')
+data_labeler.save_to_disk('my_new_regex_labeler')
+
+
+
+
+
+

Loading the saved Data Labeler

+
+
[ ]:
+
+
+
+saved_labeler = dp.DataLabeler.load_from_disk('my_new_regex_labeler')
+
+
+
+
+
[ ]:
+
+
+
+# ensuring the parameters are what we saved.
+print("label_mapping:")
+pprint(saved_labeler.label_mapping)
+print("\nmodel parameters:")
+pprint(saved_labeler.model._parameters)
+print()
+print("postprocessor: " + saved_labeler.postprocessor.__class__.__name__)
+
+
+
+
+
[ ]:
+
+
+
+# predicting with the loaded labeler.
+saved_labeler.predict(['test', '123'])
+
+
+
+
+
[ ]:
+
+
+
+
+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/regex_labeler_from_scratch.ipynb b/docs/0.12.0/html/regex_labeler_from_scratch.ipynb new file mode 100644 index 000000000..96aee213a --- /dev/null +++ b/docs/0.12.0/html/regex_labeler_from_scratch.ipynb @@ -0,0 +1,444 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e04c382a-7c49-452b-b9bf-e448951c64fe", + "metadata": {}, + "source": [ + "# Building a Regex Data Labeler w/ your own Regex" + ] + }, + { + "cell_type": "markdown", + "id": "6fb3ecb9-bc51-4c18-93d5-7991bbee5165", + "metadata": {}, + "source": [ + "This notebook teaches how to use the existing / create your own regex labeler as well as utilize it for structured data profiling.\n", + "\n", + "1. Loading and utilizing the pre-existing regex data labeler\n", + "1. Replacing the existing regex rules with your own.\n", + "1. Utilizng a regex data labeler inside of the structured profiler\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a67c197b-d3ee-4896-a96f-cc3d043601d3", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "from pprint import pprint\n", + "\n", + "import pandas as pd\n", + "\n", + "try:\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " sys.path.insert(0, '../..')\n", + " import dataprofiler as dp" + ] + }, + { + "cell_type": "markdown", + "id": "c71356f4-9020-4862-a1e1-816effbb5443", + "metadata": {}, + "source": [ + "## Loading and using the pre-existing regex data labeler\n", + "We can easily import the exsting regex labeler via the `load_from_library` command from the `dp.DataLabeler`. This allows us to import models other than the default structured / unstructured labelers which exist in the library." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "113d6655-4bca-4d8e-9e6f-b972e29d5684", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler = dp.DataLabeler.load_from_library('regex_model')\n", + "data_labeler.model.help()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b405887-2b92-44ca-b8d7-29c384f6dd9c", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(data_labeler.label_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11916a48-098c-4056-ac6c-b9542d85fa86", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(data_labeler.model._parameters['regex_patterns'])" + ] + }, + { + "cell_type": "markdown", + "id": "da0e97ee-8d6d-4631-9b55-78ed904d5f41", + "metadata": {}, + "source": [ + "### Predicting with the regex labeler\n", + "In the prediction below, the default settings will `split` the predictions by default as it's aggregation function. In other words, if a string '123 Fake St.' The first character would receive a vote for integer and for address giving both a 50% probability. This is because these regex functions are defined individually and a post prediction aggregation function must be used to get the results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe519e65-36a7-4f42-8314-5369de8635c7", + "metadata": {}, + "outputs": [], + "source": [ + "# evaluate a prediction using the default parameters\n", + "data_labeler.predict(['123 Fake St.'])" + ] + }, + { + "cell_type": "markdown", + "id": "b41d834d-e47b-45a6-8970-d2d2033e2ade", + "metadata": {}, + "source": [ + "## Replacing the regex rules in the existing labeler\n", + "\n", + "We can achieve this by:\n", + "1. Setting the label mapping to the new labels\n", + "2. Setting the model parameters which include: `regex_patterns`, `default_label`, `ignore_case`, and `encapsulators`\n", + "\n", + "where `regex_patterns` is a `dict` of lists or regex for each label, `default_label` is the expected default label for the regex, `ignore_case` tells the model to ignore case during its detection, and `encapsulators` are generic regex statements placed before (start) and after (end) each regex. Currently, this is used by the default model to capture labels that are within a cell rather than matching the entire cell. (e.g. ' 123 ' will still capture 123 as digits)." + ] + }, + { + "cell_type": "markdown", + "id": "c6bb010a-406f-4fd8-abd0-3355a5ad0ded", + "metadata": {}, + "source": [ + "Below, we created 4 labels where `other` is the `default_label`. Additionally, we set enabled case sensitivity such that upper and lower case letters would be detected separately." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f86584cf-a7af-4bae-bf44-d87caa68833a", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.set_labels({'other': 0, 'digits':1, 'lowercase_char': 2, 'uppercase_chars': 3})\n", + "data_labeler.model.set_params(\n", + " regex_patterns={\n", + " 'digits': [r'[+-]?[0-9]+'],\n", + " 'lowercase_char': [r'[a-z]+'],\n", + " 'uppercase_chars': [r'[A-Z]+'],\n", + " },\n", + " default_label='other',\n", + " ignore_case=False,\n", + ")\n", + "data_labeler.label_mapping" + ] + }, + { + "cell_type": "markdown", + "id": "1ece1c8c-18a5-46fc-b563-6458e6e71e53", + "metadata": {}, + "source": [ + "### Predicting with the new regex labels\n", + "\n", + "Here we notice the otuput of the predictions gives us a prediction per character for each regex. Note how by default it is matching subtext due to the encapsulators. Where `123` were found to be digits, `FAKE` was foudn to be upper case, and the whitespaces and `St.` were other due no single regex being correct." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92842e14-2ea6-4879-b58c-c52b607dc94c", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.predict(['123 FAKE St.'])" + ] + }, + { + "cell_type": "markdown", + "id": "2ce14e54-094f-41ff-9ce0-69acace6abc2", + "metadata": {}, + "source": [ + "Below we turn off case-sensitivity and we see how the aggregation funciton splits the votes for characters between the `lowercase` and `uppercase` chars." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7b8ed9d-c912-4dc7-82c5-ba78a3affc1e", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.model.set_params(ignore_case=True)\n", + "data_labeler.predict(['123 FAKE St.'])" + ] + }, + { + "cell_type": "markdown", + "id": "dc66515f-24e4-40f0-8592-b1ee4fba7077", + "metadata": {}, + "source": [ + "For the rest of this notebook, we will just use a single regex serach which will capture both upper and lower case chars." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e0c1b11-d111-4080-873f-40aff7cf7930", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.set_labels({'other': 0, 'digits':1, 'chars': 2})\n", + "data_labeler.model.set_params(\n", + " regex_patterns={\n", + " 'digits': [r'[=-]?[0-9]+'],\n", + " 'chars': [r'[a-zA-Z]+'],\n", + " },\n", + " default_label='other',\n", + " ignore_case=False,\n", + ")\n", + "data_labeler.label_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28e7b2ee-c661-4b31-b727-078f1393b5c4", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.predict(['123 FAKE St.'])" + ] + }, + { + "cell_type": "markdown", + "id": "f60c8fd1-76e1-469f-9e5a-62d7529301b3", + "metadata": {}, + "source": [ + "### Adjusting postprocessor properties\n", + "\n", + "Below we can look at the possible postprocessor parameters to adjust the aggregation function to the desired output. The previous outputs by default used the `split` aggregation function, however, below we will show the `random` aggregation function which will randomly choose a label if multiple labels have a vote for a given character." + ] + }, + { + "cell_type": "markdown", + "id": "36afa82b-1ca5-49ad-9aa9-84c6de621f59", + "metadata": {}, + "source": [ + "data_labeler.postprocessor.help()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66840940-47bf-433a-8ee8-977f26926e0b", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.postprocessor.set_params(aggregation_func='random')\n", + "data_labeler.predict(['123 FAKE St.'], predict_options=dict(show_confidences=True))" + ] + }, + { + "cell_type": "markdown", + "id": "c32b74fc-5051-4d53-b02a-4d1e4a35958f", + "metadata": {}, + "source": [ + "## Integrating the new Regex labeler into Structured Profiling\n", + "\n", + "While the labeler can be used alone, it is also possible to integrate the labeler into the StructuredProfiler with a slight change to its postprocessor. The StructuredProfiler requires a labeler which outputs othe confidence of each label for a given cell being processed. To convert the output of the `RegexPostProcessor` into said format, we will use the `StructRegexPostProcessor`. We can create the postprocessor and set the `data_labeler`'s postprocessor to this value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2663f2d-29a2-41ed-88dd-8a213d303365", + "metadata": {}, + "outputs": [], + "source": [ + "from dataprofiler.labelers.data_processing import StructRegexPostProcessor\n", + "\n", + "postprocesor = StructRegexPostProcessor()\n", + "data_labeler.set_postprocessor(postprocesor)" + ] + }, + { + "cell_type": "markdown", + "id": "f7352769-d636-42c6-9706-7d9cff520a72", + "metadata": {}, + "source": [ + "Below we will see the output is now one vote per sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18814634-0fd0-4ce8-b0c3-9b9454701a43", + "metadata": {}, + "outputs": [], + "source": [ + "data_labeler.predict(['123 FAKE St.', '123', 'FAKE'], predict_options=dict(show_confidences=True))" + ] + }, + { + "cell_type": "markdown", + "id": "b4aa4e36-7362-4966-b827-3f5a6f2dfa7c", + "metadata": {}, + "source": [ + "### Setting the Structuredprofiler's DataLabeler\n", + "\n", + "We can create a `ProfilerOption` and set the structured options to have the new data_labeler as its value. We then run the StructuredProfiler with the specified options." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f18cf7f-283e-4e54-b3f9-1312828c3029", + "metadata": {}, + "outputs": [], + "source": [ + "# create and set the option for the regex data labeler to be used at profile time\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': data_labeler})\n", + "\n", + "# profile the dataset using the suggested regex data labeler\n", + "data = pd.DataFrame(\n", + " [['123 FAKE St.', 123, 'this'], \n", + " [123 , -9, 'IS'], \n", + " ['...' , +80, 'A'], \n", + " ['123' , 202, 'raNDom'], \n", + " ['test' , -1, 'TEST']], \n", + " dtype=object)\n", + "profiler = dp.Profiler(data, options=profile_options)" + ] + }, + { + "cell_type": "markdown", + "id": "663e49f7-358b-4b0f-99a4-1823908ef990", + "metadata": {}, + "source": [ + "Below we see the first column is given 3 labels as it received multiple votes for said column. However, it was confident on the second and third column which is why it only specified `digits` and `chars` respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f796d7f5-7e8a-447b-9cbb-d5b8180660a3", + "metadata": {}, + "outputs": [], + "source": [ + "pprint(profiler.report(\n", + " dict(output_format='compact', \n", + " omit_keys=['data_stats.*.statistics', \n", + " 'data_stats.*.categorical', \n", + " 'data_stats.*.order', \n", + " 'global_stats'])))" + ] + }, + { + "cell_type": "markdown", + "id": "261b903f-8f4c-403f-839b-ab8813f850e9", + "metadata": {}, + "source": [ + "## Saving the Data Labeler for future use" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6ffbaf2-9400-486a-ba83-5fc9ba9334d7", + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.isdir('my_new_regex_labeler'):\n", + " os.mkdir('my_new_regex_labeler')\n", + "data_labeler.save_to_disk('my_new_regex_labeler')" + ] + }, + { + "cell_type": "markdown", + "id": "09e40cb6-9d89-41c4-ae28-3dca498f8c68", + "metadata": {}, + "source": [ + "## Loading the saved Data Labeler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52615b25-70a6-4ebb-8a32-14aaf1e747d9", + "metadata": {}, + "outputs": [], + "source": [ + "saved_labeler = dp.DataLabeler.load_from_disk('my_new_regex_labeler')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1ccc0b3-1dc2-4847-95c2-d6b8769b1590", + "metadata": {}, + "outputs": [], + "source": [ + "# ensuring the parametesr are what we saved.\n", + "print(\"label_mapping:\")\n", + "pprint(saved_labeler.label_mapping)\n", + "print(\"\\nmodel parameters:\")\n", + "pprint(saved_labeler.model._parameters)\n", + "print()\n", + "print(\"postprocessor: \" + saved_labeler.postprocessor.__class__.__name__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c827f2ae-4af6-4f3f-9651-9ee9ebea9fa0", + "metadata": {}, + "outputs": [], + "source": [ + "# predicting with the loaded labeler.\n", + "saved_labeler.predict(['test', '123'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "606f9bbf-5955-4b7b-b0d1-390de5600f73", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.12.0/html/roadmap.html b/docs/0.12.0/html/roadmap.html new file mode 100644 index 000000000..dba55df40 --- /dev/null +++ b/docs/0.12.0/html/roadmap.html @@ -0,0 +1,389 @@ + + + + + + + + + Roadmap - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Roadmap

+

For more detailed tasks, check out the repo’s GitHub issues page here: +GitHub Issues.

+
+

Data Reader Updates

+
    +
  • +
    Read data from S3 bucket
      +
    • Within the current dp.Data() API paradigm, we want to enable passing an S3 bucket file path to read in data from AWS S3 (a hypothetical usage is sketched after this list).

    • +
    +
    +
    +
  • +
  • Pass list of data file paths to data reader

  • +
  • Pass in list of data frames to data reader

  • +
+
+
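As referenced above, a hypothetical usage sketch for the S3 item; this API does not exist yet, and the URI form shown (as well as any credential handling) is an assumption about how the feature could look:

import dataprofiler as dp

# hypothetical future call: hand an S3 URI straight to the existing Data reader
data = dp.Data("s3://my-bucket/path/to/file.csv")
profile = dp.Profiler(data)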
+

New Model

+
    +
  • Transformer model from sensitive data detection

  • +
+
+
+

Historical Profiles

+
    +
  • +
    Some questions about Historical Profiles / need to step back and rethink the design to start:
      +
    • Meta profile on top?

    • +
    • Stored windowed info inside? Etc…

    • +
    +
    +
    +
  • +
  • Branch with current state of Historical Profiles

  • +
  • +
    Two example notebooks of current state:
      +
    • Notebook example one.

    • +
    • Notebook example two.

    • +
    +
    +
    +
  • +
+
+
+

Conditional Report Metric

+
    +
  • Based on what is populated for other metrics in the report, have “secondary” / “derivative” values of that number (or that number in conjunction with another number) populate in the report as well.

  • +
  • For example, if null_count is not None, then populate a null_percent key with the quotient (null_count / sample_count), as sketched below.

  • +
+
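A minimal sketch of the derived-metric idea above, assuming a per-column statistics dict that already carries null_count and sample_count (the key names are illustrative, not the exact report schema):

def add_null_percent(column_stats: dict) -> dict:
    # populate a derived null_percent only when both of its inputs are present
    null_count = column_stats.get("null_count")
    sample_count = column_stats.get("sample_count")
    if null_count is not None and sample_count:
        column_stats["null_percent"] = null_count / sample_count
    return column_stats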
+
+

Space / Time Testing

+
    +
  • +
    Automatic comparison testing for space and time analysis on PRs
      +
    • Standardize a report for space time analysis for future comparisons (create baseline numbers)

    • +
    • Include those in integration tests that will automatically run on code when it is changed in PRs

    • +
    +
    +
    +
  • +
  • Could be an optional test, if the user thinks there is a concern that the change could degrade the library’s performance

  • +
+
+
+

Testing Suite Upgrades

+
    +
  • Add mocking to unit tests where mocking is not utilized

  • +
  • Integration testing separated out from the unit testing suite. Determine how to only run remotely during PRs

  • +
  • Backward compatibility testing along with informative warnings and errors when a user is utilizing incompatible versions of the library and saved profile object

  • +
+
+
+

Historical Versions

+
    +
  • Legacy version upgrades to enable patches to prior versions of the Data Profiler

  • +
+
+
+

Miscellaneous

+
    +
  • Refactor Pandas to Polars DataFrames

  • +
  • Spearman correlation calculation

  • +
  • Workflow Profiles

  • +
+
+
+ +
+ +
+ +
+
+ + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/search.html b/docs/0.12.0/html/search.html new file mode 100644 index 000000000..69fad1f4d --- /dev/null +++ b/docs/0.12.0/html/search.html @@ -0,0 +1,268 @@ + + + + + + + Search - <div class='hidden'>Data Profiler</div> <div class='version'> v0.12.0</div> + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + +
+

Error

+

+ Please activate JavaScript to enable the search functionality. +

+
+ + +
+ +
+
+ + + + + +
+
+ +
+
+ + + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/searchindex.js b/docs/0.12.0/html/searchindex.js new file mode 100644 index 000000000..db9ffffc3 --- /dev/null +++ b/docs/0.12.0/html/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({"docnames": ["API", "add_new_model_to_data_labeler", "column_name_labeler_example", "data_labeling", "data_reader", "data_readers", "dataprofiler", "dataprofiler.data_readers", "dataprofiler.data_readers.avro_data", "dataprofiler.data_readers.base_data", "dataprofiler.data_readers.csv_data", "dataprofiler.data_readers.data", "dataprofiler.data_readers.data_utils", "dataprofiler.data_readers.filepath_or_buffer", "dataprofiler.data_readers.graph_data", "dataprofiler.data_readers.json_data", "dataprofiler.data_readers.parquet_data", "dataprofiler.data_readers.structured_mixins", "dataprofiler.data_readers.text_data", "dataprofiler.dp_logging", "dataprofiler.labelers", "dataprofiler.labelers.base_data_labeler", "dataprofiler.labelers.base_model", "dataprofiler.labelers.char_load_tf_model", "dataprofiler.labelers.character_level_cnn_model", "dataprofiler.labelers.classification_report_utils", "dataprofiler.labelers.column_name_model", "dataprofiler.labelers.data_labelers", "dataprofiler.labelers.data_processing", "dataprofiler.labelers.labeler_utils", "dataprofiler.labelers.regex_model", "dataprofiler.labelers.utils", "dataprofiler.plugins", "dataprofiler.plugins.decorators", "dataprofiler.profilers", "dataprofiler.profilers.base_column_profilers", "dataprofiler.profilers.categorical_column_profile", "dataprofiler.profilers.column_profile_compilers", "dataprofiler.profilers.data_labeler_column_profile", "dataprofiler.profilers.datetime_column_profile", "dataprofiler.profilers.float_column_profile", "dataprofiler.profilers.graph_profiler", "dataprofiler.profilers.helpers", "dataprofiler.profilers.helpers.report_helpers", "dataprofiler.profilers.histogram_utils", "dataprofiler.profilers.int_column_profile", "dataprofiler.profilers.json_decoder", "dataprofiler.profilers.json_encoder", "dataprofiler.profilers.numerical_column_stats", "dataprofiler.profilers.order_column_profile", "dataprofiler.profilers.profile_builder", "dataprofiler.profilers.profiler_options", "dataprofiler.profilers.profiler_utils", "dataprofiler.profilers.text_column_profile", "dataprofiler.profilers.unstructured_labeler_profile", "dataprofiler.profilers.unstructured_text_profile", "dataprofiler.profilers.utils", "dataprofiler.reports", "dataprofiler.reports.graphs", "dataprofiler.reports.utils", "dataprofiler.rng_utils", "dataprofiler.settings", "dataprofiler.validators", "dataprofiler.validators.base_validators", "dataprofiler.version", "examples", "graph_data_demo", "graphs", "index", "install", "labeler", "merge_profile_list", "modules", "overview", "popmon_dp_loader_example", "profiler", "profiler_example", "regex_labeler_from_scratch", "roadmap", "unstructured_profiler_example"], "filenames": ["API.rst", "add_new_model_to_data_labeler.nblink", "column_name_labeler_example.nblink", "data_labeling.rst", "data_reader.nblink", "data_readers.rst", "dataprofiler.rst", "dataprofiler.data_readers.rst", "dataprofiler.data_readers.avro_data.rst", "dataprofiler.data_readers.base_data.rst", "dataprofiler.data_readers.csv_data.rst", "dataprofiler.data_readers.data.rst", "dataprofiler.data_readers.data_utils.rst", "dataprofiler.data_readers.filepath_or_buffer.rst", "dataprofiler.data_readers.graph_data.rst", "dataprofiler.data_readers.json_data.rst", 
"dataprofiler.data_readers.parquet_data.rst", "dataprofiler.data_readers.structured_mixins.rst", "dataprofiler.data_readers.text_data.rst", "dataprofiler.dp_logging.rst", "dataprofiler.labelers.rst", "dataprofiler.labelers.base_data_labeler.rst", "dataprofiler.labelers.base_model.rst", "dataprofiler.labelers.char_load_tf_model.rst", "dataprofiler.labelers.character_level_cnn_model.rst", "dataprofiler.labelers.classification_report_utils.rst", "dataprofiler.labelers.column_name_model.rst", "dataprofiler.labelers.data_labelers.rst", "dataprofiler.labelers.data_processing.rst", "dataprofiler.labelers.labeler_utils.rst", "dataprofiler.labelers.regex_model.rst", "dataprofiler.labelers.utils.rst", "dataprofiler.plugins.rst", "dataprofiler.plugins.decorators.rst", "dataprofiler.profilers.rst", "dataprofiler.profilers.base_column_profilers.rst", "dataprofiler.profilers.categorical_column_profile.rst", "dataprofiler.profilers.column_profile_compilers.rst", "dataprofiler.profilers.data_labeler_column_profile.rst", "dataprofiler.profilers.datetime_column_profile.rst", "dataprofiler.profilers.float_column_profile.rst", "dataprofiler.profilers.graph_profiler.rst", "dataprofiler.profilers.helpers.rst", "dataprofiler.profilers.helpers.report_helpers.rst", "dataprofiler.profilers.histogram_utils.rst", "dataprofiler.profilers.int_column_profile.rst", "dataprofiler.profilers.json_decoder.rst", "dataprofiler.profilers.json_encoder.rst", "dataprofiler.profilers.numerical_column_stats.rst", "dataprofiler.profilers.order_column_profile.rst", "dataprofiler.profilers.profile_builder.rst", "dataprofiler.profilers.profiler_options.rst", "dataprofiler.profilers.profiler_utils.rst", "dataprofiler.profilers.text_column_profile.rst", "dataprofiler.profilers.unstructured_labeler_profile.rst", "dataprofiler.profilers.unstructured_text_profile.rst", "dataprofiler.profilers.utils.rst", "dataprofiler.reports.rst", "dataprofiler.reports.graphs.rst", "dataprofiler.reports.utils.rst", "dataprofiler.rng_utils.rst", "dataprofiler.settings.rst", "dataprofiler.validators.rst", "dataprofiler.validators.base_validators.rst", "dataprofiler.version.rst", "examples.rst", "graph_data_demo.nblink", "graphs.rst", "index.rst", "install.rst", "labeler.nblink", "merge_profile_list.nblink", "modules.rst", "overview.nblink", "popmon_dp_loader_example.nblink", "profiler.rst", "profiler_example.nblink", "regex_labeler_from_scratch.nblink", "roadmap.rst", "unstructured_profiler_example.nblink"], "titles": ["API", "Adding new model to the existing DataLabeler pipeline", "ColumnName Labeler Tutorial", "Labeler (Sensitive Data)", "Intro to Data Readers", "Data Readers", "Dataprofiler", "Data Readers", "Avro Data", "Base Data", "CSV Data", "Data", "Data Utils", "Filepath Or Buffer", "Graph Data", "JSON Data", "Parquet Data", "Structured Mixins", "Text Data", "Dp Logging", "Labelers", "Base Data Labeler", "Base Model", "Char Load Tf Model", "Character Level Cnn Model", "Classification Report Utils", "Column Name Model", "Data Labelers", "Data Processing", "Labeler Utils", "Regex Model", "Utils", "Plugins", "Decorators", "Profilers", "Base Column Profilers", "Categorical Column Profile", "Column Profile Compilers", "Data Labeler Column Profile", "Datetime Column Profile", "Float Column Profile", "Graph Profiler", "Helpers", "Report Helpers", "Histogram Utils", "Int Column Profile", "JSON Decoder", "JSON Encoder", "Numerical Column Stats", "Order Column Profile", "Profile Builder", "Profiler Options", "Profiler Utils", "Text Column Profile", 
"Unstructured Labeler Profile", "Unstructured Text Profile", "Utils", "Reports", "Graphs", "Utils", "Rng Utils", "Settings", "Validators", "Base Validators", "Version", "Examples", "Graph Pipeline Demo", "Graphs", "Data Profiler | What\u2019s in your data?", "Install", "Sensitive Data Detection with the Labeler", "Merge List of Profiles", "dataprofiler", "Data Profiler - What\u2019s in your data?", "Dataloader with Popmon Reports", "Profiler", "Structured Profilers", "Building a Regex Data Labeler w/ your own Regex", "Roadmap", "Unstructured Profilers"], "terms": {"The": [0, 1, 2, 3, 4, 5, 12, 20, 24, 25, 29, 47, 50, 51, 52, 63, 66, 67, 68, 69, 70, 73, 74, 75, 76, 77, 79], "split": [0, 1, 21, 28, 66, 68, 70, 75, 77], "4": [0, 2, 4, 29, 67, 68, 70, 71, 73, 75, 76, 77, 79], "main": [0, 1, 3, 4, 25, 44, 67], "compon": [0, 1, 21, 24, 25, 27, 46, 66, 70, 73, 74, 75, 76, 79], "profil": [0, 1, 2, 3, 4, 6, 7, 42, 43, 46, 47, 48, 65, 67, 69, 70, 72, 74], "label": [0, 1, 6, 22, 23, 24, 25, 26, 28, 30, 31, 34, 50, 51, 52, 65, 69, 72, 73, 74, 75, 76, 79], "data": [0, 1, 6, 17, 20, 22, 23, 24, 25, 26, 29, 30, 33, 34, 35, 36, 37, 39, 40, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 58, 63, 65, 69, 72], "reader": [0, 1, 6, 12, 65, 68, 70, 72, 74, 75], "valid": [0, 1, 3, 5, 6, 8, 10, 12, 15, 16, 21, 22, 23, 24, 27, 51, 68, 72, 73, 74, 75, 76, 79], "view": [1, 2, 4, 52, 66, 70, 71, 73, 74, 76, 77, 79], "thi": [1, 2, 3, 4, 7, 8, 9, 10, 12, 14, 15, 16, 18, 24, 25, 26, 28, 29, 30, 36, 37, 38, 39, 40, 42, 44, 45, 47, 48, 49, 50, 51, 52, 53, 63, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 79], "notebook": [1, 2, 4, 66, 70, 71, 73, 74, 76, 77, 78, 79], "github": [1, 2, 4, 29, 44, 66, 69, 70, 71, 73, 74, 76, 77, 78, 79], "consid": [1, 12, 29, 30, 51, 53, 75], "case": [1, 4, 25, 28, 49, 51, 52, 53, 68, 74, 75, 77], "when": [1, 3, 4, 22, 23, 24, 25, 26, 28, 29, 30, 38, 46, 50, 51, 66, 68, 74, 75, 78, 79], "we": [1, 2, 4, 12, 36, 51, 66, 70, 73, 74, 75, 76, 77, 78, 79], "would": [1, 3, 47, 70, 75, 77], "like": [1, 3, 24, 25, 43, 52, 66, 73, 75, 76, 79], "explor": [1, 68], "differ": [1, 4, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 52, 53, 54, 55, 63, 65, 68, 70, 73, 74], "neural": 1, "network": [1, 24], "evalu": [1, 2, 3, 29, 77], "perform": [1, 24, 25, 29, 52, 70, 75, 78], "dataprofil": [1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 60, 61, 63, 65, 66, 67, 68, 69, 70, 71, 73, 75, 76, 77, 79], "librari": [1, 2, 3, 21, 27, 28, 65, 66, 68, 70, 73, 77, 78], "alreadi": [1, 24, 29, 69], "contain": [1, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 35, 36, 38, 39, 43, 44, 46, 47, 49, 51, 52, 58, 59, 64, 67, 68, 70, 73, 75, 76, 79], "preprocessor": [1, 2, 20, 21, 27, 28, 70], "postprocessor": [1, 2, 20, 21, 27, 28, 65, 70, 75], "cnn": [1, 6, 20], "convolut": 1, "ar": [1, 2, 3, 4, 5, 10, 12, 18, 22, 23, 24, 25, 26, 28, 29, 30, 36, 47, 50, 51, 52, 67, 68, 69, 70, 73, 74, 75, 76, 77, 79], "combin": [1, 4, 10, 52, 73], "work": [1, 12, 14, 29, 51, 66, 67, 68, 73, 74, 75, 76, 79], "all": [1, 4, 8, 9, 10, 12, 14, 15, 16, 18, 22, 23, 24, 25, 26, 28, 29, 30, 38, 47, 50, 51, 52, 53, 67, 69, 70, 71, 73, 74, 75, 76, 78], "need": [1, 2, 3, 12, 19, 21, 22, 23, 24, 26, 30, 51, 52, 66, 68, 70, 73, 74, 76, 77, 78, 79], "build": [1, 12, 24, 25, 48, 50, 63, 65], "addit": [1, 4, 12, 29, 68, 70, 73, 75, 76, 79], "class": [1, 3, 4, 5, 8, 9, 
10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 63, 65, 66, 68, 70, 75, 76, 79], "inherit": [1, 21, 70], "function": [1, 3, 4, 12, 13, 17, 21, 24, 25, 26, 28, 29, 31, 33, 40, 41, 42, 43, 44, 45, 47, 48, 51, 52, 53, 58, 59, 66, 67, 70, 71, 73, 74, 75, 76, 77, 79], "from": [1, 2, 3, 4, 5, 8, 10, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 65, 68, 71, 73, 74, 75, 76, 77, 78, 79], "also": [1, 3, 4, 5, 8, 10, 14, 15, 16, 18, 24, 25, 51, 63, 66, 67, 68, 70, 73, 75, 76, 77, 79], "adapt": [1, 29], "construct": [1, 3, 46, 51], "desir": [1, 4, 50, 68, 73, 76, 77, 79], "architectur": [1, 3], "In": [1, 2, 3, 4, 12, 25, 28, 52, 68, 70, 73, 75, 76, 77, 79], "exampl": [1, 2, 3, 4, 5, 24, 25, 29, 30, 47, 63, 66, 68, 70, 71, 73, 75, 76, 77, 78, 79], "defin": [1, 5, 24, 48, 51, 67, 75, 77], "us": [1, 3, 4, 8, 10, 12, 13, 14, 15, 16, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 38, 46, 47, 50, 51, 52, 60, 65, 66, 67, 68, 69, 70, 73, 75, 76, 79], "particular": [1, 2, 12, 73], "long": 1, "short": 1, "term": [1, 3, 76, 79], "memori": [1, 5, 8, 10, 15, 16, 18, 52], "train": [1, 21, 22, 23, 24, 27, 29, 65, 66, 68, 76, 79], "tabular": [1, 3, 66, 70], "process": [1, 3, 6, 12, 20, 22, 26, 52, 63, 73, 75, 76, 77, 79], "includ": [1, 2, 21, 23, 24, 25, 27, 29, 51, 69, 73, 76, 77, 78, 79], "follow": [1, 3, 4, 5, 19, 20, 21, 25, 27, 28, 52, 66, 67, 70, 73, 74, 75, 76, 79], "step": [1, 2, 29, 69, 78], "load": [1, 5, 6, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 24, 26, 27, 28, 30, 35, 36, 37, 38, 39, 40, 41, 45, 46, 48, 49, 50, 51, 52, 53, 65, 74], "swap": [1, 3], "given": [1, 3, 4, 8, 10, 11, 12, 13, 16, 19, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 34, 37, 46, 47, 50, 52, 59, 63, 68, 69, 70, 73, 75, 76, 77, 79], "first": [1, 2, 4, 10, 15, 24, 38, 52, 66, 67, 68, 70, 73, 74, 75, 76, 77], "let": [1, 2, 66, 70, 71, 73, 76, 77, 79], "s": [1, 2, 3, 5, 12, 22, 24, 28, 40, 43, 45, 48, 51, 52, 53, 65, 66, 67, 70, 71, 75, 76, 78, 79], "import": [1, 2, 3, 4, 5, 32, 47, 65, 66, 68, 70, 73, 74, 75, 76, 77, 79], "os": [1, 2, 4, 66, 68, 70, 71, 73, 74, 76, 77, 79], "sy": [1, 2, 4, 31, 59, 66, 70, 71, 73, 74, 76, 77, 79], "json": [1, 2, 3, 4, 5, 6, 7, 12, 16, 34, 35, 36, 37, 38, 39, 40, 45, 48, 49, 50, 51, 53, 68, 70, 71, 73, 74, 75, 76, 77, 79], "panda": [1, 2, 5, 10, 28, 35, 36, 37, 38, 39, 40, 45, 48, 49, 50, 52, 53, 55, 63, 65, 67, 70, 71, 73, 74, 76, 77, 78, 79], "pd": [1, 2, 4, 12, 21, 22, 27, 28, 50, 63, 67, 68, 70, 71, 73, 74, 75, 76, 77, 79], "path": [1, 2, 3, 4, 5, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 26, 27, 28, 30, 41, 50, 54, 66, 70, 71, 73, 74, 75, 76, 77, 78, 79], "insert": [1, 2, 4, 47, 52, 66, 68, 70, 71, 73, 74, 76, 77, 79], "0": [1, 2, 3, 4, 5, 12, 13, 14, 21, 24, 25, 28, 29, 36, 40, 45, 47, 48, 50, 51, 53, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 79], "dp": [1, 2, 3, 4, 5, 6, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79], "structur": [1, 4, 5, 6, 7, 12, 13, 25, 27, 28, 38, 40, 45, 47, 49, 50, 51, 53, 65, 68, 79], "aw": [1, 12, 70, 78], "honeypot": [1, 70], "test": [1, 2, 4, 8, 12, 15, 16, 40, 45, 47, 48, 52, 53, 66, 70, 73, 74, 75, 76, 77, 79], "folder": [1, 2, 32, 69, 70], "read": [1, 5, 8, 10, 11, 12, 13, 14, 15, 16, 18, 25, 65, 68, 73, 74, 75, 76, 78, 79], "next": [1, 4, 70, 74, 76, 79], "section": [1, 4, 68, 70, 71, 73, 75, 76, 79], "input": [1, 3, 4, 8, 9, 10, 11, 12, 13, 14, 
15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 51, 52, 66, 70, 73, 75, 76, 79], "csv": [1, 3, 4, 5, 6, 7, 12, 14, 66, 68, 70, 73, 74, 75, 76, 79], "aws_honeypot_marx_geo": [1, 4, 73, 74, 76], "df_data": [1, 70], "set": [1, 2, 3, 4, 5, 6, 8, 12, 16, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 51, 52, 59, 63, 65, 68, 70, 72, 73, 74, 76, 79], "split_ratio": [1, 70], "2": [1, 2, 3, 4, 21, 25, 27, 29, 36, 37, 50, 51, 52, 63, 67, 68, 70, 71, 73, 75, 76, 77], "sampl": [1, 3, 5, 12, 18, 21, 22, 23, 24, 25, 28, 38, 39, 40, 45, 50, 51, 52, 53, 68, 70, 73, 76, 77, 79], "frac": [1, 29, 70], "1": [1, 2, 4, 5, 12, 14, 21, 25, 28, 29, 36, 38, 40, 45, 48, 51, 53, 63, 67, 68, 70, 71, 73, 75, 76, 77], "reset_index": [1, 70], "drop": [1, 50, 70], "true": [1, 2, 3, 4, 5, 9, 12, 13, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 36, 40, 45, 47, 48, 51, 52, 53, 66, 68, 70, 74, 75, 77, 79], "data_train": [1, 70], "int": [1, 5, 6, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 34, 35, 36, 38, 39, 40, 42, 43, 47, 48, 49, 50, 51, 52, 53, 63, 67, 68, 70, 73, 75, 76, 79], "len": [1, 4, 70, 73, 76, 79], "data_test": [1, 70], "head": [1, 4, 68, 70, 73, 75], "characterlevelcnnmodel": [1, 3, 20, 24, 28, 70], "some": [1, 67, 70, 73, 74, 75, 78], "modif": 1, "__init__": [1, 3], "add": [1, 3, 17, 21, 22, 23, 24, 26, 27, 30, 52, 68, 70, 76, 78, 79], "paramet": [1, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 63, 65, 66, 70, 75, 77], "size_lstm": 1, "rec_dropout": 1, "activ": [1, 4, 69, 75], "recurrent_activ": 1, "specifi": [1, 2, 3, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 21, 25, 27, 28, 43, 47, 50, 51, 52, 63, 65, 66, 67, 73, 74, 76, 77], "number": [1, 3, 12, 21, 22, 23, 24, 25, 27, 28, 29, 42, 43, 50, 51, 52, 60, 63, 64, 66, 68, 70, 73, 75, 76, 78, 79], "layer": [1, 24, 29], "recurr": 1, "dropout": [1, 23, 24], "ratio": [1, 36, 39, 40, 45, 51, 53, 73, 75, 76], "_validate_paramet": [1, 3], "check": [1, 2, 3, 8, 10, 12, 14, 16, 21, 27, 31, 36, 40, 45, 47, 48, 51, 53, 59, 63, 65, 70, 71, 73, 75], "_construct_model": [1, 3], "tensorflow": [1, 29, 52, 68, 69, 70, 71, 76, 79], "tf": [1, 6, 20, 24, 29, 70, 71, 76, 79], "numpi": [1, 12, 23, 24, 28, 40, 44, 45, 48, 52, 53, 68], "np": [1, 12, 21, 22, 23, 24, 25, 27, 28, 40, 45, 48, 50, 52, 53, 68], "character_level_cnn_model": [1, 3, 24, 70], "create_glove_char": [1, 24], "build_embd_dictionari": [1, 24], "base_model": [1, 3, 21, 22, 27], "basemodel": [1, 3, 21, 22, 23, 24, 26, 27, 30, 70], "labeler_util": [1, 29], "f1score": [1, 29], "characterlevellstmmodel": 1, "deriv": [1, 78], "boolean": [1, 2, 5, 12, 13, 24, 29, 35, 36, 37, 38, 39, 45, 49, 51, 54, 75], "map": [1, 2, 3, 22, 23, 24, 26, 28, 30, 38, 50, 51, 70, 75, 77], "requir": [1, 3, 4, 26, 28, 29, 30, 50, 51, 52, 67, 68, 69, 70, 75, 76, 77, 79], "index": [1, 3, 5, 12, 14, 21, 22, 23, 24, 26, 27, 28, 29, 30, 49, 50, 52, 67, 75, 76], "reserv": [1, 29], "requires_zero_map": [1, 22, 23, 24, 26, 30], "def": [1, 4, 24, 70, 74], "self": [1, 24, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 51, 53, 55], "label_map": [1, 2, 21, 22, 23, 24, 26, 27, 28, 30, 70, 77], "none": [1, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 35, 36, 37, 38, 39, 40, 41, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 63, 67, 68, 69, 70, 74, 75, 76, 78, 79], "initi": [1, 3, 5, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 
26, 27, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 51, 52, 53, 54, 55, 63], "setdefault": [1, 52], "max_length": [1, 23, 24, 26, 28, 30, 70], "3400": [1, 28], "max_char_encoding_id": [1, 23, 24], "127": 1, "dim_emb": [1, 23, 24, 26, 30], "64": 1, "size_fc": [1, 23, 24], "32": [1, 21, 23, 24, 27, 28], "tanh": 1, "sigmoid": 1, "default_label": [1, 2, 23, 24, 27, 28, 30, 70, 77], "unknown": [1, 3, 28, 29, 68, 70], "pad_label": [1, 28], "pad": [1, 28, 29, 70], "_epoch_id": 1, "reconstruct": [1, 3, 70], "flag": [1, 2, 3, 21, 22, 23, 24, 26, 27, 28, 30, 35, 36, 37, 38, 39, 45, 49, 50, 51, 54], "_model_num_label": 1, "_model_default_ind": 1, "sent": [1, 21, 70, 75], "rais": [1, 4, 12, 25, 38, 46, 47, 50, 51, 52, 73], "error": [1, 4, 19, 21, 27, 38, 51, 52, 69, 70, 71, 73, 76, 78, 79], "invalid": 1, "present": [1, 25, 48, 52, 75], "list_of_necessary_param": 1, "make": [1, 3, 24, 48, 68, 70, 73], "sure": [1, 24], "necessari": [1, 66, 71], "param": [1, 8, 10, 16, 21, 22, 27, 29, 35, 36, 37, 38, 39, 40, 45, 46, 48, 49, 50, 51, 53, 63], "size_conv": [1, 23, 24], "isinst": 1, "float": [1, 3, 6, 12, 21, 22, 23, 24, 25, 28, 29, 34, 36, 38, 39, 41, 45, 47, 48, 49, 50, 51, 52, 53, 63, 66, 67, 68, 70, 75, 76, 79], "append": [1, 70, 76, 79], "must": [1, 3, 5, 19, 21, 22, 23, 24, 27, 28, 38, 50, 75, 76, 77, 79], "integ": [1, 2, 3, 14, 23, 24, 28, 30, 40, 45, 47, 48, 53, 67, 68, 70, 75, 77], "greater": [1, 2, 29, 75], "than": [1, 2, 12, 28, 29, 38, 75, 77], "elif": 1, "list": [1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 36, 38, 40, 45, 47, 48, 50, 51, 52, 53, 54, 59, 63, 65, 66, 67, 68, 70, 73, 75, 76, 77, 78, 79], "non": [1, 24, 28, 47, 50, 70, 75], "empti": [1, 4, 52, 73, 75], "els": [1, 52, 73, 74, 76, 79], "item": [1, 12, 47, 52, 63, 70], "break": 1, "str": [1, 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 35, 36, 37, 38, 39, 40, 41, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 63, 67, 68, 70, 75, 76, 79], "string": [1, 2, 3, 4, 5, 12, 13, 14, 17, 24, 25, 26, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 63, 68, 70, 73, 74, 75, 76, 77, 79], "extra": [1, 4, 10, 66, 70, 71, 76, 79], "thrown": 1, "an": [1, 4, 8, 9, 10, 12, 14, 15, 16, 17, 18, 22, 24, 25, 28, 29, 30, 36, 40, 45, 47, 48, 49, 52, 53, 63, 66, 68, 70, 71, 73, 74, 75, 76, 78, 79], "accept": [1, 3, 14, 29, 75], "valueerror": [1, 12, 46, 47], "n": [1, 3, 4, 5, 12, 14, 26, 28, 30, 36, 70, 73, 75], "join": [1, 4, 8, 15, 28, 43, 66, 73, 74, 76, 79], "constructor": [1, 47], "serv": 1, "weight": [1, 3, 21, 22, 23, 24, 25, 26, 29, 30], "reset": [1, 21, 22, 23, 24, 26, 29, 30, 68], "return": [1, 2, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 59, 63, 68, 70, 74, 75, 76, 79], "num_label": [1, 22, 23, 24, 26, 29, 30], "default_ind": [1, 24], "_paramet": [1, 2, 77], "kera": [1, 24, 29], "backend": 1, "clear_sess": 1, "gener": [1, 8, 9, 10, 12, 14, 15, 16, 18, 28, 29, 33, 35, 37, 38, 39, 42, 43, 49, 50, 51, 52, 57, 58, 60, 66, 68, 73, 74, 75, 76, 77, 79], "glove": [1, 24], "embed": [1, 24, 28], "_model": 1, "sequenti": [1, 73], "default": [1, 2, 3, 4, 5, 10, 12, 24, 25, 26, 28, 29, 46, 47, 50, 51, 52, 66, 68, 69, 70, 74, 75, 77], "encod": [1, 3, 4, 6, 8, 9, 10, 12, 13, 14, 15, 16, 18, 21, 22, 23, 24, 26, 27, 28, 29, 30, 34, 46, 68, 70, 75, 76, 79], "encoding_funct": 1, 
"input_str": [1, 30], "char_in_vector": 1, "_char_encoding_lay": 1, "shape": [1, 24, 25, 28, 29, 66, 75], "dtype": [1, 12, 24, 29, 67, 77], "lambda": 1, "output_shap": 1, "tupl": [1, 12, 22, 23, 24, 25, 28, 29, 47, 50, 52], "creat": [1, 2, 3, 11, 12, 22, 24, 27, 28, 30, 41, 46, 52, 60, 65, 66, 70, 74, 75, 76, 77, 78, 79], "pre": [1, 24, 28, 51, 65, 68, 75], "matrix": [1, 25, 29, 51, 52, 68, 75, 76], "indic": [1, 22, 23, 24, 25, 26, 29, 30, 50, 52, 75], "rang": [1, 14, 29, 63, 75, 76, 79], "one": [1, 3, 5, 12, 19, 24, 25, 36, 37, 50, 68, 69, 70, 73, 74, 75, 76, 77, 78, 79], "out": [1, 2, 3, 5, 12, 24, 29, 52, 70, 71, 75, 78], "vocabulari": 1, "embed_fil": 1, "reduc": [1, 52], "d": [1, 48, 52, 67, 71], "txt": [1, 4, 5, 44, 68, 69, 73, 75, 76, 79], "format": [1, 3, 4, 5, 8, 9, 10, 14, 15, 16, 18, 21, 24, 25, 27, 28, 38, 39, 46, 49, 50, 51, 70, 73, 74, 75, 76, 77, 79], "embedding_matrix": 1, "zero": [1, 24, 40, 45, 48, 53, 75], "embedding_dict": 1, "input_shap": [1, 24], "fill": 1, "space": 1, "0s": 1, "ascii_num": 1, "chr": 1, "input_length": 1, "trainabl": [1, 3, 21, 24, 27, 70], "size": [1, 12, 21, 27, 28, 50, 51, 52, 66, 73], "unit": [1, 52, 69, 78], "recurrent_dropout": 1, "return_sequ": 1, "fulli": [1, 24, 75], "connect": [1, 21, 24, 27], "dens": 1, "relu": 1, "final": [1, 2, 24, 74], "softmax": 1, "output": [1, 2, 3, 5, 12, 14, 21, 24, 25, 27, 28, 29, 38, 39, 41, 47, 49, 50, 52, 54, 55, 66, 70, 74, 75, 76, 77, 79], "pb": 1, "file": [1, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 27, 29, 41, 44, 50, 64, 65, 66, 69, 70, 73, 74, 76, 78, 79], "argmax_lay": [1, 24], "argmax": [1, 24, 28, 29], "confid": [1, 2, 3, 22, 23, 24, 26, 28, 30, 51, 70, 75, 77], "final_predicted_lay": 1, "_argmax_threshold_lay": 1, "threshold": [1, 24, 28, 29, 51, 52, 75], "argmax_output": 1, "compil": [1, 6, 34, 46], "softmax_output_layer_nam": 1, "name": [1, 4, 6, 12, 14, 19, 20, 21, 24, 25, 27, 28, 29, 31, 33, 35, 36, 37, 38, 39, 40, 45, 46, 48, 49, 51, 52, 53, 55, 59, 65, 66, 67, 68, 69, 70, 73, 75, 76], "loss": [1, 24], "categorical_crossentropi": 1, "f1": [1, 22, 23, 24, 25, 29, 70], "score": [1, 25, 26, 29, 70], "metric": [1, 24, 25, 29, 51, 75], "f1_score_train": 1, "num_class": [1, 29], "averag": [1, 25, 29, 38, 66, 75], "micro": [1, 25, 29], "acc": 1, "optim": [1, 23, 24, 75], "adam": 1, "onc": [1, 4, 75, 76, 79], "built": [1, 20, 24, 28], "replac": [1, 8, 9, 10, 14, 15, 16, 18, 30, 65, 75], "which": [1, 2, 3, 4, 5, 8, 9, 10, 12, 14, 15, 16, 18, 21, 24, 25, 28, 36, 38, 39, 40, 45, 47, 48, 49, 51, 52, 53, 68, 70, 73, 74, 75, 76, 77, 79], "note": [1, 3, 12, 24, 25, 29, 44, 53, 69, 75, 76, 77, 79], "abov": [1, 5, 29, 66, 68, 69, 70, 75], "its": [1, 3, 4, 12, 21, 24, 25, 27, 48, 50, 63, 68, 75, 77], "updat": [1, 3, 24, 25, 29, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 51, 52, 53, 54, 55, 65, 67], "column": [1, 3, 5, 6, 10, 12, 14, 16, 20, 25, 28, 34, 46, 50, 51, 52, 54, 55, 63, 65, 67, 68, 70, 73, 74, 75, 76, 77, 79], "while": [1, 3, 4, 25, 73, 77], "get": [1, 3, 8, 9, 10, 14, 15, 16, 18, 22, 23, 24, 26, 28, 29, 30, 40, 45, 47, 48, 52, 53, 63, 67, 69, 70, 73, 74, 76, 77, 79], "value_label_df": 1, "melt": [1, 70], "valu": [1, 2, 4, 10, 12, 13, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 51, 52, 53, 68, 70, 74, 75, 76, 77, 78, 79], "order": [1, 3, 6, 22, 23, 24, 25, 26, 28, 30, 34, 36, 37, 38, 51, 52, 68, 70, 76, 77], "astyp": [1, 70], "uniqu": [1, 4, 29, 36, 51, 52, 70, 75, 76, 79], "tolist": [1, 70], "comment": 1, "data_label": [1, 2, 3, 27, 38, 51, 
54, 68, 70, 73, 75, 76, 77, 79], "labeler_typ": [1, 3, 27, 70], "set_model": [1, 3, 21, 27, 70], "processor_param": 1, "_preprocessor": 1, "set_param": [1, 2, 3, 21, 22, 23, 24, 26, 27, 28, 30, 70, 77], "_postprocessor": 1, "save_dirpath": [1, 3, 27, 70], "data_labeler_sav": [1, 70], "makedir": [1, 70, 74], "epoch": [1, 3, 21, 22, 23, 24, 27, 29, 70], "fit": [1, 3, 21, 22, 23, 24, 38, 70], "x": [1, 3, 4, 12, 21, 24, 40, 45, 48, 53, 63, 69, 70, 75], "y": [1, 3, 4, 21, 69, 70], "save_to_disk": [1, 2, 3, 21, 22, 23, 24, 26, 27, 28, 30, 70, 77], "provid": [1, 2, 4, 5, 12, 27, 34, 42, 50, 52, 65, 66, 67, 68, 70, 71, 73, 75, 76, 79], "predict": [1, 3, 21, 22, 23, 24, 26, 27, 28, 29, 30, 34, 38, 65, 68, 73, 75, 76, 79], "option": [1, 2, 3, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 23, 24, 25, 26, 27, 28, 29, 30, 34, 35, 36, 37, 38, 39, 40, 41, 45, 46, 48, 49, 50, 52, 53, 54, 55, 63, 65, 66, 67, 68, 70, 74, 77, 78], "except": [1, 2, 3, 4, 28, 29, 52, 66, 70, 71, 74, 76, 77, 79], "disabl": [1, 26, 35, 36, 37, 38, 39, 45, 48, 49, 51, 54, 68, 69, 70, 73, 74, 75], "sake": [1, 70], "result": [1, 2, 3, 25, 28, 29, 37, 50, 52, 63, 68, 70, 73, 75, 76, 77], "columnar": 1, "where": [1, 2, 3, 5, 12, 21, 22, 23, 24, 26, 27, 28, 30, 36, 50, 75, 77, 78], "type": [1, 3, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 63, 65, 68, 70, 73, 74, 75], "second": [1, 38, 52, 68, 73, 77], "object": [1, 4, 5, 9, 11, 12, 13, 17, 21, 22, 24, 27, 28, 29, 37, 41, 43, 46, 47, 48, 50, 51, 52, 54, 55, 63, 67, 68, 70, 71, 73, 75, 76, 77, 78, 79], "profile_opt": [1, 70, 73, 75, 76, 77, 79], "profileropt": [1, 41, 50, 51, 70, 73, 74, 75, 76, 77, 79], "structured_opt": [1, 51, 70, 73, 75, 76, 77], "text": [1, 3, 4, 5, 6, 7, 10, 20, 23, 24, 25, 28, 29, 34, 48, 51, 68, 70, 73, 75, 76, 79], "is_en": [1, 51, 52, 68, 70, 73, 74, 75, 76, 79], "fals": [1, 2, 4, 5, 9, 12, 13, 21, 22, 23, 24, 25, 26, 27, 28, 30, 35, 36, 37, 38, 39, 40, 41, 45, 47, 48, 49, 50, 51, 53, 54, 55, 66, 68, 70, 73, 74, 75, 76, 77, 79], "categori": [1, 36, 51, 52, 68, 70, 73, 75, 76], "datetim": [1, 3, 4, 6, 34, 51, 52, 66, 67, 68, 70, 74, 75], "data_labeler_object": [1, 51, 70, 75, 77], "get_structured_result": [1, 70], "col_report": 1, "data_stat": [1, 68, 70, 73, 75, 76, 77, 79], "column_nam": [1, 14, 63, 67, 68, 70, 75, 76], "df_result": [1, 70], "datafram": [1, 5, 8, 9, 10, 12, 14, 15, 16, 18, 21, 22, 23, 24, 26, 27, 28, 30, 35, 50, 52, 63, 65, 67, 70, 71, 73, 74, 76, 77, 78, 79], "report": [1, 6, 20, 29, 34, 35, 36, 37, 38, 39, 40, 41, 42, 45, 48, 49, 50, 53, 54, 55, 58, 59, 65, 66, 67, 68, 70, 71, 72, 73, 77], "print": [1, 2, 3, 4, 12, 21, 22, 23, 24, 26, 27, 29, 30, 47, 68, 70, 73, 74, 75, 76, 77, 79], "summari": [1, 3, 23, 24, 25, 70, 76, 79], "user": [1, 2, 3, 4, 5, 8, 10, 15, 16, 18, 21, 22, 23, 24, 25, 27, 30, 31, 50, 59, 66, 68, 70, 71, 73, 75, 76, 78, 79], "can": [1, 2, 3, 4, 5, 8, 9, 10, 15, 16, 17, 18, 21, 24, 25, 47, 50, 52, 53, 66, 67, 68, 69, 70, 73, 74, 75, 76, 77, 79], "own": [1, 70, 75], "plug": 1, "show": [1, 2, 25, 67, 70, 71, 73, 75, 77], "interest": [1, 73], "other": [1, 2, 23, 24, 37, 68, 70, 73, 75, 77, 78], "same": [1, 3, 4, 12, 21, 22, 23, 24, 25, 26, 27, 28, 30, 38, 51, 52, 66, 68, 73, 74, 75, 76, 79], "teach": [2, 36, 77], "how": [2, 3, 4, 5, 10, 22, 24, 36, 47, 63, 65, 66, 67, 70, 75, 77, 78], "columnnamemodel": [2, 26], "util": [2, 3, 4, 6, 7, 19, 20, 34, 37, 50, 57, 68, 70, 71, 72, 75, 77, 78], "run": 
[2, 3, 22, 23, 24, 31, 51, 59, 63, 65, 66, 69, 70, 73, 74, 76, 77, 78, 79], "pprint": [2, 66, 68, 76, 77, 79], "try": [2, 4, 29, 66, 70, 71, 73, 74, 76, 77, 79], "importerror": [2, 4, 66, 70, 71, 74, 76, 77, 79], "easiest": 2, "resourc": [2, 52, 74], "quickli": [2, 76, 79], "start": [2, 3, 13, 28, 30, 63, 70, 71, 77, 78], "ani": [2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 16, 17, 22, 23, 24, 26, 28, 29, 30, 35, 36, 37, 38, 39, 40, 45, 48, 49, 50, 51, 52, 53, 67, 68, 69, 70, 73, 74, 75, 76, 79], "avail": [2, 5, 44, 47, 67, 75, 76], "labeler_from_librari": 2, "datalabel": [2, 3, 21, 27, 38, 51, 65, 70, 75, 76], "column_name_label": 2, "ssn": [2, 3, 5, 68], "For": [2, 3, 4, 5, 22, 23, 25, 28, 36, 37, 40, 45, 47, 48, 50, 51, 52, 53, 55, 63, 66, 69, 70, 73, 75, 76, 77, 78, 79], "purpos": [2, 24, 76, 79], "here": [2, 4, 22, 24, 44, 52, 70, 73, 74, 76, 77, 78], "exst": [2, 77], "via": [2, 4, 5, 8, 10, 12, 15, 16, 18, 22, 23, 24, 26, 30, 68, 69, 75, 76, 77, 79], "command": [2, 5, 68, 74, 76, 77, 79], "bit": 2, "more": [2, 4, 9, 25, 28, 38, 65, 66, 68, 70, 73, 74, 75, 76, 78, 79], "detail": [2, 3, 4, 23, 24, 25, 50, 65, 73, 75, 76, 78, 79], "flow": 2, "true_positive_dict": 2, "attribut": [2, 14, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 51, 52, 53, 55, 65, 66, 75], "suffix": 2, "my_home_address": 2, "address": [2, 3, 68, 70, 77], "false_positive_dict": 2, "contract_numb": 2, "role": 2, "send_address": 2, "negative_threshold_config": 2, "50": [2, 29, 77], "positive_threshold_config": 2, "85": [2, 73], "include_label": 2, "3": [2, 3, 4, 25, 29, 63, 67, 68, 70, 71, 73, 74, 75, 76, 77], "processor": [2, 3, 20, 21, 27, 28, 70], "data_process": [2, 3, 21, 27, 28, 70, 77], "directpasspreprocessor": [2, 20, 28], "column_name_model": [2, 26], "post": [2, 4, 77], "columnnamemodelpostprocessor": [2, 28], "help": [2, 3, 21, 22, 23, 24, 26, 27, 28, 30, 70, 77], "below": [2, 4, 5, 24, 25, 28, 44, 51, 66, 67, 68, 70, 73, 74, 75, 76, 77, 79], "pass": [2, 3, 5, 8, 10, 12, 14, 15, 16, 18, 24, 71, 78], "stage": 2, "background": [2, 28, 30], "compare_neg": 2, "idea": [2, 52], "behind": 2, "filter": [2, 29], "possibl": [2, 3, 5, 8, 10, 14, 15, 16, 18, 23, 24, 26, 30, 32, 38, 52, 68, 75, 76, 77, 79], "posit": [2, 3, 24, 25, 40, 45, 48, 53, 66, 70], "similar": [2, 3, 26, 50, 75, 76, 79], "too": [2, 52, 68, 69], "close": 2, "being": [2, 4, 5, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 30, 38, 41, 50, 51, 75, 77], "remov": [2, 48, 50, 52, 69, 70, 71, 73, 75, 76, 79], "compare_posit": 2, "again": [2, 29, 66, 70], "dure": [2, 3, 24, 29, 47, 77, 78], "onli": [2, 4, 5, 8, 10, 14, 15, 16, 18, 22, 24, 25, 47, 52, 63, 67, 68, 70, 73, 74, 75, 76, 77, 78, 79], "those": [2, 70, 78], "equal": [2, 28, 51, 73, 75], "achiev": [2, 77], "dict": [2, 3, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 63, 68, 73, 75, 76, 77, 79], "between": [2, 3, 4, 25, 27, 28, 29, 37, 38, 39, 41, 49, 50, 51, 52, 54, 55, 66, 70, 74, 75, 76, 77, 79], "100": [2, 70, 76], "determin": [2, 3, 4, 8, 9, 10, 12, 13, 14, 15, 16, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 35, 36, 37, 38, 39, 45, 49, 51, 54, 68, 75, 76, 78, 79], "should": [2, 4, 24, 26, 30, 35, 36, 37, 38, 39, 45, 47, 49, 54, 63, 68, 73, 75, 76, 79], "set_label": [2, 21, 27, 77], "funky_on": 2, "funky_two": 2, "funky_thre": 2, "not_my_address": 2, "method": [2, 3, 4, 12, 22, 24, 25, 28, 36, 38, 39, 46, 48, 49, 50, 51, 52, 73, 75, 76, 79], "brand": 2, "As": 
[2, 4, 44, 70, 73, 76, 79], "see": [2, 25, 29, 46, 51, 63, 66, 68, 69, 73, 74, 75, 76, 77, 79], "throught": 2, "predict_opt": [2, 3, 21, 27, 70, 77], "show_confid": [2, 3, 21, 22, 23, 24, 26, 27, 30, 70, 77], "isdir": [2, 77], "new_column_name_label": 2, "mkdir": [2, 77], "saved_label": [2, 77], "load_from_disk": [2, 3, 21, 22, 23, 24, 26, 27, 28, 30, 77], "ensur": [2, 4, 12, 29, 38, 47, 49, 66, 70, 74, 75, 77], "parametesr": [2, 77], "what": [2, 3, 4, 12, 51, 52, 65, 76, 77, 78, 79], "nmodel": [2, 77], "__class__": [2, 77], "__name__": [2, 46, 77], "refer": [3, 25, 36, 47], "recognit": [3, 68, 69, 70, 75], "builtin": 3, "classifi": 3, "complex": 3, "dataset": [3, 4, 5, 8, 9, 10, 14, 15, 16, 17, 18, 21, 29, 34, 35, 36, 38, 39, 40, 41, 45, 48, 49, 50, 51, 52, 53, 54, 63, 65, 66, 68, 69, 70, 73, 74, 75, 76, 77, 79], "each": [3, 4, 12, 21, 22, 23, 24, 25, 27, 28, 30, 36, 38, 40, 43, 47, 52, 66, 68, 70, 73, 74, 75, 76, 77, 79], "howev": [3, 4, 73, 75, 76, 77], "allow": [3, 4, 19, 21, 22, 27, 28, 50, 52, 70, 73, 74, 75, 76, 77, 79], "well": [3, 12, 73, 75, 76, 77, 78, 79], "per": [3, 18, 25, 68, 70, 77], "cell": [3, 12, 66, 68, 70, 77], "row": [3, 4, 5, 10, 12, 51, 52, 68, 70, 73, 75, 76], "charact": [3, 4, 6, 10, 12, 14, 20, 28, 30, 43, 47, 50, 51, 65, 68, 70, 73, 75, 77], "level": [3, 6, 19, 20, 22, 23, 26, 28, 30, 43, 47, 51, 65, 67, 68, 70, 73, 75, 76], "ban": [3, 68], "bank": [3, 68], "account": [3, 25, 52, 68], "10": [3, 4, 12, 67, 68, 70, 74, 75], "18": [3, 68], "digit": [3, 25, 68, 75, 77], "credit_card": [3, 68], "email_address": [3, 68], "uuid": [3, 68], "hash_or_kei": [3, 68], "md5": [3, 68], "sha1": [3, 68], "sha256": [3, 68], "random": [3, 4, 5, 24, 28, 36, 50, 60, 68, 70, 74, 75, 77], "hash": [3, 51, 68, 75], "etc": [3, 4, 12, 28, 50, 52, 68, 73, 75, 76, 78, 79], "ipv4": [3, 68], "ipv6": [3, 68], "mac_address": [3, 68], "person": [3, 68, 70], "phone_numb": [3, 68], "url": [3, 4, 12, 68, 70], "us_stat": [3, 68], "drivers_licens": [3, 68], "date": [3, 4, 52, 68, 70, 74], "time": [3, 12, 24, 25, 35, 36, 38, 39, 40, 41, 45, 48, 49, 52, 53, 63, 66, 68, 74, 75, 76, 77, 79], "quantiti": [3, 68], "ordin": [3, 68], "your_data": 3, "belong": 3, "arrai": [3, 24, 25, 29, 47, 52, 68], "multipl": [3, 8, 9, 10, 14, 15, 16, 17, 18, 63, 67, 69, 70, 73, 75, 76, 77, 79], "tjohn": 3, "macklemor": 3, "tneed": 3, "tfood": 3, "tpleas": 3, "tcall": 3, "t555": 3, "301": 3, "1234": 3, "tssn": 3, "ti": 3, "tnot": 3, "t334": 3, "97": 3, "i": [3, 4, 21, 27, 36, 69, 70, 76, 79], "m": [3, 5, 12, 36, 52, 69], "000043219499392912": 3, "model_predict": 3, "final_result": 3, "pred": [3, 25, 28, 70], "final_confid": 3, "conf": [3, 28, 70], "It": [3, 5, 24, 29, 47, 66, 68, 70, 76, 79], "chang": [3, 4, 37, 40, 45, 48, 50, 53, 74, 75, 76, 77, 78, 79], "spaci": [3, 70], "ner": [3, 28, 70], "end": [3, 28, 30, 63, 70, 77], "output_format": [3, 28, 50, 68, 70, 73, 75, 76, 77, 79], "use_word_level_argmax": [3, 28, 70], "mechan": [3, 4], "Will": [3, 51], "your_fil": [3, 5, 68, 75], "train_structured_label": [3, 27, 70], "save": [3, 8, 9, 10, 15, 16, 18, 21, 22, 23, 24, 26, 27, 28, 30, 41, 50, 65, 74, 78], "my": [3, 4, 5, 75], "reus": [3, 52], "dirpath": [3, 21, 22, 23, 24, 26, 27, 28, 30, 70], "inform": [3, 4, 24, 29, 66, 70, 73, 75, 76, 78, 79], "about": [3, 4, 68, 70, 76, 78, 79], "w": [3, 30, 70], "By": [3, 4, 24, 25, 70, 75], "trainabledatalabel": [3, 21, 27], "illustr": [3, 4], "label1": 3, "label2": 3, "retrain": [3, 70], "interpret": [3, 4], "pleas": [3, 68], "expect": [3, 12, 46, 70, 73, 77], "current": [3, 5, 14, 
22, 23, 24, 26, 28, 30, 66, 67, 68, 75, 76, 77, 78, 79], "cannot": [3, 12, 67], "take": [3, 5, 12, 24, 25, 28, 47, 67, 68, 70, 71, 75, 76, 79], "ingest": [3, 76, 79], "two": [3, 4, 28, 38, 39, 40, 41, 45, 48, 49, 50, 52, 53, 54, 55, 66, 68, 71, 73, 74, 75, 76, 78, 79], "model_result": [3, 70], "validation_split": [3, 21, 70], "epoch_id": [3, 24], "new_label": [3, 70], "maintain": 3, "add_label": [3, 21, 22, 23, 24, 26, 27, 30, 70], "same_a": [3, 21, 22, 23, 24, 26, 27, 30], "label_nam": [3, 29], "abl": [3, 21, 66, 70], "continu": [3, 66, 75], "sinc": [3, 8, 16, 70], "graph": [3, 4, 5, 6, 7, 31, 34, 50, 57, 59, 65], "ha": [3, 4, 8, 10, 14, 15, 16, 25, 28, 30, 38, 46, 48, 52, 70, 73, 75, 76, 79], "pipelin": [3, 21, 27, 65, 68, 70], "specif": [3, 4, 5, 9, 12, 19, 25, 29, 33, 47, 67, 68, 70, 73, 74, 75, 76, 79], "param1": 3, "value1": [3, 46], "simultan": 3, "param2": 3, "value2": [3, 46], "param3": 3, "value3": 3, "To": [3, 47, 68, 69, 70, 73, 74, 76, 77, 79], "have": [3, 4, 12, 15, 21, 22, 23, 24, 26, 27, 28, 30, 38, 52, 63, 66, 68, 69, 70, 73, 74, 75, 77, 78], "you": [3, 4, 12, 22, 23, 24, 26, 27, 29, 30, 47, 51, 63, 66, 67, 68, 69, 70, 73, 74, 75, 76, 79], "structcharpreprocessor": [3, 20, 28, 70], "structcharpostprocessor": [3, 20, 28, 70], "set_preprocessor": [3, 21, 27, 70], "set_postprocessor": [3, 21, 27, 70, 77], "basic": [3, 70, 71], "compat": [3, 8, 9, 10, 14, 15, 16, 18, 68, 70, 71, 76, 78, 79], "check_pipelin": [3, 21, 27, 70], "overrid": [3, 24, 46, 50, 51], "abstract": [3, 9, 21, 22, 28, 35, 36, 37, 38, 39, 48, 49, 50], "review": [3, 76], "call": [3, 5, 12, 24, 25, 29, 46, 47, 48, 51, 52, 73, 76], "base": [3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 26, 27, 28, 29, 30, 34, 36, 37, 38, 39, 40, 41, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 62, 73, 75, 76, 78], "_need_to_reconstruct_model": 3, "e": [3, 4, 21, 24, 27, 52, 70, 75, 77], "_reconstruct_model": 3, "disk": [3, 20, 21, 22, 23, 24, 26, 27, 28, 30, 41, 50, 76, 79], "basedatapreprocessor": [3, 21, 27, 28], "execut": [3, 52, 74], "extran": 3, "calcul": [3, 25, 36, 39, 40, 41, 42, 43, 45, 48, 50, 51, 52, 53, 68, 75, 76, 78, 79], "convert": [3, 4, 12, 22, 24, 25, 26, 28, 29, 30, 40, 45, 48, 53, 70, 74, 77], "digest": 3, "iter": [3, 12, 21, 22, 26, 30, 52], "befor": [3, 51, 75, 77], "assist": 3, "_save_processor": 3, "serializ": [3, 24, 29, 50, 75, 76, 79], "nearli": 3, "ident": 3, "handl": [3, 12, 13, 24, 73], "basedatapostprocessor": [3, 21, 27, 28], "within": [4, 12, 19, 21, 27, 28, 29, 30, 38, 40, 45, 49, 51, 53, 54, 67, 75, 77], "5": [4, 25, 51, 52, 67, 68, 75], "tsv": [4, 68, 76, 79], "jsondata": [4, 8, 11, 15, 73], "parquetdata": [4, 11, 16, 68, 73, 75], "avrodata": [4, 8, 11, 73], "textdata": [4, 11, 18, 68, 73], "individu": [4, 12, 40, 45, 49, 53, 69, 70, 73, 77], "capabl": [4, 24], "auto": [4, 5, 11, 22, 51, 68, 75], "mydata": 4, "abc": [4, 22, 28, 70], "your": [4, 24, 70, 74, 76], "demonstr": [4, 67, 73], "data_fold": [4, 76], "csv_file": [4, 73], "skip": [4, 21, 27, 47, 73, 76], "author": [4, 29, 73, 76], "descript": [4, 5, 66, 73], "line": [4, 10, 12, 15, 18, 68, 70, 73, 75], "spars": [4, 73, 76], "last": [4, 12, 52, 67, 70, 73, 75, 76], "extens": [4, 73], "json_fil": [4, 73], "complex_nest": [4, 73], "honeypot_intentially_mislabeled_fil": [4, 73], "parquet_fil": [4, 73], "parquet": [4, 5, 6, 7, 12, 68, 69, 73, 75, 76, 79], "nation": [4, 73], "plain": [4, 70, 73], "intentionally_mislabled_fil": [4, 73], "avro_fil": [4, 73], "avro": [4, 5, 6, 7, 16, 68, 69, 73, 75, 76, 79], "userdata1": [4, 73], 
"userdata1_intentionally_mislabled_fil": [4, 73], "graph_fil": [4, 68], "graph_data_csv_identifi": [4, 66], "text_fil": [4, 68, 73], "discussion_reddit": [4, 73, 79], "all_fil": [4, 73], "filepath": [4, 6, 7, 19, 41, 50, 66, 75, 76, 79], "58": [4, 36], "80": [4, 29, 40, 45, 48, 53, 77], "65": 4, "15": [4, 51, 73], "data_typ": [4, 8, 9, 10, 11, 14, 15, 16, 18, 37, 51, 66, 68, 73, 74, 75, 76], "http": [4, 5, 29, 44, 52, 66, 69], "raw": [4, 70], "githubusercont": 4, "com": [4, 5, 29, 44, 52, 69, 70], "capitalon": [4, 69], "diamond": [4, 76], "give": [4, 28, 31, 59, 73, 77], "abil": [4, 68, 73, 76, 79], "want": [4, 12, 22, 23, 24, 26, 27, 30, 51, 66, 67, 68, 69, 73, 78], "doc": [4, 66], "io": 4, "html": [4, 66, 74], "data_read": [4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 41, 50, 68, 69, 73, 75], "later": [4, 68, 76, 79], "tutori": [4, 66], "discuss": 4, "addition": [4, 5, 50, 67, 73, 77], "directli": [4, 67, 68, 70, 73, 75, 76, 79], "thei": [4, 22, 23, 24, 26, 28, 30, 31, 59, 70, 73, 75, 76, 79], "choos": [4, 27, 28, 74, 75, 77], "csv_data": [4, 5, 10, 11, 68, 73, 75], "df": [4, 38, 39, 40, 45, 48, 49, 53, 55, 70, 71, 73], "properti": [4, 5, 8, 9, 10, 14, 15, 16, 18, 21, 22, 23, 24, 26, 27, 29, 30, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 51, 53, 54, 55, 65, 66, 68, 73, 75, 76, 79], "mai": [4, 5, 8, 10, 15, 16, 18, 29, 51, 73, 76, 79], "focu": 4, "input_file_path": [4, 5, 8, 9, 10, 11, 14, 15, 16, 17, 18], "file_encod": [4, 8, 9, 10, 12, 14, 15, 16, 18], "length": [4, 8, 9, 10, 12, 14, 15, 16, 18, 24, 28, 52, 66], "techniqu": 4, "whether": [4, 12, 13, 14, 15, 21, 22, 23, 24, 26, 27, 28, 30, 51, 73, 75], "supplement": 4, "add_true_false_color": 4, "green": 4, "red": 4, "x1b": 4, "92m": 4, "0m": 4, "31m": 4, "non_csv_fil": 4, "iri": [4, 76], "utf": [4, 12, 14, 75, 76], "8": [4, 12, 14, 68, 75, 76], "titan": 4, "parq": 4, "code": [4, 25, 44, 48, 52, 68, 70, 73, 74, 78], "sentenc": [4, 24, 28, 29, 79], "snappy_compressed_intentionally_mislabeled_fil": 4, "Is": 4, "There": [4, 68, 70, 75], "both": [4, 12, 29, 49, 51, 65, 70, 75, 76, 77, 79], "been": [4, 46], "correctli": 4, "were": [4, 28, 75, 77], "origin": [4, 12, 24, 28, 42, 43, 70, 73, 74], "record": [4, 5, 8, 10, 15, 16, 52, 79], "record_samples_per_lin": [4, 10], "commonli": [4, 76, 79], "g": [4, 24, 36, 52, 70, 75, 77], "carat": 4, "cut": [4, 68], "color": 4, "clariti": 4, "depth": [4, 65], "tabl": [4, 24], "price": 4, "z": [4, 69, 77], "23": 4, "ideal": 4, "si2": 4, "61": 4, "55": 4, "326": 4, "95": [4, 51, 75], "98": 4, "43": 4, "21": 4, "premium": 4, "si1": 4, "59": 4, "89": 4, "84": 4, "31": [4, 70], "good": [4, 74, 76], "vs1": 4, "56": 4, "9": [4, 68, 74, 77], "327": 4, "05": 4, "07": [4, 70, 74], "29": 4, "vs2": 4, "62": 4, "334": 4, "63": 4, "j": [4, 36], "335": 4, "34": 4, "35": 4, "75": [4, 28, 29], "blogpost": 4, "blog": 4, "subject": [4, 70], "field": 4, "monti": 4, "hall": 4, "meet": 4, "game": 4, "theori": 4, "13": [4, 70], "2014": 4, "statist": [4, 24, 34, 36, 40, 41, 50, 51, 52, 66, 68, 73, 76, 77, 79], "mathemat": [4, 12], "gaussian": 4, "quadratur": 4, "algorithm": 4, "notic": [4, 66, 75, 77], "becaus": [4, 25, 51, 52, 77], "These": [4, 24, 65, 67, 79], "previous": 4, "wa": [4, 24, 28, 29, 31, 46, 59, 66, 70, 75, 77], "shown": [4, 25, 66, 73, 75], "manual": [4, 24, 60], "mention": [4, 70], "preivous": 4, "finetun": 4, "manner": [4, 68, 75], "deciph": [4, 10, 14], "quot": [4, 10, 14], "locat": [4, 10, 12, 14, 21, 24, 27, 50, 73, 75, 76], "selected_column": [4, 5, 10, 12, 16], "entir": [4, 5, 8, 10, 15, 16, 75, 77], 
"dictionari": [4, 5, 12, 15, 21, 24, 25, 27, 29, 32, 35, 36, 37, 38, 39, 40, 43, 45, 47, 48, 49, 50, 51, 52, 53, 66, 68, 75, 76, 79], "choic": [4, 5, 8, 10, 15, 16, 18], "displai": [4, 25, 36, 51, 70, 75], "daili": 4, "sheet": 4, "singlequot": 4, "num_lin": 4, "open": [4, 12, 13, 68, 70, 74], "fp": 4, "readlin": 4, "intent": 4, "failur": [4, 51], "incorrect": [4, 36], "interept": 4, "delimti": 4, "faliur": 4, "bc": 4, "reach": [4, 75], "someth": [4, 70], "went": 4, "wrong": 4, "fail": 4, "attributeerror": 4, "over": [4, 21, 63], "best": [4, 5, 38, 68], "intention": 4, "incorrectli": [4, 73, 75], "happen": 4, "mani": [4, 5, 10, 22], "singl": [4, 10, 21, 22, 23, 24, 26, 27, 28, 30, 52, 68, 71, 77, 79], "conduct": [4, 28], "unstructur": [4, 6, 27, 28, 34, 50, 51, 65, 76, 77], "request": 4, "host": 4, "src": 4, "proto": 4, "networkx": [4, 5, 41, 66], "effort": 4, "prepar": [4, 70], "automaticali": 4, "graphprofil": [4, 41, 50, 66], "dataproil": 4, "kei": [4, 5, 8, 12, 15, 21, 27, 28, 41, 43, 47, 50, 52, 68, 75, 76, 78, 79], "off": [4, 51, 68, 73, 75, 76, 77, 79], "common": [4, 12, 51], "convent": 4, "node_id_dst": 4, "node_id_src": 4, "continuous_weight": 4, "categorical_statu": 4, "108": 4, "289": 4, "7": [4, 66, 68, 70, 74], "4448069": 4, "81": 4, "180": 4, "65064207": 4, "458": 4, "83": 4, "9959787": 4, "116": 4, "63359209": 4, "79": 4, "454": 4, "177": 4, "76715529": 4, "11": [4, 68, 70], "429": 4, "225": 4, "79556889": 4, "exactli": [4, 73], "implement": [4, 12, 14, 65, 66], "graph_data": [4, 11, 14, 41], "edg": [4, 14, 40, 45, 48, 53, 66, 68, 75], "itself": [5, 24, 36, 40, 45, 48, 53], "identifi": [5, 14, 30, 48, 50, 62, 63, 68, 70, 75, 76, 79], "easi": [5, 68, 70, 75], "just": [5, 73, 75, 76, 77, 79], "through": [5, 12, 32, 36, 70, 73], "support": [5, 24, 25, 29, 70, 75, 76, 79], "delimit": [5, 10, 12, 14, 65, 73, 76, 79], "A": [5, 12, 24, 25, 26, 28, 30, 44, 51, 65, 68, 73, 75, 76, 77], "point": [5, 24, 25, 68, 75, 76, 79], "you_websit": 5, "verify_ssl": 5, "variou": [5, 11], "pertain": [5, 8, 9, 10, 14, 15, 16, 17, 18], "data_format": [5, 8, 9, 10, 14, 15, 16, 18, 65, 79], "select": [5, 8, 10, 15, 16, 18, 28, 37, 65, 75, 79], "sample_nrow": [5, 10, 12, 16], "reservoir": [5, 12], "total": [5, 12, 24, 25, 75], "header": [5, 10, 12, 14, 16, 65, 73, 76], "detect": [5, 8, 11, 12, 16, 65, 66, 68, 69, 73, 75, 76, 77, 78, 79], "access": [5, 12, 19, 24, 65, 68, 75], "metadata": [5, 8, 15, 35, 36, 38, 39, 40, 45, 48, 49, 53], "data_and_metadata": [5, 8, 15], "flattened_datafram": [5, 15], "typic": [5, 12, 24, 52, 70], "found": [5, 38, 52, 70, 75, 77], "stream": [5, 12, 13, 73], "nest": [5, 43, 50, 52, 75, 76, 79], "payload": [5, 15], "respons": [5, 75], "200": 5, "selected_kei": [5, 8, 15], "payload_kei": [5, 15], "further": [5, 76, 79], "samples_per_lin": [5, 18], "chunk": [5, 12, 18, 47, 52], "bucket": [5, 51, 75, 78], "s3a": [5, 12], "file_nam": 5, "storage_opt": 5, "boto3": [5, 12], "If": [5, 12, 24, 25, 29, 31, 38, 47, 51, 52, 59, 66, 67, 68, 69, 74, 75], "variabl": [5, 24, 27, 29, 36, 40, 45, 48, 50, 51, 53, 67], "retriev": [5, 21, 22, 23, 24, 26, 27, 28, 30, 46, 75], "otherwis": [5, 9, 12, 25, 28, 38, 47, 51, 52, 75], "environ": [5, 73], "aws_access_key_id": [5, 12], "aws_secret_access_kei": [5, 12], "aws_session_token": [5, 12], "aws_region": 5, "east": [5, 12], "Or": [6, 7], "buffer": [6, 7, 12], "mixin": [6, 7, 40, 45, 51], "model": [6, 20, 21, 27, 28, 29, 48, 50, 63, 65, 68, 70, 75, 77], "char": [6, 12, 20, 22, 24, 26, 28, 30, 68, 76, 77, 79], "classif": [6, 20, 29, 32, 33, 36], 
"regex": [6, 12, 20, 28, 50, 65], "plugin": [6, 33, 72], "decor": [6, 22, 28, 32], "helper": [6, 34, 52, 70], "categor": [6, 34, 51, 66, 68, 75, 76, 77], "histogram": [6, 34, 40, 45, 48, 51, 53, 66, 68, 73, 75, 76], "decod": [6, 34, 47], "numer": [6, 34, 40, 45, 51, 67, 75, 76], "stat": [6, 34, 35, 40, 45, 51, 52, 66, 68, 75], "builder": [6, 34], "log": [6, 12, 66, 70, 71, 72, 75, 76, 79], "rng": [6, 24, 72], "version": [6, 29, 44, 46, 47, 69, 70, 72, 74, 76, 79], "packag": [6, 7, 29, 34, 42, 57, 62, 64, 68, 69, 71], "set_se": [6, 75], "seed": [6, 24, 51, 60], "design": [7, 12, 68, 70, 73, 75, 76, 78, 79], "wrap": 7, "spreadsheet": [8, 10, 15, 17], "avro_data": [8, 11, 73], "basedata": [8, 9, 10, 14, 15, 16, 18, 50], "instead": [8, 9, 10, 14, 15, 16, 17, 18, 21, 25, 27, 70], "classmethod": [8, 9, 10, 14, 15, 16, 18, 21, 22, 23, 24, 26, 27, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 51, 53], "is_match": [8, 9, 10, 14, 15, 16, 18, 36, 65], "file_path": [8, 10, 12, 14, 15, 16, 18], "union": [8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 21, 22, 23, 24, 25, 26, 27, 28, 30, 41, 50, 51, 52], "stringio": [8, 12, 13, 15, 16], "bytesio": [8, 11, 12, 13, 16], "bool": [8, 9, 10, 12, 14, 15, 16, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 51, 52, 53, 54, 55, 63, 68, 75, 76], "examin": [8, 10, 15, 16, 18, 76, 79], "frame": [8, 12, 15, 37, 78], "get_batch_gener": [8, 9, 10, 14, 15, 16, 18], "batch_siz": [8, 9, 10, 14, 15, 16, 18, 21, 22, 23, 24, 26, 27, 28, 30], "batch": [8, 9, 10, 14, 15, 16, 18, 21, 22, 23, 24, 27, 28, 68, 73, 75], "info": [8, 9, 10, 14, 15, 16, 18, 19, 70, 78], "is_structur": [8, 9, 10, 14, 15, 16, 18], "structuredprofil": [8, 9, 10, 14, 15, 16, 18, 50, 65], "reload": [8, 9, 10, 14, 15, 16, 18, 65], "new": [8, 9, 10, 14, 15, 16, 18, 21, 22, 23, 24, 26, 27, 30, 36, 52, 65, 66, 68, 70, 71, 75, 76, 79], "eras": [8, 9, 10, 14, 15, 16, 18], "exist": [8, 9, 10, 14, 15, 16, 18, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 46, 51, 52, 59, 65, 68, 70, 74, 75], "base_data": [9, 50], "mayb": 9, "subclass": [9, 13, 21, 22, 23, 24, 26, 27, 28, 30, 35, 36, 39, 46, 47, 48, 49, 53], "match": [9, 12, 21, 25, 27, 30, 39, 40, 45, 53, 66, 76, 77, 79], "csvdata": [10, 11, 65, 68, 73, 75], "spreadsheetdatamixin": [10, 15, 16, 17], "spreadsheetdata": [10, 15, 16], "quotechar": [10, 12, 14, 65], "1000": [10, 15, 51, 73, 75, 76], "factori": [11, 50, 52], "kind": [11, 29], "proper": [11, 74], "data_class": 11, "json_data": [11, 15, 73], "kwarg": [11, 22, 23, 24, 26, 28, 29, 30, 49, 52], "graphdata": [11, 14, 41, 65, 66], "parquet_data": [11, 16, 68, 73, 75], "text_data": [11, 18, 68, 73], "data_util": 12, "data_gener": 12, "data_list": 12, "generator_on_fil": 12, "file_object": 12, "byte": 12, "convert_int_to_str": 12, "00": [12, 29, 40, 45, 48, 53], "nan": [12, 47, 51], "properli": [12, 32, 51, 74], "unicode_to_str": 12, "ignore_dict": 12, "represent": [12, 46, 47], "unicod": 12, "jsontyp": 12, "ignor": [12, 25, 77], "json_to_datafram": 12, "json_lin": 12, "read_in_str": 12, "seri": [12, 21, 22, 23, 24, 26, 27, 30, 36, 37, 38, 39, 40, 45, 48, 49, 50, 52, 53, 54, 55, 67, 68, 76, 79], "repres": [12, 29, 36, 38, 39, 40, 45, 46, 48, 49, 50, 53, 66], "read_json_df": 12, "sourc": [12, 14, 24, 69, 70], "either": [12, 21, 27, 29, 52, 66, 69, 76, 79], "therefor": 12, "pretti": [12, 47, 50, 68, 73, 75, 76, 79], "read_json": 12, "textiowrapp": 12, "logic": 12, "wrapper": [12, 27], "rsampl": 12, "arg": [12, 24, 28, 29, 49, 52], "read_csv_df": 12, "form": [12, 69], "separ": 
[12, 28, 43, 47, 71, 73, 77, 78], "convert_unicode_col_to_utf8": 12, "input_df": 12, "correct": [12, 75, 77], "sample_parquet": 12, "read_parquet_df": 12, "group": [12, 42, 43, 51, 52, 75], "read_text_as_list_of_str": 12, "rel": 12, "detect_file_encod": 12, "buffer_s": 12, "1024": 12, "max_lin": 12, "20": [12, 52, 75], "detect_cell_typ": 12, "get_delimiter_regex": 12, "pattern": [12, 30], "ad": [12, 21, 22, 23, 24, 26, 27, 30, 32, 50, 65, 76, 79], "find_nth_loc": 12, "search_queri": 12, "ignore_consecut": 12, "search": [12, 29], "nth": 12, "queri": 12, "occur": [12, 36, 63, 70, 75], "less": [12, 52, 74, 75], "loc": [12, 66, 75], "find": [12, 14, 25, 29, 35, 36, 37, 40, 41, 45, 48, 50, 52, 53, 54, 55, 68, 75], "occurr": [12, 25, 75], "consecut": 12, "idx": 12, "rtype": [12, 24, 28, 29, 38, 39, 40, 45, 48, 49, 51, 52, 53], "id_count": 12, "identif": [12, 30], "prior": [12, 31, 59, 76, 78, 79], "load_as_str_from_fil": 12, "max_byt": 12, "65536": 12, "chunk_size_byt": 12, "up": [12, 65], "OR": [12, 29], "byte_s": 12, "maximum": [12, 24, 28, 51, 75], "everi": [12, 75, 76, 79], "is_valid_url": 12, "url_as_str": 12, "typing_extens": [12, 13], "typeguard": [12, 13], "url_to_byt": 12, "download": [12, 74], "s3helper": 12, "amazon": 12, "s3": [12, 78], "uri": 12, "client": 12, "static": [12, 28, 38, 40, 45, 48, 50, 53], "is_s3_uri": 12, "logger": [12, 19, 29], "prefix": 12, "instanc": [12, 19, 21, 24, 25, 27, 29, 36, 46, 75], "create_s3_cli": 12, "region_nam": 12, "id": [12, 70], "secret": 12, "session": 12, "token": [12, 18], "temporari": 12, "credenti": 12, "region": 12, "get_s3_uri": 12, "s3_uri": 12, "s3_client": 12, "content": [12, 47, 70], "servic": 12, "filepath_or_buff": 13, "is_stream_buff": 13, "argument": [13, 21, 24, 27, 52], "fileorbufferhandl": 13, "open_method": 13, "r": [13, 30, 69, 77], "seek_offset": 13, "seek_whenc": 13, "alwai": [13, 53], "readabl": [13, 25, 52, 70, 75], "context": 13, "manag": [13, 24], "describ": [13, 21, 22, 23, 24, 25, 26, 27, 28, 30, 41, 75], "mode": [13, 24, 28, 40, 45, 48, 51, 53, 68, 73, 75], "offset": 13, "textiobas": 13, "bufferediobas": 13, "source_nod": 14, "destination_nod": 14, "target_keyword": 14, "source_keyword": 14, "node": [14, 66, 75], "target_nod": 14, "target": [14, 25], "keyword": [14, 24], "destin": 14, "col": [14, 35, 38, 39, 40, 45, 48, 49, 50, 52, 53, 62, 63, 70, 74, 76], "graph_keyword": 14, "o": [14, 47, 70], "csv_column_nam": 14, "fetch": [14, 32], "check_integ": 14, "At": 15, "least": [15, 75], "60": [15, 29, 36], "percent": 15, "tye": 17, "structured_mixin": 17, "special": 17, "func": [19, 25], "alter": [19, 21, 22, 23, 24, 25, 26, 27, 28, 30, 65, 75], "verbos": [19, 21, 22, 23, 24, 26, 27, 29, 30], "lib": [19, 69], "dp_log": 19, "get_logg": 19, "set_verbos": [19, 70, 71, 76, 79], "notset": 19, "debug": [19, 24, 29], "warn": [19, 21, 25, 27, 29, 31, 51, 52, 59, 78], "critic": 19, "modul": [19, 25, 27, 31, 59, 72], "get_child_logg": 19, "child": 19, "regexmodel": [20, 30], "charpreprocessor": [20, 28], "regexpostprocessor": [20, 28, 77], "unstructureddatalabel": [20, 27], "structureddatalabel": [20, 27], "basedatalabel": [20, 21, 27, 51, 52, 75], "load_from_librari": [20, 21, 27, 28, 65, 77], "unstructured_model": 20, "structured_model": 20, "regex_model": [20, 30, 77], "base_data_label": 21, "load_opt": [21, 27, 46], "parent": [21, 35], "associ": [21, 24, 27, 30], "reverse_label_map": [21, 22, 23, 24, 26, 27, 30, 38], "done": [21, 27], "pair": [21, 27, 52], "multi": [21, 22, 23, 24, 25, 26, 27, 29, 30], "ndarrai": [21, 22, 23, 24, 
25, 26, 27, 28, 30, 38, 50, 52], "error_on_mismatch": [21, 27], "upon": [21, 24, 26, 27, 28, 30], "mismatch": [21, 27, 76], "statu": [21, 22, 23, 24, 26, 27, 30], "data_processor": [21, 27], "skip_postprocessor": [21, 27], "togeth": [21, 27, 28, 52, 68, 71, 75, 76, 79], "without": [21, 24, 27, 29, 63], "zoo": [21, 27], "load_with_compon": [21, 27, 65], "dataarrai": [21, 22, 23, 24], "reset_weight": [21, 22, 23, 24, 26, 30, 70], "cross": 21, "refit": 21, "send": [21, 51, 70, 75], "autosubregistrationmeta": [22, 28], "clsname": [22, 28], "attr": [22, 28, 37, 51], "abcmeta": [22, 28], "regist": [22, 28, 29, 51, 75], "registr": 22, "mro": [22, 28], "resolut": [22, 28], "virtual": [22, 28, 69], "usag": [22, 28, 65], "store": [22, 24, 50, 52, 76, 78, 79], "appropri": [22, 23, 24, 26, 30, 38, 39, 49, 51, 70, 76, 79], "revers": [22, 23, 24, 26, 30, 38], "extract": [22, 23, 24, 26, 30], "max": [22, 23, 24, 26, 30, 39, 51, 52, 53, 66, 68, 73, 75, 76], "get_class": [22, 23, 24, 26, 28, 30], "class_nam": [22, 23, 24, 26, 28, 30, 46], "get_paramet": [22, 23, 24, 26, 28, 30], "param_list": [22, 23, 24, 26, 28, 30], "set_label_map": [22, 23, 24, 26, 30], "whole": [22, 23, 24, 26, 30], "directori": [22, 23, 24, 26, 30, 54, 70, 74, 75], "basetrainablemodel": [22, 23, 24], "train_data": [22, 23, 24], "val_data": [22, 23, 24], "histori": [22, 23, 24], "f1_report": [22, 23, 24, 29], "char_load_tf_model": 23, "charloadtfmodel": 23, "model_path": 23, "loadabl": 23, "num_fil": [23, 24], "relev": [23, 24, 75], "filenam": [24, 70, 76, 79], "emb": 24, "n_dim": 24, "source_fil": 24, "princip": 24, "dim": 24, "factor": 24, "down": 24, "threshargmaxlay": 24, "appli": [24, 26, 28, 30, 50, 54, 75, 76, 79], "minimum": [24, 38, 50, 51, 75], "entiti": [24, 28, 29, 30, 54, 68, 69, 70, 73, 75, 76, 79], "so": [24, 51, 53, 67], "tensor": [24, 29], "get_config": [24, 29], "config": [24, 29, 35, 36, 37, 38, 39, 40, 45, 46, 48, 49, 50, 51, 52, 53, 63], "confidence_lay": 24, "add_loss": 24, "insid": [24, 29, 68, 75, 76, 77, 78, 79], "scalar": 24, "python": [24, 40, 45, 47, 48, 53, 68, 69, 75], "mylay": 24, "op": 24, "sum": [24, 29, 38, 51, 53, 68, 75], "add_metr": 24, "add_vari": [24, 29], "autocast": 24, "regular": 24, "constraint": [24, 75], "alia": 24, "add_weight": [24, 29], "aggreg": [24, 28, 29, 77], "mean": [24, 25, 29, 40, 45, 48, 53, 66, 67, 68, 69, 70, 73, 75, 76, 79], "entri": [24, 25, 68, 75], "unspecifi": [24, 75], "popul": [24, 33, 35, 36, 37, 38, 39, 40, 45, 48, 49, 50, 51, 52, 53, 78], "random_norm": 24, "glorot_uniform": 24, "float32": 24, "backprop": 24, "them": [24, 50, 66, 67, 68, 70, 73, 74, 75, 76, 79], "penalti": 24, "contrainst": 24, "after": [24, 65, 66, 73, 77], "only_first_replica": 24, "annot": [24, 52], "replica": 24, "write": [24, 29, 47], "custom": [24, 29, 46, 47, 68, 73, 75], "parallel": 24, "loop": [24, 27], "build_from_config": 24, "state": [24, 28, 29, 51, 78], "suppli": 24, "compute_dtyp": 24, "comput": [24, 25, 29, 73], "compute_mask": 24, "previous_mask": 24, "compute_output_shap": 24, "compute_output_spec": 24, "count_param": 24, "count": [24, 25, 36, 40, 45, 48, 51, 52, 53, 54, 66, 75, 79], "compos": 24, "variable_dtyp": 24, "dtype_polici": 24, "from_config": [24, 29], "instanti": [24, 50, 63, 71], "doe": [24, 25, 26, 30, 46, 52, 66, 68], "nor": 24, "set_weight": 24, "get_build_config": 24, "lookup": 24, "re": [24, 50, 51, 70], "unusu": 24, "wai": [24, 36, 70, 75], "attempt": [24, 47, 75], "get_weight": 24, "symbol": 24, "oper": [24, 68, 75, 76, 79], "correspond": [24, 25, 28, 52, 73], 
"input_dtyp": 24, "input_spec": 24, "load_own_vari": 24, "full": [24, 51, 68, 69, 73, 75, 76, 79], "control": 24, "load_model": 24, "sublay": 24, "metrics_vari": 24, "non_trainable_vari": 24, "extend": [24, 70], "non_trainable_weight": 24, "seedgener": 24, "unlik": [24, 36], "exclud": [24, 25, 35, 36, 37, 38, 39, 45, 49, 54], "quantiz": 24, "quantized_cal": 24, "save_own_vari": 24, "stateless_cal": 24, "trainable_vari": 24, "return_loss": 24, "side": 24, "effect": 24, "part": [24, 40, 45, 48, 53, 68], "batchnorm": 24, "attach": 24, "element": [24, 29, 40, 47, 75], "until": [24, 28, 75], "do": [24, 31, 51, 59, 67, 74], "still": [24, 68, 77], "ref_var": 24, "zip": 24, "assign": [24, 52], "supports_mask": 24, "mask": 24, "symbolic_cal": 24, "settabl": 24, "equival": 24, "trainable_weight": 24, "visit": [24, 68], "encodinglay": 24, "max_len": 24, "input_str_tensor": 24, "classification_report_util": 25, "convert_confusion_matrix_to_mcm": 25, "conf_matrix": 25, "confus": [25, 29], "mcm": 25, "precis": [25, 29, 40, 51, 52, 68, 70, 75, 76], "recal": [25, 29, 70], "fscore": 25, "sklearn": [25, 29], "multilabel": 25, "neg": [25, 40, 45, 47, 48, 52, 53, 66, 75], "mcm_": 25, "ideolog": 25, "squar": [25, 52, 75], "precision_recall_fscore_support": 25, "beta": [25, 29], "pos_label": 25, "warn_for": 25, "f": [25, 29, 52, 59], "sample_weight": [25, 29], "recision_recall_fscore_support": 25, "copi": [25, 29, 44, 51, 52], "receiv": [25, 77], "n_output": 25, "referenc": [25, 75], "2x2": 25, "strength": 25, "versu": 25, "binari": 25, "multiclass": 25, "major": [25, 70], "macro": [25, 29], "y_true": [25, 29], "y_pred": [25, 29], "sort": [25, 47, 68, 75], "applic": [25, 29, 51, 68, 73, 76, 79], "y_": 25, "global": [25, 66, 68, 75], "unweight": 25, "imbal": 25, "intern": [25, 27, 52], "made": 25, "n_sampl": 25, "n_unique_label": 25, "fbeta_scor": [25, 29], "wikipedia": [25, 52], "discrimin": 25, "advanc": 25, "knowledg": 25, "discoveri": 25, "mine": 25, "2004": 25, "pp": [25, 36, 66], "22": 25, "30": 25, "shantanu": 25, "godbol": 25, "sunita": 25, "sarawagi": 25, "undefin": 25, "undefinedmetricwarn": 25, "classification_report": [25, 29], "target_nam": 25, "output_dict": 25, "guid": 25, "n_label": 25, "round": [25, 50, 75, 76, 79], "67": [25, 29], "subset": [25, 51, 70, 75], "accuraci": [25, 29, 70], "known": [25, 74], "sensit": [25, 51, 68, 69, 73, 75, 77, 78], "confusion_matrix": 25, "multilabel_confusion_matrix": 25, "max_num_char": [26, 30], "cdist": 26, "impact": [26, 30, 37, 50], "fix": [26, 30], "dictat": 27, "labeler_class": 27, "autosubregistr": 28, "basedataprocessor": 28, "processor_typ": 28, "preprocess": 28, "postprocess": 28, "flatten_split": 28, "flatten_separ": 28, "is_separate_at_max_len": 28, "could": [28, 38, 66, 76, 78, 79], "approxim": [28, 75], "flatten": [28, 43, 50, 75, 76, 79], "becom": [28, 52], "leftov": 28, "subsequ": [28, 75], "put": 28, "nearest": 28, "batch_data": 28, "charencodedpreprocessor": 28, "encoding_map": 28, "5000": [28, 50, 51], "charpostprocessor": 28, "character_argmax": 28, "t": [28, 31, 47, 52, 59, 68, 69, 75, 76, 79], "word_level_min_perc": 28, "word": [28, 51, 68, 70, 75, 77, 79], "vs": [28, 65], "domin": 28, "word_level": [28, 68, 75, 79], "convert_to_ner_format": 28, "match_sentence_length": 28, "inplac": [28, 68, 75], "rag": 28, "modifi": [28, 29, 44, 73], "place": [28, 47, 50, 74, 75, 76, 77, 79], "x01": 28, "struct": 28, "convert_to_unstructured_format": 28, "npt": 28, "str_": 28, "num_sampl": 28, "is_pred_label": 28, "random_st": [28, 70], "randomli": [28, 
75, 77], "opportun": 28, "convert_to_structured_analysi": 28, "analysi": [28, 40, 45, 49, 53, 54, 68, 75, 78], "assum": [28, 37, 52, 71, 73], "tie": 28, "chose": 28, "anyth": 28, "remain": 28, "said": [28, 76, 77, 79], "aggregation_func": [28, 77], "priority_ord": 28, "prioriti": 28, "priority_predict": 28, "entity_priority_ord": 28, "lowest": 28, "higher": 28, "split_predict": 28, "across": [28, 63, 75], "vote": [28, 38, 77], "structregexpostprocessor": [28, 77], "f1_report_dict_to_str": 29, "avg": 29, "57": 29, "33": 29, "40": 29, "taken": 29, "printout": [29, 74], "evaluate_accuraci": 29, "predicted_entities_in_index": 29, "true_entities_in_index": 29, "entity_rev_dict": 29, "omitted_label": 29, "confusion_matrix_fil": 29, "compar": [29, 47, 49, 66, 74, 75], "omit": 29, "dir": 29, "get_tf_layer_index_from_nam": 29, "layer_nam": 29, "hide_tf_logger_warn": 29, "protected_register_keras_serializ": 29, "callabl": [29, 31, 48, 52, 59], "protect": 29, "against": 29, "fbetascor": 29, "slightli": 29, "addon": 29, "blob": [29, 44], "v0": 29, "12": [29, 68], "tensorflow_addon": 29, "f_score": 29, "py": [29, 69], "l211": 29, "l283": 29, "copyright": 29, "2019": 29, "right": 29, "licens": [29, 44], "under": 29, "apach": 29, "complianc": 29, "obtain": [29, 73], "www": [29, 52], "org": [29, 52, 66], "unless": [29, 74], "law": 29, "agre": 29, "softwar": 29, "distribut": [29, 38, 66, 68, 69, 71, 73, 75], "AS": 29, "IS": [29, 77], "basi": [29, 47], "warranti": 29, "condit": [29, 51, 75], "OF": 29, "express": 29, "impli": 29, "languag": 29, "govern": 29, "permiss": 29, "limit": 29, "harmon": 29, "f_": 29, "textrm": 29, "cdot": 29, "rest": [29, 66, 77], "update_st": 29, "reset_st": 29, "stateless_reset_st": 29, "stateless_result": 29, "metric_vari": 29, "stateless_update_st": 29, "f1_score": 29, "f_1": 29, "configur": [29, 51, 61, 63, 75], "regex_pattern": [30, 77], "label_1": 30, "label_1_pattern_1": 30, "label_1_pattern_2": 30, "label_2": 30, "label_2_pattern_1": 30, "label_2_pattern_2": 30, "encapsul": [30, 77], "b": [30, 52, 67], "pattern_dict": 30, "instal": [31, 59, 65, 67, 68], "depend": [31, 50, 52, 59, 66, 69, 70], "warn_missing_modul": [31, 59], "labeler_funct": 31, "module_nam": [31, 59], "doesn": [31, 59, 76, 79], "miss": [31, 48, 52, 59, 76], "require_modul": [31, 59], "load_plugin": 32, "dig": 32, "consequ": 32, "plugins_dict": [32, 33], "get_plugin": 32, "typ": [32, 33], "certain": 32, "broader": [32, 33], "plugin_decor": 33, "base_column_profil": 35, "basecolumnprofil": [35, 36, 38, 39, 46, 47, 48, 49, 53], "baseopt": [35, 46, 50, 51], "basecolumnprofilert": [35, 48], "col_typ": [35, 36, 38, 39, 40, 45, 48, 49, 53], "diff": [35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 53, 54, 55, 66, 75, 76, 79], "other_profil": [35, 36, 38, 39, 40, 41, 45, 48, 49, 50, 52, 53, 54, 55], "df_seri": [35, 36, 37, 38, 39, 40, 45, 48, 49, 50, 52, 53, 54], "remove_disabled_flag": [35, 36, 37, 38, 39, 40, 41, 45, 48, 49, 50, 53, 54, 55], "load_from_dict": [35, 36, 37, 38, 39, 40, 45, 48, 49, 50, 51, 53], "pars": [35, 36, 37, 38, 39, 40, 45, 48, 49, 50, 51, 53], "basecolumnprimitivetypeprofil": [35, 39, 40, 45, 53], "basecolumnprimitivetypeprofilert": 35, "prim": 35, "sample_s": [35, 36, 38, 39, 40, 45, 48, 49, 50, 53, 68, 73, 75, 76], "thread_saf": [35, 36, 38, 39, 40, 45, 48, 49, 53], "categorical_column_profil": 36, "categoricalcolumn": [36, 37, 49], "categoricalopt": [36, 51], "gini_impur": [36, 68, 75], "gini": 36, "impur": 36, "likelihood": [36, 66, 75], "\u03c3": 36, "p": [36, 69, 75], "travers": 36, 
"probabl": [36, 38, 50, 51, 75, 76, 77], "unalik": [36, 68, 75], "unik": 36, "often": [36, 75], "observ": [36, 66], "anoth": [36, 66, 68, 70, 75, 76, 78, 79], "perri": 36, "kader": 36, "variat": 36, "vol": 36, "27": 36, "No": [36, 63, 75], "2005": [36, 70], "u": 36, "cij": 36, "privat": [36, 38, 39, 49], "categorical_count": [36, 68, 75], "top": [36, 38, 51, 75, 78], "k": [36, 52], "most": [36, 47, 76, 79], "frequent": [36, 75], "descend": 36, "unique_ratio": [36, 68, 75, 76], "unique_count": [36, 51, 68, 75, 76], "core": [36, 37, 38, 39, 40, 45, 48, 49, 50, 53, 55], "column_profile_compil": 37, "basecompil": [37, 46, 50], "structuredopt": [37, 50, 51], "pool": [37, 50, 52, 75], "basecompilert": 37, "update_profil": [37, 50, 68, 73, 75, 76, 79], "multiprocess": [37, 50, 52, 75], "columnprimitivetypeprofilecompil": 37, "selected_data_typ": 37, "primit": [37, 75], "columnstatsprofilecompil": 37, "ordercolumn": [37, 49], "columndatalabelercompil": 37, "datalabelercolumn": [37, 38], "unstructuredcompil": 37, "textprofil": [37, 55], "unstructuredlabelerprofil": [37, 54], "potenti": [37, 40, 52, 53, 55], "pop": [37, 40, 41, 48, 52, 53, 55], "val": [37, 40, 45, 48, 51, 53, 55], "data_labeler_column_profil": 38, "datalabeleropt": [38, 51, 52, 54], "sublass": 38, "assert_equal_condit": 38, "data_labeler2": 38, "possible_data_label": 38, "rank_distribut": 38, "rank": 38, "sum_predict": 38, "seen": 38, "differenti": [38, 66], "noth": 38, "sai": [38, 68, 69], "avg_predict": [38, 68, 75, 76], "label_represent": [38, 75], "top_k": 38, "simpli": [38, 47, 74], "datetime_column_profil": 39, "datetimecolumn": 39, "datetimeopt": [39, 51], "data_type_ratio": [39, 40, 45, 53], "min": [39, 50, 51, 53, 68, 73, 75, 76], "match_count": [39, 40, 45, 53], "float_column_profil": 40, "floatcolumn": 40, "floatopt": [40, 51], "numericstatsmixin": [40, 45, 48, 53], "signific": [40, 75], "figur": [40, 67, 75, 76], "is_float": [40, 45, 48, 53], "is_int": [40, 45, 48, 53], "kurtosi": [40, 45, 48, 51, 52, 53, 66, 68, 75], "float64": [40, 45, 48, 52, 53], "median": [40, 45, 48, 51, 53, 68, 73, 75], "estim": [40, 45, 48, 51, 52, 53, 75], "median_abs_devi": [40, 45, 48, 53, 73, 75], "absolut": [40, 45, 48, 53, 75], "deviat": [40, 45, 48, 53, 75], "subtract": [40, 45, 48, 52, 53, 75], "bin": [40, 45, 48, 51, 53, 66, 69, 73, 75], "fold": [40, 45, 48, 53], "around": [40, 45, 48, 53, 78], "impos": [40, 45, 48, 53], "superimpos": [40, 45, 48, 53], "interpol": [40, 45, 48, 53], "np_type_to_typ": [40, 45, 48, 53], "skew": [40, 45, 48, 51, 52, 53, 66, 68, 75], "stddev": [40, 45, 48, 53, 68, 75, 76], "varianc": [40, 45, 48, 51, 53, 66, 68, 73, 75, 76], "graph_profil": 41, "nx": 41, "graphopt": 41, "__calcul": [41, 48], "calculate_quantil": [42, 43], "num_quantile_group": [42, 43, 75], "quantil": [42, 43, 51, 53, 68, 75, 76], "report_help": 43, "flat_dict": 43, "od": 43, "_": 43, "collaps": 43, "seper": [43, 76, 79], "success": 43, "concaten": 43, "unnest": 43, "relat": [44, 73, 76], "tree": 44, "int_column_profil": 45, "intcolumn": 45, "intopt": [45, 51], "numericstatsmixint": [45, 48], "sever": [45, 48, 73, 75], "json_decod": 46, "get_column_profiler_class": 46, "get_compiler_class": 46, "col_pro_compil": 46, "get_option_class": 46, "get_profiler_class": 46, "baseprofil": [46, 50, 52], "get_structured_col_profiler_class": 46, "structuredcolprofil": [46, 50, 67], "load_column_profil": 46, "serialized_json": 46, "serial": [46, 47], "json_encod": [46, 47], "attr1": 46, "attr2": 46, "deseri": 46, "load_compil": 46, "load_profil": 46, 
"load_structured_col_profil": 46, "profilerencod": 47, "profileencod": 47, "skipkei": 47, "ensure_ascii": 47, "check_circular": 47, "allow_nan": 47, "sort_kei": 47, "indent": [47, 68, 73, 75, 76, 79], "jsonencod": 47, "jsonifi": 47, "sensibl": 47, "typeerror": 47, "guarante": 47, "incom": 47, "ascii": [47, 70], "escap": 47, "circular": 47, "prevent": 47, "infinit": 47, "recurs": [47, 52], "caus": [47, 51], "recursionerror": 47, "infin": 47, "behavior": 47, "compliant": 47, "consist": 47, "javascript": 47, "regress": 47, "dai": [47, 52], "member": 47, "newlin": 47, "compact": [47, 50, 66, 68, 73, 75, 76, 77, 79], "item_separ": 47, "key_separ": 47, "elimin": 47, "whitespac": [47, 77], "to_seri": 47, "notimplementederror": [47, 50], "datatyp": 47, "serializbl": 47, "foo": 47, "bar": 47, "baz": 47, "iterencod": 47, "_one_shot": 47, "yield": 47, "bigobject": 47, "mysocket": 47, "along": [48, 50, 63, 68, 75, 78], "respect": [48, 50, 52, 63, 67, 70, 75, 77], "numerical_column_stat": 48, "abstractstaticmethod": 48, "staticmethod": 48, "numericalopt": [48, 51], "tell": [48, 77], "unclean": 48, "null": [48, 50, 51, 68, 73, 74, 75, 76, 79], "order_column_profil": 49, "protocol": [49, 52], "orderopt": [49, 51], "profile_build": [50, 52, 67], "min_sample_s": 50, "sampling_ratio": [50, 51, 75], "min_true_sampl": [50, 75], "sample_id": 50, "column_index": 50, "update_column_profil": 50, "clean_sampled_df": 50, "structuredcol": 50, "ordereddict": 50, "clean_data_and_get_base_stat": 50, "null_valu": [50, 51], "regexflag": [50, 51], "samples_per_upd": [50, 75], "report_opt": [50, 68, 73, 75, 76, 79], "fed": 50, "flat": [50, 75, 76, 79], "four": [50, 75, 76, 79], "decim": [50, 75, 76, 79], "shorten": [50, 73, 75, 76, 79], "runtim": [50, 75, 76], "prettifi": [50, 68, 75, 76, 79], "baseprofilert": 50, "save_method": [50, 75], "pickl": [50, 75, 76, 79], "load_method": [50, 75], "unstructuredprofil": 50, "unstuctur": 50, "profiler_typ": [50, 68, 73, 75, 76, 79], "profiler_opt": [51, 74], "baseoptiont": 51, "booleanopt": 51, "improp": 51, "raise_error": 51, "conflict": 51, "booleanoptiont": 51, "enabl": [51, 68, 73, 75, 77, 78], "histogramandquantilesopt": 51, "bin_count_or_method": [51, 73, 75], "num_quantil": [51, 75], "modeopt": 51, "max_k_mod": 51, "baseinspectoropt": 51, "baseinspectoroptionst": 51, "is_prop_en": 51, "prop": [51, 66], "numericaloptionst": 51, "numerican": 51, "histogram_and_quantil": [51, 73, 75], "ivar": 51, "bias_correct": [51, 53, 75], "bia": [51, 75], "vartyp": 51, "num_zero": [51, 53, 68, 75], "num_neg": [51, 53, 68, 75], "is_numeric_stats_en": [51, 75], "turn": [51, 70, 73, 75, 76, 77, 79], "precisionopt": 51, "sample_ratio": [51, 75], "percis": 51, "textopt": [51, 53], "vocab": [51, 68, 75, 76, 79], "although": [51, 75], "seem": 51, "redund": 51, "setter": 51, "top_k_categori": [51, 75], "max_sample_size_to_check_stop_condit": [51, 75], "stop_condition_unique_value_ratio": [51, 75], "cm": [51, 75], "cms_confid": [51, 75], "cms_relative_error": [51, 75], "01": [51, 52, 75], "cms_max_num_heavy_hitt": [51, 75], "stop": [51, 75, 79], "highest": [51, 75], "sketch": [51, 75], "eg": 51, "frequenc": [51, 75], "correlationopt": 51, "correl": [51, 68, 75, 78], "hyperloglogopt": [51, 75], "register_count": [51, 75], "altern": [51, 75, 76, 79], "gather": [51, 73], "hyperloglog": 51, "hll": [51, 75], "uniquecountopt": [51, 75], "hashing_method": [51, 75], "rowstatisticsopt": 51, "null_count": [51, 68, 75, 76, 78], "data_labeler_dirpath": [51, 54, 70, 75], "max_sample_s": [51, 75, 76], "decid": 
[51, 75], "textprofileropt": [51, 55], "is_case_sensit": [51, 75, 79], "stop_word": [51, 75, 79], "top_k_char": [51, 75], "top_k_word": [51, 75], "column_null_valu": 51, "chi2_homogen": [51, 70, 75], "row_statist": [51, 75], "null_replication_metr": [51, 68, 75], "replic": [51, 75], "enabled_profil": 51, "unstructuredopt": 51, "preset": [51, 75], "unstructured_opt": [51, 75, 79], "complet": [51, 75], "numeric_stats_dis": [51, 75], "lower_memory_sketch": [51, 75], "overwrit": [51, 70], "profiler_util": [52, 71], "recursive_dict_upd": 52, "update_d": 52, "keydict": 52, "defaultdict": 52, "sample_in_chunk": 52, "drmaciv": 52, "2018": 52, "lazi": 52, "fisher": 52, "yate": 52, "shuffl": 52, "reject": 52, "clear": 52, "shallow": 52, "default_factori": 52, "__missing__": 52, "fromkei": 52, "v": [52, 69], "keyerror": 52, "popitem": 52, "lifo": 52, "lack": 52, "shuffle_in_chunk": 52, "data_length": 52, "chunk_siz": 52, "cost": [52, 70], "warn_on_profil": 52, "col_profil": 52, "messag": [52, 70], "partit": [52, 75], "auto_multiprocess_toggl": 52, "num_rows_threshold": 52, "750000": 52, "num_cols_threshold": 52, "autom": 52, "toggl": [52, 68, 73, 75], "recommend": 52, "suggest_pool_s": 52, "data_s": 52, "suggest": [52, 68, 77], "suggested_pool_s": 52, "generate_pool": 52, "max_pool_s": 52, "alloc": 52, "multiproess": 52, "cpu_count": 52, "cpu": 52, "bound": 52, "overlap": 52, "x1": 52, "x2": [52, 69], "y1": 52, "y2": [52, 69], "iff": 52, "add_nested_dictionari": 52, "first_dict": 52, "second_dict": 52, "merg": [52, 65], "biased_skew": 52, "bias": 52, "definit": 52, "formal": 52, "g_1": 52, "en": 52, "wiki": 52, "sample_skew": 52, "biased_kurt": 52, "g_2": 52, "a_natural_but_biased_estim": 52, "find_diff_of_numb": 52, "stat1": 52, "int64": [52, 53], "stat2": 52, "unchang": [52, 75], "find_diff_of_strings_and_bool": 52, "find_diff_of_lists_and_set": 52, "duplic": [52, 73, 76, 79], "share": [52, 75], "find_diff_of_d": 52, "timedelta": 52, "due": [52, 76, 77], "sign": 52, "find_diff_of_dict": 52, "dict1": 52, "dict2": 52, "find_diff_of_matric": 52, "matrix1": 52, "matrix2": 52, "matric": 52, "find_diff_of_dicts_with_diff_kei": 52, "get_memory_s": 52, "method_timeit": 52, "measur": [52, 75], "perform_chi_squared_test_for_homogen": 52, "categories1": 52, "sample_size1": 52, "categories2": 52, "sample_size2": 52, "chi": [52, 75], "homogen": [52, 75], "lst": 52, "thing": 52, "produc": 52, "top_profil": 52, "merge_profile_list": [52, 71], "list_of_profil": [52, 71], "pool_count": 52, "reload_labeler_from_options_or_get_new": 52, "data_labeler_load_attr": 52, "load_attr": 52, "text_column_profil": 53, "textcolumn": 53, "max_histogram_bin": 53, "min_histogram_bin": 53, "histogram_bin_method_nam": 53, "histogram_select": 53, "user_set_histogram_bin": 53, "histogram_method": 53, "unstructured_labeler_profil": 54, "percentag": [54, 75], "label_encod": 54, "unstructured_text_profil": 55, "graph_func": 59, "dataprofiler_se": [60, 69], "rng_util": 60, "get_random_number_gener": 60, "base_valid": 63, "is_in_rang": 63, "is_in_list": 63, "dd": 63, "partial": 63, "reinstanti": 63, "dask": 63, "wise": 63, "fashion": 63, "look": [65, 66, 70, 71, 76, 77, 79], "overview": 65, "conclus": 65, "automat": [65, 66, 68, 75, 78], "deeper": 65, "dive": [65, 70], "intro": [65, 75], "scratch": 65, "transfer": 65, "learn": [65, 68], "ground": 65, "lstm": 65, "integr": [65, 78], "rule": 65, "adjust": [65, 75], "futur": [65, 78], "columnnam": 65, "setup": [65, 69], "dataload": 65, "popmon": 65, "comparison": [65, 78], "similarli": [66, 
76, 79], "data_path": [66, 73, 76, 79], "now": [66, 70, 71, 77], "our": [66, 67], "prettyprint": [66, 68], "sort_dict": [66, 68], "skeleton": 66, "num_nod": [66, 68, 75], "num_edg": [66, 68, 75], "categorical_attribut": [66, 68, 75], "continuous_attribut": [66, 68, 75], "avg_node_degre": [66, 68, 75], "global_max_component_s": [66, 68, 75], "continuous_distribut": [66, 68, 75], "categorical_distribut": [66, 68, 75], "degre": [66, 75], "largest": 66, "scale": [66, 68, 75], "bin_count": [66, 68, 75, 76], "bin_edg": [66, 68, 75, 76], "6": [66, 68, 76], "norm": 66, "uniform": 66, "expon": 66, "logist": 66, "gamma": 66, "lognorm": 66, "scipi": 66, "pkl": [66, 75, 76, 79], "new_profil": 66, "new_report": 66, "previou": [66, 75, 77], "might": [66, 70], "training_profil": 66, "testing_data": [66, 76, 79], "testing_profil": 66, "test_train_diff": 66, "seaborn": 67, "pip": [67, 68, 69], "With": [67, 75, 76, 79], "plot_histogram": [67, 76], "column_ind": 67, "plot_missing_values_matrix": [67, 76], "ax": [67, 76], "titl": [67, 75, 76], "matplotlib": [67, 73, 76], "plot_col_histogram": 67, "bargraph": 67, "plot_col_missing_valu": 67, "c": 67, "fig": [67, 76], "columns_nam": 67, "low": 67, "get_figur": 67, "2021": 67, "2020": 67, "barchart": 67, "monitor": 68, "schema": [68, 75, 76, 79], "downstream": 68, "come": [68, 73], "deep": 68, "effici": 68, "pii": 68, "few": [68, 70], "readable_report": [68, 75], "dump": [68, 73, 75, 76, 79], "pypi": [68, 69], "ml": [68, 69, 70], "strict": [68, 69], "don": [68, 69], "slimmer": [68, 69], "labler": [68, 69], "bug": 68, "issu": [68, 78], "api": [68, 78], "terminolog": 68, "underli": 68, "global_stat": [68, 73, 75, 76, 77, 79], "samples_us": [68, 75, 76, 79], "column_count": [68, 73, 75, 76], "row_count": [68, 73, 75, 76], "row_has_null_ratio": [68, 75, 76], "row_is_null_ratio": [68, 73, 75, 76], "unique_row_ratio": [68, 75, 76], "duplicate_row_count": [68, 73, 75, 76], "file_typ": [68, 73, 75, 76, 79], "correlation_matrix": [68, 75], "chi2_matrix": [68, 75], "profile_schema": [68, 75], "null_typ": [68, 75, 76], "null_types_index": [68, 75, 76], "data_type_represent": [68, 75, 76], "median_absolute_devi": [68, 75], "data_label_represent": [68, 76], "var": [68, 76], "std": [68, 76], "margin_of_error": [68, 76], "confidence_level": [68, 76], "class_prior": [68, 75], "class_sum": [68, 75], "class_mean": [68, 75], "empty_line_count": [68, 75, 79], "memory_s": [68, 75], "mb": [68, 75], "entity_count": [68, 75, 79], "true_char_level": [68, 75, 79], "postprocess_char_level": [68, 75, 79], "entity_percentag": [68, 75], "vocab_count": [68, 75], "word_count": [68, 75, 79], "attribute_1": 68, "attribute_2": 68, "attributt": [68, 75], "sort_valu": [68, 75], "rare": [68, 75], "equip": [68, 75], "new_data": [68, 75, 76, 79], "data1": [68, 75, 76, 79], "file_a": [68, 75], "profile1": [68, 73, 75, 76, 79], "data2": [68, 75, 76, 79], "file_b": [68, 75], "profile2": [68, 73, 75, 76, 79], "profile3": [68, 75, 76, 79], "my_datafram": [68, 73, 75, 76], "my_text": 68, "my_graph": 68, "printer": 68, "000": [68, 75, 76], "10000": [68, 75], "maco": 69, "intel": 69, "chip": 69, "homebrew": 69, "brew": 69, "cppflag": 69, "usr": 69, "local": 69, "l": 69, "appl": 69, "opt": 69, "linux": 69, "sudo": 69, "apt": 69, "libsnappi": 69, "dev": 69, "python3": 69, "virtualenv": 69, "env": 69, "venv3": 69, "pip3": [69, 74], "repo": [69, 78], "sdist": 69, "bdist": 69, "bdist_wheel": 69, "dist": 69, "py3": 69, "whl": 69, "doubl": 69, "z2": 69, "resolv": 69, "older": 69, "delet": 69, "rerun": 69, "git": 69, 
"egg": 69, "unittest": 69, "discov": 69, "test_profile_build": 69, "pytest": 69, "test_csv_data": 69, "testprofil": 69, "quick": [70, 76], "switch": [70, 74], "suit": 70, "human": [70, 75], "loggin": [70, 71, 76, 79], "v1": [70, 71, 76, 79], "ll": [70, 74], "depart": 70, "educ": 70, "schooldatasmal": [70, 79], "go": 70, "demo": [70, 74], "keep": 70, "get_unstructured_result": 70, "piec": 70, "labeled_data": 70, "label_df": 70, "set_opt": 70, "width": 70, "besid": [70, 73], "spam": 70, "email": 70, "enron": 70, "investig": 70, "11111111": 70, "1111111111111": 70, "javamail": 70, "evan": 70, "thyme": 70, "fri": 70, "aug": 70, "37": 70, "0700": 70, "pdt": 70, "smith": 70, "compani": 70, "john": 70, "mime": 70, "charset": 70, "7bit": 70, "mari": 70, "ou": 70, "na": 70, "cn": 70, "recipi": 70, "ssmith": 70, "jsmith": 70, "cc": 70, "bcc": 70, "privileg": 70, "pst": 70, "ever": 70, "saw": 70, "mail": 70, "offic": 70, "fridai": 70, "august": 70, "pm": [70, 74], "heard": 70, "regard": 70, "sale": 70, "guess": 70, "big": 70, "deal": 70, "think": [70, 78], "123": [70, 77], "456": 70, "7890": 70, "feed": 70, "standard": [70, 74, 75, 78], "phone": 70, "breviti": 70, "opeid6": 70, "insturl": 70, "search_str": 70, "improv": [70, 74], "small": [70, 75], "prep": 70, "ones": [70, 75], "16": [70, 76], "3f": 70, "0f": 70, "mind": 70, "real": 70, "involv": 70, "better": 70, "easili": [70, 73, 75, 77], "simpl": [70, 76, 79], "my_label": 70, "earlier": 70, "compris": [70, 75], "three": [70, 75], "compen": 70, "interchang": 70, "test_label": 70, "baseprocessor": 70, "document": [70, 76, 79], "scan": 70, "dummi": 71, "col1": 71, "col2": 71, "single_profil": 71, "And": 71, "introductori": 73, "jupyt": 73, "plethora": 73, "larg": 73, "cover": [73, 74, 76, 79], "pyplot": [73, 76], "plt": [73, 76], "former": 73, "overal": [73, 76, 79], "latter": 73, "successfulli": 73, "regardless": 73, "collect": [73, 76, 79], "col_int": [73, 76], "col_float": [73, 76], "infer": [73, 75, 76], "explicitli": [73, 76, 79], "sample1": 73, "sample2": 73, "sample3": 73, "text_sampl": 73, "featur": [73, 75], "rice": [73, 75], "One": [73, 75, 76, 79], "verifi": 73, "devid": 73, "halv": 73, "df1": 73, "iloc": 73, "df2": 73, "half": 73, "profile_ful": 73, "report_ful": 73, "broken": 73, "aforement": 73, "machin": [73, 76, 79], "profile_merg": 73, "report_merg": 73, "walk": 73, "dynam": 74, "pm_stability_report": 74, "libarari": 74, "noqa": 74, "ahead": 74, "popmon_dataload": 74, "time_index": 74, "pm_data": 74, "read_csv": 74, "parse_d": 74, "super": 74, "know": 74, "intend": 74, "standardli": 74, "dp_dataload": 74, "datalaod": 74, "dp_data": 74, "even": 74, "actual": 74, "datatim": 74, "ind": 74, "enumer": 74, "to_datetim": 74, "gzip": 74, "shutil": 74, "popmon_tutorial_data": 74, "flight_delai": 74, "gz": 74, "rb": 74, "f_in": 74, "wb": 74, "f_out": 74, "copyfileobj": 74, "report_output_dir": 74, "popmon_output": 74, "flight_delays_ful": 74, "report_pm_load": 74, "time_axi": 74, "time_width": 74, "1w": 74, "time_offset": 74, "2015": 74, "02": 74, "extended_report": 74, "pull_rul": 74, "_pull": 74, "to_fil": 74, "popmon_loader_report": 74, "loader": 74, "dp_datafram": 74, "report_dp_load": 74, "dataprofiler_loader_report": 74, "approach": 74, "mini": 74, "flight_delays_mini": 74, "math": 74, "block": 74, "seemlessli": 74, "exact": [74, 75], "vast": 75, "amount": 75, "page": [75, 78], "diff_report": 75, "differenc": 75, "valuabl": 75, "chi2": 75, "psi": 75, "popoul": 75, "stabil": 75, "understand": 75, "mirror": 75, "vari": 75, "conserv": 75, 
"deg_of_fre": 75, "welch": 75, "my_profil": [75, 76, 79], "loaded_pkl_profil": 75, "loaded_json_profil": 75, "explicit": 75, "normal_csv_fil": 75, "structured_profil": 75, "structured_report": 75, "normal_text_fil": 75, "unstructured_profil": 75, "unstructured_report": 75, "sample_arrai": 75, "contrast": 75, "met": 75, "grab": 75, "satisfi": 75, "reproduc": 75, "proport": 75, "distinct": 75, "ex": 75, "coeffici": 75, "question": [75, 78], "durat": 75, "took": 75, "millisecond": 75, "indici": 75, "percentil": 75, "chosen": [75, 79], "accord": 75, "denot": 75, "appear": 75, "attribute_n": 75, "th": 75, "ie": 75, "breakdown": 75, "750": 75, "system": 75, "much": 75, "top_k_mod": 75, "prefer": 75, "left": 75, "fd": 75, "doan": 75, "scott": 75, "sturg": 75, "sqrt": 75, "heavi": 75, "hitter": 75, "ascend": 75, "batch_1": 75, "batch_2": 75, "coupl": 75, "wide": [76, 79], "primari": [76, 79], "analyz": [76, 79], "high": 76, "accomplish": [76, 79], "simialrli": 76, "accur": 76, "increas": 76, "beyond": [76, 79], "aspect": [76, 79], "peek": [76, 79], "particularli": [76, 79], "reason": [76, 79], "Such": [76, 79], "pseudo": [76, 79], "profiler_train": [76, 79], "training_data": [76, 79], "profiler_test": [76, 79], "validation_report": [76, 79], "data_split_differ": [76, 79], "ve": 76, "visual": 76, "plot": 76, "figsiz": 76, "dpi": 76, "gca": 76, "set_size_inch": 76, "set_dpi": 76, "Not": [76, 79], "manipul": [76, 79], "loaded_profil": [76, 79], "synthet": [76, 79], "data_object": [76, 79], "utilizng": 77, "fake": 77, "st": 77, "ignore_cas": 77, "statement": 77, "captur": 77, "rather": 77, "upper": 77, "lower": 77, "letter": 77, "lowercase_char": 77, "uppercase_char": 77, "otuput": 77, "subtext": 77, "foudn": 77, "funciton": 77, "lowercas": 77, "uppercas": 77, "serach": 77, "za": 77, "alon": 77, "slight": 77, "oth": 77, "postprocesor": 77, "202": 77, "third": 77, "why": 77, "omit_kei": 77, "my_new_regex_label": 77, "task": 78, "checkout": 78, "paradigm": 78, "linst": 78, "transform": 78, "back": 78, "reth": 78, "meta": 78, "window": 78, "branch": 78, "secondari": 78, "conjunct": 78, "thie": 78, "null_perc": 78, "divid": 78, "sample_count": 78, "pr": 78, "baselin": 78, "concern": 78, "drive": 78, "mock": 78, "remot": 78, "backward": 78, "incompat": 78, "legaci": 78, "patch": 78, "refact": 78, "polar": 78, "spearman": 78, "workflow": 78, "3x": 79}, "objects": {"": [[6, 0, 0, "-", "dataprofiler"]], "dataprofiler": [[7, 0, 0, "-", "data_readers"], [19, 0, 0, "-", "dp_logging"], [20, 0, 0, "-", "labelers"], [32, 0, 0, "-", "plugins"], [34, 0, 0, "-", "profilers"], [57, 0, 0, "-", "reports"], [60, 0, 0, "-", "rng_utils"], [6, 5, 1, "", "set_seed"], [61, 0, 0, "-", "settings"], [62, 0, 0, "-", "validators"], [64, 0, 0, "-", "version"]], "dataprofiler.data_readers": [[8, 0, 0, "-", "avro_data"], [9, 0, 0, "-", "base_data"], [10, 0, 0, "-", "csv_data"], [11, 0, 0, "-", "data"], [12, 0, 0, "-", "data_utils"], [13, 0, 0, "-", "filepath_or_buffer"], [14, 0, 0, "-", "graph_data"], [15, 0, 0, "-", "json_data"], [16, 0, 0, "-", "parquet_data"], [17, 0, 0, "-", "structured_mixins"], [18, 0, 0, "-", "text_data"]], "dataprofiler.data_readers.avro_data": [[8, 1, 1, "", "AVROData"]], "dataprofiler.data_readers.avro_data.AVROData": [[8, 2, 1, "", "data"], [8, 2, 1, "", "data_and_metadata"], [8, 2, 1, "", "data_format"], [8, 3, 1, "", "data_type"], [8, 2, 1, "", "file_encoding"], [8, 4, 1, "", "get_batch_generator"], [8, 3, 1, "", "info"], [8, 4, 1, "", "is_match"], [8, 2, 1, "", "is_structured"], [8, 2, 1, "", "length"], 
[8, 2, 1, "", "metadata"], [8, 4, 1, "", "reload"], [8, 2, 1, "", "selected_keys"]], "dataprofiler.data_readers.base_data": [[9, 1, 1, "", "BaseData"]], "dataprofiler.data_readers.base_data.BaseData": [[9, 2, 1, "", "data"], [9, 2, 1, "", "data_format"], [9, 3, 1, "", "data_type"], [9, 2, 1, "", "file_encoding"], [9, 4, 1, "", "get_batch_generator"], [9, 3, 1, "", "info"], [9, 4, 1, "", "is_match"], [9, 2, 1, "", "is_structured"], [9, 2, 1, "", "length"], [9, 4, 1, "", "reload"]], "dataprofiler.data_readers.csv_data": [[10, 1, 1, "", "CSVData"]], "dataprofiler.data_readers.csv_data.CSVData": [[10, 2, 1, "", "data"], [10, 2, 1, "", "data_format"], [10, 3, 1, "", "data_type"], [10, 2, 1, "", "delimiter"], [10, 2, 1, "", "file_encoding"], [10, 4, 1, "", "get_batch_generator"], [10, 2, 1, "", "header"], [10, 3, 1, "", "info"], [10, 4, 1, "", "is_match"], [10, 2, 1, "", "is_structured"], [10, 2, 1, "", "length"], [10, 3, 1, "", "options"], [10, 2, 1, "", "quotechar"], [10, 4, 1, "", "reload"], [10, 2, 1, "", "sample_nrows"], [10, 2, 1, "", "selected_columns"]], "dataprofiler.data_readers.data": [[11, 1, 1, "", "Data"]], "dataprofiler.data_readers.data.Data": [[11, 3, 1, "", "data_classes"]], "dataprofiler.data_readers.data_utils": [[12, 1, 1, "", "S3Helper"], [12, 5, 1, "", "convert_int_to_string"], [12, 5, 1, "", "convert_unicode_col_to_utf8"], [12, 5, 1, "", "data_generator"], [12, 5, 1, "", "detect_cell_type"], [12, 5, 1, "", "detect_file_encoding"], [12, 5, 1, "", "find_nth_loc"], [12, 5, 1, "", "generator_on_file"], [12, 5, 1, "", "get_delimiter_regex"], [12, 5, 1, "", "is_valid_url"], [12, 5, 1, "", "json_to_dataframe"], [12, 5, 1, "", "load_as_str_from_file"], [12, 5, 1, "", "read_csv_df"], [12, 5, 1, "", "read_json"], [12, 5, 1, "", "read_json_df"], [12, 5, 1, "", "read_parquet_df"], [12, 5, 1, "", "read_text_as_list_of_strs"], [12, 5, 1, "", "reservoir"], [12, 5, 1, "", "rsample"], [12, 5, 1, "", "sample_parquet"], [12, 5, 1, "", "unicode_to_str"], [12, 5, 1, "", "url_to_bytes"]], "dataprofiler.data_readers.data_utils.S3Helper": [[12, 4, 1, "", "create_s3_client"], [12, 4, 1, "", "get_s3_uri"], [12, 4, 1, "", "is_s3_uri"]], "dataprofiler.data_readers.filepath_or_buffer": [[13, 1, 1, "", "FileOrBufferHandler"], [13, 5, 1, "", "is_stream_buffer"]], "dataprofiler.data_readers.graph_data": [[14, 1, 1, "", "GraphData"]], "dataprofiler.data_readers.graph_data.GraphData": [[14, 4, 1, "", "check_integer"], [14, 4, 1, "", "csv_column_names"], [14, 2, 1, "", "data"], [14, 2, 1, "", "data_format"], [14, 3, 1, "", "data_type"], [14, 2, 1, "", "file_encoding"], [14, 4, 1, "", "get_batch_generator"], [14, 3, 1, "", "info"], [14, 4, 1, "", "is_match"], [14, 2, 1, "", "is_structured"], [14, 2, 1, "", "length"], [14, 3, 1, "", "options"], [14, 4, 1, "", "reload"]], "dataprofiler.data_readers.json_data": [[15, 1, 1, "", "JSONData"]], "dataprofiler.data_readers.json_data.JSONData": [[15, 2, 1, "", "data"], [15, 2, 1, "", "data_and_metadata"], [15, 2, 1, "", "data_format"], [15, 3, 1, "", "data_type"], [15, 2, 1, "", "file_encoding"], [15, 4, 1, "", "get_batch_generator"], [15, 3, 1, "", "info"], [15, 4, 1, "", "is_match"], [15, 2, 1, "", "is_structured"], [15, 2, 1, "", "length"], [15, 2, 1, "", "metadata"], [15, 3, 1, "", "options"], [15, 4, 1, "", "reload"], [15, 2, 1, "", "selected_keys"]], "dataprofiler.data_readers.parquet_data": [[16, 1, 1, "", "ParquetData"]], "dataprofiler.data_readers.parquet_data.ParquetData": [[16, 2, 1, "", "data"], [16, 2, 1, "", "data_format"], [16, 3, 1, "", "data_type"], 
[16, 2, 1, "", "file_encoding"], [16, 4, 1, "", "get_batch_generator"], [16, 3, 1, "", "info"], [16, 4, 1, "", "is_match"], [16, 2, 1, "", "is_structured"], [16, 2, 1, "", "length"], [16, 3, 1, "", "options"], [16, 4, 1, "", "reload"], [16, 2, 1, "", "sample_nrows"], [16, 2, 1, "", "selected_columns"]], "dataprofiler.data_readers.structured_mixins": [[17, 1, 1, "", "SpreadSheetDataMixin"]], "dataprofiler.data_readers.text_data": [[18, 1, 1, "", "TextData"]], "dataprofiler.data_readers.text_data.TextData": [[18, 2, 1, "", "data"], [18, 2, 1, "", "data_format"], [18, 3, 1, "", "data_type"], [18, 2, 1, "", "file_encoding"], [18, 4, 1, "", "get_batch_generator"], [18, 3, 1, "", "info"], [18, 4, 1, "", "is_match"], [18, 2, 1, "", "is_structured"], [18, 2, 1, "", "length"], [18, 3, 1, "", "options"], [18, 4, 1, "", "reload"], [18, 2, 1, "", "samples_per_line"], [18, 4, 1, "", "tokenize"]], "dataprofiler.dp_logging": [[19, 5, 1, "", "get_child_logger"], [19, 5, 1, "", "get_logger"], [19, 5, 1, "", "set_verbosity"]], "dataprofiler.labelers": [[21, 0, 0, "-", "base_data_labeler"], [22, 0, 0, "-", "base_model"], [23, 0, 0, "-", "char_load_tf_model"], [24, 0, 0, "-", "character_level_cnn_model"], [25, 0, 0, "-", "classification_report_utils"], [26, 0, 0, "-", "column_name_model"], [27, 0, 0, "-", "data_labelers"], [28, 0, 0, "-", "data_processing"], [29, 0, 0, "-", "labeler_utils"], [30, 0, 0, "-", "regex_model"], [31, 0, 0, "-", "utils"]], "dataprofiler.labelers.base_data_labeler": [[21, 1, 1, "", "BaseDataLabeler"], [21, 1, 1, "", "TrainableDataLabeler"]], "dataprofiler.labelers.base_data_labeler.BaseDataLabeler": [[21, 4, 1, "", "add_label"], [21, 4, 1, "", "check_pipeline"], [21, 4, 1, "", "help"], [21, 2, 1, "", "label_mapping"], [21, 2, 1, "", "labels"], [21, 4, 1, "", "load_from_disk"], [21, 4, 1, "", "load_from_library"], [21, 4, 1, "", "load_with_components"], [21, 2, 1, "", "model"], [21, 2, 1, "", "postprocessor"], [21, 4, 1, "", "predict"], [21, 2, 1, "", "preprocessor"], [21, 2, 1, "", "reverse_label_mapping"], [21, 4, 1, "", "save_to_disk"], [21, 4, 1, "", "set_labels"], [21, 4, 1, "", "set_model"], [21, 4, 1, "", "set_params"], [21, 4, 1, "", "set_postprocessor"], [21, 4, 1, "", "set_preprocessor"]], "dataprofiler.labelers.base_data_labeler.TrainableDataLabeler": [[21, 4, 1, "", "add_label"], [21, 4, 1, "", "check_pipeline"], [21, 4, 1, "", "fit"], [21, 4, 1, "", "help"], [21, 2, 1, "", "label_mapping"], [21, 2, 1, "", "labels"], [21, 4, 1, "", "load_from_disk"], [21, 4, 1, "", "load_from_library"], [21, 4, 1, "", "load_with_components"], [21, 2, 1, "", "model"], [21, 2, 1, "", "postprocessor"], [21, 4, 1, "", "predict"], [21, 2, 1, "", "preprocessor"], [21, 2, 1, "", "reverse_label_mapping"], [21, 4, 1, "", "save_to_disk"], [21, 4, 1, "", "set_labels"], [21, 4, 1, "", "set_model"], [21, 4, 1, "", "set_params"], [21, 4, 1, "", "set_postprocessor"], [21, 4, 1, "", "set_preprocessor"]], "dataprofiler.labelers.base_model": [[22, 1, 1, "", "AutoSubRegistrationMeta"], [22, 1, 1, "", "BaseModel"], [22, 1, 1, "", "BaseTrainableModel"]], "dataprofiler.labelers.base_model.AutoSubRegistrationMeta": [[22, 4, 1, "", "mro"], [22, 4, 1, "", "register"]], "dataprofiler.labelers.base_model.BaseModel": [[22, 4, 1, "", "add_label"], [22, 4, 1, "", "get_class"], [22, 4, 1, "", "get_parameters"], [22, 4, 1, "", "help"], [22, 2, 1, "", "label_mapping"], [22, 2, 1, "", "labels"], [22, 4, 1, "", "load_from_disk"], [22, 2, 1, "", "num_labels"], [22, 4, 1, "", "predict"], [22, 3, 1, "", 
"requires_zero_mapping"], [22, 4, 1, "", "reset_weights"], [22, 2, 1, "", "reverse_label_mapping"], [22, 4, 1, "", "save_to_disk"], [22, 4, 1, "", "set_label_mapping"], [22, 4, 1, "", "set_params"]], "dataprofiler.labelers.base_model.BaseTrainableModel": [[22, 4, 1, "", "add_label"], [22, 4, 1, "", "fit"], [22, 4, 1, "", "get_class"], [22, 4, 1, "", "get_parameters"], [22, 4, 1, "", "help"], [22, 2, 1, "", "label_mapping"], [22, 2, 1, "", "labels"], [22, 4, 1, "", "load_from_disk"], [22, 2, 1, "", "num_labels"], [22, 4, 1, "", "predict"], [22, 3, 1, "", "requires_zero_mapping"], [22, 4, 1, "", "reset_weights"], [22, 2, 1, "", "reverse_label_mapping"], [22, 4, 1, "", "save_to_disk"], [22, 4, 1, "", "set_label_mapping"], [22, 4, 1, "", "set_params"]], "dataprofiler.labelers.char_load_tf_model": [[23, 1, 1, "", "CharLoadTFModel"]], "dataprofiler.labelers.char_load_tf_model.CharLoadTFModel": [[23, 4, 1, "", "add_label"], [23, 4, 1, "", "details"], [23, 4, 1, "", "fit"], [23, 4, 1, "", "get_class"], [23, 4, 1, "", "get_parameters"], [23, 4, 1, "", "help"], [23, 2, 1, "", "label_mapping"], [23, 2, 1, "", "labels"], [23, 4, 1, "", "load_from_disk"], [23, 2, 1, "", "num_labels"], [23, 4, 1, "", "predict"], [23, 3, 1, "", "requires_zero_mapping"], [23, 4, 1, "", "reset_weights"], [23, 2, 1, "", "reverse_label_mapping"], [23, 4, 1, "", "save_to_disk"], [23, 4, 1, "", "set_label_mapping"], [23, 4, 1, "", "set_params"]], "dataprofiler.labelers.character_level_cnn_model": [[24, 1, 1, "", "CharacterLevelCnnModel"], [24, 1, 1, "", "EncodingLayer"], [24, 1, 1, "", "ThreshArgMaxLayer"], [24, 5, 1, "", "build_embd_dictionary"], [24, 5, 1, "", "create_glove_char"]], "dataprofiler.labelers.character_level_cnn_model.CharacterLevelCnnModel": [[24, 4, 1, "", "add_label"], [24, 4, 1, "", "details"], [24, 4, 1, "", "fit"], [24, 4, 1, "", "get_class"], [24, 4, 1, "", "get_parameters"], [24, 4, 1, "", "help"], [24, 2, 1, "", "label_mapping"], [24, 2, 1, "", "labels"], [24, 4, 1, "", "load_from_disk"], [24, 2, 1, "", "num_labels"], [24, 4, 1, "", "predict"], [24, 3, 1, "", "requires_zero_mapping"], [24, 4, 1, "", "reset_weights"], [24, 2, 1, "", "reverse_label_mapping"], [24, 4, 1, "", "save_to_disk"], [24, 4, 1, "", "set_label_mapping"], [24, 4, 1, "", "set_params"]], "dataprofiler.labelers.character_level_cnn_model.EncodingLayer": [[24, 4, 1, "", "add_loss"], [24, 4, 1, "", "add_metric"], [24, 4, 1, "", "add_variable"], [24, 4, 1, "", "add_weight"], [24, 4, 1, "", "build"], [24, 4, 1, "", "build_from_config"], [24, 4, 1, "", "call"], [24, 2, 1, "", "compute_dtype"], [24, 4, 1, "", "compute_mask"], [24, 4, 1, "", "compute_output_shape"], [24, 4, 1, "", "compute_output_spec"], [24, 4, 1, "", "count_params"], [24, 2, 1, "", "dtype"], [24, 2, 1, "", "dtype_policy"], [24, 4, 1, "", "from_config"], [24, 4, 1, "", "get_build_config"], [24, 4, 1, "", "get_config"], [24, 4, 1, "", "get_weights"], [24, 2, 1, "", "input"], [24, 2, 1, "", "input_dtype"], [24, 2, 1, "", "input_spec"], [24, 4, 1, "", "load_own_variables"], [24, 2, 1, "", "losses"], [24, 2, 1, "", "metrics"], [24, 2, 1, "", "metrics_variables"], [24, 2, 1, "", "non_trainable_variables"], [24, 2, 1, "", "non_trainable_weights"], [24, 2, 1, "", "output"], [24, 4, 1, "", "quantize"], [24, 4, 1, "", "quantized_call"], [24, 4, 1, "", "save_own_variables"], [24, 4, 1, "", "set_weights"], [24, 4, 1, "", "stateless_call"], [24, 2, 1, "", "supports_masking"], [24, 4, 1, "", "symbolic_call"], [24, 2, 1, "", "trainable"], [24, 2, 1, "", "trainable_variables"], [24, 2, 1, 
"", "trainable_weights"], [24, 2, 1, "", "variable_dtype"], [24, 2, 1, "", "variables"], [24, 2, 1, "", "weights"]], "dataprofiler.labelers.character_level_cnn_model.ThreshArgMaxLayer": [[24, 4, 1, "", "add_loss"], [24, 4, 1, "", "add_metric"], [24, 4, 1, "", "add_variable"], [24, 4, 1, "", "add_weight"], [24, 4, 1, "", "build"], [24, 4, 1, "", "build_from_config"], [24, 4, 1, "", "call"], [24, 2, 1, "", "compute_dtype"], [24, 4, 1, "", "compute_mask"], [24, 4, 1, "", "compute_output_shape"], [24, 4, 1, "", "compute_output_spec"], [24, 4, 1, "", "count_params"], [24, 2, 1, "", "dtype"], [24, 2, 1, "", "dtype_policy"], [24, 4, 1, "", "from_config"], [24, 4, 1, "", "get_build_config"], [24, 4, 1, "", "get_config"], [24, 4, 1, "", "get_weights"], [24, 2, 1, "", "input"], [24, 2, 1, "", "input_dtype"], [24, 2, 1, "", "input_spec"], [24, 4, 1, "", "load_own_variables"], [24, 2, 1, "", "losses"], [24, 2, 1, "", "metrics"], [24, 2, 1, "", "metrics_variables"], [24, 2, 1, "", "non_trainable_variables"], [24, 2, 1, "", "non_trainable_weights"], [24, 2, 1, "", "output"], [24, 4, 1, "", "quantize"], [24, 4, 1, "", "quantized_call"], [24, 4, 1, "", "save_own_variables"], [24, 4, 1, "", "set_weights"], [24, 4, 1, "", "stateless_call"], [24, 2, 1, "", "supports_masking"], [24, 4, 1, "", "symbolic_call"], [24, 2, 1, "", "trainable"], [24, 2, 1, "", "trainable_variables"], [24, 2, 1, "", "trainable_weights"], [24, 2, 1, "", "variable_dtype"], [24, 2, 1, "", "variables"], [24, 2, 1, "", "weights"]], "dataprofiler.labelers.classification_report_utils": [[25, 5, 1, "", "classification_report"], [25, 5, 1, "", "convert_confusion_matrix_to_MCM"], [25, 5, 1, "", "precision_recall_fscore_support"]], "dataprofiler.labelers.column_name_model": [[26, 1, 1, "", "ColumnNameModel"]], "dataprofiler.labelers.column_name_model.ColumnNameModel": [[26, 4, 1, "", "add_label"], [26, 4, 1, "", "get_class"], [26, 4, 1, "", "get_parameters"], [26, 4, 1, "", "help"], [26, 2, 1, "", "label_mapping"], [26, 2, 1, "", "labels"], [26, 4, 1, "", "load_from_disk"], [26, 2, 1, "", "num_labels"], [26, 4, 1, "", "predict"], [26, 3, 1, "", "requires_zero_mapping"], [26, 4, 1, "", "reset_weights"], [26, 2, 1, "", "reverse_label_mapping"], [26, 4, 1, "", "save_to_disk"], [26, 4, 1, "", "set_label_mapping"], [26, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_labelers": [[27, 1, 1, "", "DataLabeler"], [27, 1, 1, "", "StructuredDataLabeler"], [27, 1, 1, "", "UnstructuredDataLabeler"], [27, 5, 1, "", "train_structured_labeler"]], "dataprofiler.labelers.data_labelers.DataLabeler": [[27, 3, 1, "", "labeler_classes"], [27, 4, 1, "", "load_from_disk"], [27, 4, 1, "", "load_from_library"], [27, 4, 1, "", "load_with_components"]], "dataprofiler.labelers.data_labelers.StructuredDataLabeler": [[27, 4, 1, "", "add_label"], [27, 4, 1, "", "check_pipeline"], [27, 4, 1, "", "help"], [27, 2, 1, "", "label_mapping"], [27, 2, 1, "", "labels"], [27, 4, 1, "", "load_from_disk"], [27, 4, 1, "", "load_from_library"], [27, 4, 1, "", "load_with_components"], [27, 2, 1, "", "model"], [27, 2, 1, "", "postprocessor"], [27, 4, 1, "", "predict"], [27, 2, 1, "", "preprocessor"], [27, 2, 1, "", "reverse_label_mapping"], [27, 4, 1, "", "save_to_disk"], [27, 4, 1, "", "set_labels"], [27, 4, 1, "", "set_model"], [27, 4, 1, "", "set_params"], [27, 4, 1, "", "set_postprocessor"], [27, 4, 1, "", "set_preprocessor"]], "dataprofiler.labelers.data_labelers.UnstructuredDataLabeler": [[27, 4, 1, "", "add_label"], [27, 4, 1, "", "check_pipeline"], [27, 4, 1, "", "help"], [27, 
2, 1, "", "label_mapping"], [27, 2, 1, "", "labels"], [27, 4, 1, "", "load_from_disk"], [27, 4, 1, "", "load_from_library"], [27, 4, 1, "", "load_with_components"], [27, 2, 1, "", "model"], [27, 2, 1, "", "postprocessor"], [27, 4, 1, "", "predict"], [27, 2, 1, "", "preprocessor"], [27, 2, 1, "", "reverse_label_mapping"], [27, 4, 1, "", "save_to_disk"], [27, 4, 1, "", "set_labels"], [27, 4, 1, "", "set_model"], [27, 4, 1, "", "set_params"], [27, 4, 1, "", "set_postprocessor"], [27, 4, 1, "", "set_preprocessor"]], "dataprofiler.labelers.data_processing": [[28, 1, 1, "", "AutoSubRegistrationMeta"], [28, 1, 1, "", "BaseDataPostprocessor"], [28, 1, 1, "", "BaseDataPreprocessor"], [28, 1, 1, "", "BaseDataProcessor"], [28, 1, 1, "", "CharEncodedPreprocessor"], [28, 1, 1, "", "CharPostprocessor"], [28, 1, 1, "", "CharPreprocessor"], [28, 1, 1, "", "ColumnNameModelPostprocessor"], [28, 1, 1, "", "DirectPassPreprocessor"], [28, 1, 1, "", "RegexPostProcessor"], [28, 1, 1, "", "StructCharPostprocessor"], [28, 1, 1, "", "StructCharPreprocessor"], [28, 1, 1, "", "StructRegexPostProcessor"]], "dataprofiler.labelers.data_processing.AutoSubRegistrationMeta": [[28, 4, 1, "", "mro"], [28, 4, 1, "", "register"]], "dataprofiler.labelers.data_processing.BaseDataPostprocessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.BaseDataPreprocessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.BaseDataProcessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.CharEncodedPreprocessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.CharPostprocessor": [[28, 4, 1, "", "convert_to_NER_format"], [28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "match_sentence_lengths"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.CharPreprocessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.ColumnNameModelPostprocessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], 
[28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.DirectPassPreprocessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.RegexPostProcessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "priority_prediction"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"], [28, 4, 1, "", "split_prediction"]], "dataprofiler.labelers.data_processing.StructCharPostprocessor": [[28, 4, 1, "", "convert_to_structured_analysis"], [28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "match_sentence_lengths"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.StructCharPreprocessor": [[28, 4, 1, "", "convert_to_unstructured_format"], [28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.data_processing.StructRegexPostProcessor": [[28, 4, 1, "", "get_class"], [28, 4, 1, "", "get_parameters"], [28, 4, 1, "", "help"], [28, 4, 1, "", "load_from_disk"], [28, 4, 1, "", "load_from_library"], [28, 4, 1, "", "process"], [28, 3, 1, "", "processor_type"], [28, 4, 1, "", "save_to_disk"], [28, 4, 1, "", "set_params"]], "dataprofiler.labelers.labeler_utils": [[29, 1, 1, "", "F1Score"], [29, 1, 1, "", "FBetaScore"], [29, 5, 1, "", "evaluate_accuracy"], [29, 5, 1, "", "f1_report_dict_to_str"], [29, 5, 1, "", "get_tf_layer_index_from_name"], [29, 5, 1, "", "hide_tf_logger_warnings"], [29, 5, 1, "", "protected_register_keras_serializable"]], "dataprofiler.labelers.labeler_utils.F1Score": [[29, 4, 1, "", "add_variable"], [29, 4, 1, "", "add_weight"], [29, 2, 1, "", "dtype"], [29, 4, 1, "", "from_config"], [29, 4, 1, "", "get_config"], [29, 4, 1, "", "reset_state"], [29, 4, 1, "", "result"], [29, 4, 1, "", "stateless_reset_state"], [29, 4, 1, "", "stateless_result"], [29, 4, 1, "", "stateless_update_state"], [29, 4, 1, "", "update_state"], [29, 2, 1, "", "variables"]], "dataprofiler.labelers.labeler_utils.FBetaScore": [[29, 4, 1, "", "add_variable"], [29, 4, 1, "", "add_weight"], [29, 2, 1, "", "dtype"], [29, 4, 1, "", "from_config"], [29, 4, 1, "", "get_config"], [29, 4, 1, "", "reset_state"], [29, 4, 1, "", "result"], [29, 4, 1, "", "stateless_reset_state"], [29, 4, 1, "", "stateless_result"], [29, 4, 1, "", "stateless_update_state"], [29, 4, 1, "", "update_state"], [29, 2, 1, "", "variables"]], "dataprofiler.labelers.regex_model": [[30, 1, 1, "", "RegexModel"]], "dataprofiler.labelers.regex_model.RegexModel": [[30, 4, 1, "", "add_label"], [30, 4, 1, "", "get_class"], [30, 4, 1, "", "get_parameters"], [30, 4, 1, "", "help"], [30, 2, 1, 
"", "label_mapping"], [30, 2, 1, "", "labels"], [30, 4, 1, "", "load_from_disk"], [30, 2, 1, "", "num_labels"], [30, 4, 1, "", "predict"], [30, 3, 1, "", "requires_zero_mapping"], [30, 4, 1, "", "reset_weights"], [30, 2, 1, "", "reverse_label_mapping"], [30, 4, 1, "", "save_to_disk"], [30, 4, 1, "", "set_label_mapping"], [30, 4, 1, "", "set_params"]], "dataprofiler.labelers.utils": [[31, 5, 1, "", "require_module"], [31, 5, 1, "", "warn_missing_module"]], "dataprofiler.plugins": [[33, 0, 0, "-", "decorators"], [32, 5, 1, "", "get_plugins"], [32, 5, 1, "", "load_plugins"]], "dataprofiler.plugins.decorators": [[33, 5, 1, "", "plugin_decorator"]], "dataprofiler.profilers": [[35, 0, 0, "-", "base_column_profilers"], [36, 0, 0, "-", "categorical_column_profile"], [37, 0, 0, "-", "column_profile_compilers"], [38, 0, 0, "-", "data_labeler_column_profile"], [39, 0, 0, "-", "datetime_column_profile"], [40, 0, 0, "-", "float_column_profile"], [41, 0, 0, "-", "graph_profiler"], [42, 0, 0, "-", "helpers"], [44, 0, 0, "-", "histogram_utils"], [45, 0, 0, "-", "int_column_profile"], [46, 0, 0, "-", "json_decoder"], [47, 0, 0, "-", "json_encoder"], [48, 0, 0, "-", "numerical_column_stats"], [49, 0, 0, "-", "order_column_profile"], [50, 0, 0, "-", "profile_builder"], [51, 0, 0, "-", "profiler_options"], [52, 0, 0, "-", "profiler_utils"], [53, 0, 0, "-", "text_column_profile"], [54, 0, 0, "-", "unstructured_labeler_profile"], [55, 0, 0, "-", "unstructured_text_profile"]], "dataprofiler.profilers.base_column_profilers": [[35, 1, 1, "", "BaseColumnPrimitiveTypeProfiler"], [35, 1, 1, "", "BaseColumnProfiler"]], "dataprofiler.profilers.base_column_profilers.BaseColumnPrimitiveTypeProfiler": [[35, 3, 1, "", "col_type"], [35, 4, 1, "", "diff"], [35, 4, 1, "", "load_from_dict"], [35, 3, 1, "", "metadata"], [35, 3, 1, "", "name"], [35, 2, 1, "", "profile"], [35, 4, 1, "", "report"], [35, 3, 1, "", "sample_size"], [35, 3, 1, "", "thread_safe"], [35, 3, 1, "", "times"], [35, 4, 1, "", "update"]], "dataprofiler.profilers.base_column_profilers.BaseColumnProfiler": [[35, 3, 1, "", "col_type"], [35, 4, 1, "", "diff"], [35, 4, 1, "", "load_from_dict"], [35, 2, 1, "", "profile"], [35, 4, 1, "", "report"], [35, 4, 1, "", "update"]], "dataprofiler.profilers.categorical_column_profile": [[36, 1, 1, "", "CategoricalColumn"]], "dataprofiler.profilers.categorical_column_profile.CategoricalColumn": [[36, 2, 1, "", "categorical_counts"], [36, 2, 1, "", "categories"], [36, 3, 1, "", "col_type"], [36, 4, 1, "", "diff"], [36, 2, 1, "", "gini_impurity"], [36, 2, 1, "", "is_match"], [36, 4, 1, "", "load_from_dict"], [36, 3, 1, "", "metadata"], [36, 3, 1, "", "name"], [36, 2, 1, "", "profile"], [36, 4, 1, "", "report"], [36, 3, 1, "", "sample_size"], [36, 3, 1, "", "thread_safe"], [36, 3, 1, "", "times"], [36, 3, 1, "", "type"], [36, 2, 1, "", "unalikeability"], [36, 2, 1, "", "unique_count"], [36, 2, 1, "", "unique_ratio"], [36, 4, 1, "", "update"]], "dataprofiler.profilers.column_profile_compilers": [[37, 1, 1, "", "BaseCompiler"], [37, 1, 1, "", "ColumnDataLabelerCompiler"], [37, 1, 1, "", "ColumnPrimitiveTypeProfileCompiler"], [37, 1, 1, "", "ColumnStatsProfileCompiler"], [37, 1, 1, "", "UnstructuredCompiler"]], "dataprofiler.profilers.column_profile_compilers.BaseCompiler": [[37, 4, 1, "", "diff"], [37, 4, 1, "", "load_from_dict"], [37, 2, 1, "", "profile"], [37, 4, 1, "", "report"], [37, 4, 1, "", "update_profile"]], "dataprofiler.profilers.column_profile_compilers.ColumnDataLabelerCompiler": [[37, 4, 1, "", "diff"], [37, 4, 1, 
"", "load_from_dict"], [37, 2, 1, "", "profile"], [37, 4, 1, "", "report"], [37, 4, 1, "", "update_profile"]], "dataprofiler.profilers.column_profile_compilers.ColumnPrimitiveTypeProfileCompiler": [[37, 4, 1, "", "diff"], [37, 4, 1, "", "load_from_dict"], [37, 2, 1, "", "profile"], [37, 4, 1, "", "report"], [37, 2, 1, "", "selected_data_type"], [37, 4, 1, "", "update_profile"]], "dataprofiler.profilers.column_profile_compilers.ColumnStatsProfileCompiler": [[37, 4, 1, "", "diff"], [37, 4, 1, "", "load_from_dict"], [37, 2, 1, "", "profile"], [37, 4, 1, "", "report"], [37, 4, 1, "", "update_profile"]], "dataprofiler.profilers.column_profile_compilers.UnstructuredCompiler": [[37, 4, 1, "", "diff"], [37, 4, 1, "", "load_from_dict"], [37, 2, 1, "", "profile"], [37, 4, 1, "", "report"], [37, 4, 1, "", "update_profile"]], "dataprofiler.profilers.data_labeler_column_profile": [[38, 1, 1, "", "DataLabelerColumn"]], "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn": [[38, 4, 1, "", "assert_equal_conditions"], [38, 2, 1, "", "avg_predictions"], [38, 3, 1, "", "col_type"], [38, 2, 1, "", "data_label"], [38, 4, 1, "", "diff"], [38, 2, 1, "", "label_representation"], [38, 4, 1, "", "load_from_dict"], [38, 3, 1, "", "metadata"], [38, 3, 1, "", "name"], [38, 2, 1, "", "possible_data_labels"], [38, 2, 1, "", "profile"], [38, 2, 1, "", "rank_distribution"], [38, 4, 1, "", "report"], [38, 2, 1, "", "reverse_label_mapping"], [38, 3, 1, "", "sample_size"], [38, 2, 1, "", "sum_predictions"], [38, 3, 1, "", "thread_safe"], [38, 3, 1, "", "times"], [38, 3, 1, "", "type"], [38, 4, 1, "", "update"]], "dataprofiler.profilers.datetime_column_profile": [[39, 1, 1, "", "DateTimeColumn"]], "dataprofiler.profilers.datetime_column_profile.DateTimeColumn": [[39, 3, 1, "", "col_type"], [39, 2, 1, "", "data_type_ratio"], [39, 4, 1, "", "diff"], [39, 4, 1, "", "load_from_dict"], [39, 3, 1, "", "match_count"], [39, 3, 1, "", "metadata"], [39, 3, 1, "", "name"], [39, 2, 1, "", "profile"], [39, 4, 1, "", "report"], [39, 3, 1, "", "sample_size"], [39, 3, 1, "", "thread_safe"], [39, 3, 1, "", "times"], [39, 3, 1, "", "type"], [39, 4, 1, "", "update"]], "dataprofiler.profilers.float_column_profile": [[40, 1, 1, "", "FloatColumn"]], "dataprofiler.profilers.float_column_profile.FloatColumn": [[40, 3, 1, "", "col_type"], [40, 2, 1, "", "data_type_ratio"], [40, 4, 1, "", "diff"], [40, 4, 1, "", "is_float"], [40, 4, 1, "", "is_int"], [40, 2, 1, "", "kurtosis"], [40, 4, 1, "", "load_from_dict"], [40, 3, 1, "", "match_count"], [40, 2, 1, "", "mean"], [40, 2, 1, "", "median"], [40, 2, 1, "", "median_abs_deviation"], [40, 3, 1, "", "metadata"], [40, 2, 1, "", "mode"], [40, 3, 1, "", "name"], [40, 4, 1, "", "np_type_to_type"], [40, 2, 1, "", "precision"], [40, 2, 1, "", "profile"], [40, 4, 1, "", "report"], [40, 3, 1, "", "sample_size"], [40, 2, 1, "", "skewness"], [40, 2, 1, "", "stddev"], [40, 3, 1, "", "thread_safe"], [40, 3, 1, "", "times"], [40, 3, 1, "", "type"], [40, 4, 1, "", "update"], [40, 2, 1, "", "variance"]], "dataprofiler.profilers.graph_profiler": [[41, 1, 1, "", "GraphProfiler"]], "dataprofiler.profilers.graph_profiler.GraphProfiler": [[41, 4, 1, "", "diff"], [41, 4, 1, "", "load"], [41, 2, 1, "", "profile"], [41, 4, 1, "", "report"], [41, 4, 1, "", "save"], [41, 3, 1, "", "times"], [41, 4, 1, "", "update"]], "dataprofiler.profilers.helpers": [[42, 5, 1, "", "calculate_quantiles"], [43, 0, 0, "-", "report_helpers"]], "dataprofiler.profilers.helpers.report_helpers": [[43, 5, 1, "", "calculate_quantiles"], 
[43, 5, 1, "", "flat_dict"]], "dataprofiler.profilers.int_column_profile": [[45, 1, 1, "", "IntColumn"]], "dataprofiler.profilers.int_column_profile.IntColumn": [[45, 3, 1, "", "col_type"], [45, 2, 1, "", "data_type_ratio"], [45, 4, 1, "", "diff"], [45, 4, 1, "", "is_float"], [45, 4, 1, "", "is_int"], [45, 2, 1, "", "kurtosis"], [45, 4, 1, "", "load_from_dict"], [45, 3, 1, "", "match_count"], [45, 2, 1, "", "mean"], [45, 2, 1, "", "median"], [45, 2, 1, "", "median_abs_deviation"], [45, 3, 1, "", "metadata"], [45, 2, 1, "", "mode"], [45, 3, 1, "", "name"], [45, 4, 1, "", "np_type_to_type"], [45, 2, 1, "", "profile"], [45, 4, 1, "", "report"], [45, 3, 1, "", "sample_size"], [45, 2, 1, "", "skewness"], [45, 2, 1, "", "stddev"], [45, 3, 1, "", "thread_safe"], [45, 3, 1, "", "times"], [45, 3, 1, "", "type"], [45, 4, 1, "", "update"], [45, 2, 1, "", "variance"]], "dataprofiler.profilers.json_decoder": [[46, 5, 1, "", "get_column_profiler_class"], [46, 5, 1, "", "get_compiler_class"], [46, 5, 1, "", "get_option_class"], [46, 5, 1, "", "get_profiler_class"], [46, 5, 1, "", "get_structured_col_profiler_class"], [46, 5, 1, "", "load_column_profile"], [46, 5, 1, "", "load_compiler"], [46, 5, 1, "", "load_option"], [46, 5, 1, "", "load_profiler"], [46, 5, 1, "", "load_structured_col_profiler"]], "dataprofiler.profilers.json_encoder": [[47, 1, 1, "", "ProfileEncoder"]], "dataprofiler.profilers.json_encoder.ProfileEncoder": [[47, 4, 1, "", "default"], [47, 4, 1, "", "encode"], [47, 3, 1, "", "item_separator"], [47, 4, 1, "", "iterencode"], [47, 3, 1, "", "key_separator"]], "dataprofiler.profilers.numerical_column_stats": [[48, 1, 1, "", "NumericStatsMixin"], [48, 1, 1, "", "abstractstaticmethod"]], "dataprofiler.profilers.numerical_column_stats.NumericStatsMixin": [[48, 3, 1, "", "col_type"], [48, 4, 1, "", "diff"], [48, 4, 1, "", "is_float"], [48, 4, 1, "", "is_int"], [48, 2, 1, "", "kurtosis"], [48, 4, 1, "", "load_from_dict"], [48, 2, 1, "", "mean"], [48, 2, 1, "", "median"], [48, 2, 1, "", "median_abs_deviation"], [48, 3, 1, "", "metadata"], [48, 2, 1, "", "mode"], [48, 3, 1, "", "name"], [48, 4, 1, "", "np_type_to_type"], [48, 4, 1, "", "profile"], [48, 4, 1, "", "report"], [48, 3, 1, "", "sample_size"], [48, 2, 1, "", "skewness"], [48, 2, 1, "", "stddev"], [48, 3, 1, "", "thread_safe"], [48, 3, 1, "", "times"], [48, 3, 1, "", "type"], [48, 4, 1, "", "update"], [48, 2, 1, "", "variance"]], "dataprofiler.profilers.order_column_profile": [[49, 1, 1, "", "Comparable"], [49, 1, 1, "", "OrderColumn"]], "dataprofiler.profilers.order_column_profile.OrderColumn": [[49, 3, 1, "", "col_type"], [49, 4, 1, "", "diff"], [49, 4, 1, "", "load_from_dict"], [49, 3, 1, "", "metadata"], [49, 3, 1, "", "name"], [49, 2, 1, "", "profile"], [49, 4, 1, "", "report"], [49, 3, 1, "", "sample_size"], [49, 3, 1, "", "thread_safe"], [49, 3, 1, "", "times"], [49, 3, 1, "", "type"], [49, 4, 1, "", "update"]], "dataprofiler.profilers.profile_builder": [[50, 1, 1, "", "BaseProfiler"], [50, 1, 1, "", "Profiler"], [50, 1, 1, "", "StructuredColProfiler"], [50, 1, 1, "", "StructuredProfiler"], [50, 1, 1, "", "UnstructuredProfiler"]], "dataprofiler.profilers.profile_builder.BaseProfiler": [[50, 4, 1, "", "diff"], [50, 4, 1, "", "load"], [50, 4, 1, "", "load_from_dict"], [50, 2, 1, "", "profile"], [50, 4, 1, "", "report"], [50, 4, 1, "", "save"], [50, 4, 1, "", "update_profile"]], "dataprofiler.profilers.profile_builder.Profiler": [[50, 4, 1, "", "load"]], "dataprofiler.profilers.profile_builder.StructuredColProfiler": [[50, 4, 1, "", 
"clean_data_and_get_base_stats"], [50, 4, 1, "", "diff"], [50, 4, 1, "", "load_from_dict"], [50, 2, 1, "", "profile"], [50, 4, 1, "", "report"], [50, 4, 1, "", "update_column_profilers"], [50, 4, 1, "", "update_profile"]], "dataprofiler.profilers.profile_builder.StructuredProfiler": [[50, 4, 1, "", "diff"], [50, 4, 1, "", "load"], [50, 4, 1, "", "load_from_dict"], [50, 2, 1, "", "profile"], [50, 4, 1, "", "report"], [50, 4, 1, "", "save"], [50, 4, 1, "", "update_profile"]], "dataprofiler.profilers.profile_builder.UnstructuredProfiler": [[50, 4, 1, "", "diff"], [50, 4, 1, "", "load"], [50, 4, 1, "", "load_from_dict"], [50, 2, 1, "", "profile"], [50, 4, 1, "", "report"], [50, 4, 1, "", "save"], [50, 4, 1, "", "update_profile"]], "dataprofiler.profilers.profiler_options": [[51, 1, 1, "", "BaseInspectorOptions"], [51, 1, 1, "", "BaseOption"], [51, 1, 1, "", "BooleanOption"], [51, 1, 1, "", "CategoricalOptions"], [51, 1, 1, "", "CorrelationOptions"], [51, 1, 1, "", "DataLabelerOptions"], [51, 1, 1, "", "DateTimeOptions"], [51, 1, 1, "", "FloatOptions"], [51, 1, 1, "", "HistogramAndQuantilesOption"], [51, 1, 1, "", "HyperLogLogOptions"], [51, 1, 1, "", "IntOptions"], [51, 1, 1, "", "ModeOption"], [51, 1, 1, "", "NumericalOptions"], [51, 1, 1, "", "OrderOptions"], [51, 1, 1, "", "PrecisionOptions"], [51, 1, 1, "", "ProfilerOptions"], [51, 1, 1, "", "RowStatisticsOptions"], [51, 1, 1, "", "StructuredOptions"], [51, 1, 1, "", "TextOptions"], [51, 1, 1, "", "TextProfilerOptions"], [51, 1, 1, "", "UniqueCountOptions"], [51, 1, 1, "", "UnstructuredOptions"]], "dataprofiler.profilers.profiler_options.BaseInspectorOptions": [[51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.BaseOption": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.BooleanOption": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.CategoricalOptions": [[51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.CorrelationOptions": [[51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.DataLabelerOptions": [[51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.DateTimeOptions": [[51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.FloatOptions": [[51, 2, 1, "", "is_numeric_stats_enabled"], [51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.HistogramAndQuantilesOption": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.HyperLogLogOptions": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], 
[51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.IntOptions": [[51, 2, 1, "", "is_numeric_stats_enabled"], [51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.ModeOption": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.NumericalOptions": [[51, 2, 1, "", "is_numeric_stats_enabled"], [51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.OrderOptions": [[51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.PrecisionOptions": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.ProfilerOptions": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.RowStatisticsOptions": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.StructuredOptions": [[51, 2, 1, "", "enabled_profiles"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.TextOptions": [[51, 2, 1, "", "is_numeric_stats_enabled"], [51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.TextProfilerOptions": [[51, 4, 1, "", "is_prop_enabled"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.UniqueCountOptions": [[51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_options.UnstructuredOptions": [[51, 2, 1, "", "enabled_profiles"], [51, 4, 1, "", "load_from_dict"], [51, 2, 1, "", "properties"], [51, 4, 1, "", "set"], [51, 4, 1, "", "validate"]], "dataprofiler.profilers.profiler_utils": [[52, 1, 1, "", "KeyDict"], [52, 1, 1, "", "Subtractable"], [52, 5, 1, "", "add_nested_dictionaries"], [52, 5, 1, "", "auto_multiprocess_toggle"], [52, 5, 1, "", "biased_kurt"], [52, 5, 1, "", "biased_skew"], [52, 5, 1, "", "chunk"], [52, 5, 1, "", "find_diff_of_dates"], [52, 5, 1, "", "find_diff_of_dicts"], [52, 5, 1, "", "find_diff_of_dicts_with_diff_keys"], [52, 5, 1, "", "find_diff_of_lists_and_sets"], [52, 5, 1, "", "find_diff_of_matrices"], [52, 5, 1, "", "find_diff_of_numbers"], [52, 5, 1, "", "find_diff_of_strings_and_bools"], [52, 5, 1, "", "generate_pool"], [52, 5, 1, "", "get_memory_size"], [52, 5, 1, "", "merge"], [52, 5, 1, "", "merge_profile_list"], [52, 5, 1, "", "method_timeit"], [52, 5, 1, "", "overlap"], [52, 5, 1, "", "partition"], [52, 5, 1, "", "perform_chi_squared_test_for_homogeneity"], [52, 5, 1, "", "recursive_dict_update"], [52, 5, 1, "", "reload_labeler_from_options_or_get_new"], [52, 5, 1, "", "shuffle_in_chunks"], [52, 5, 1, 
"", "suggest_pool_size"], [52, 5, 1, "", "warn_on_profile"]], "dataprofiler.profilers.profiler_utils.KeyDict": [[52, 4, 1, "", "clear"], [52, 4, 1, "", "copy"], [52, 3, 1, "", "default_factory"], [52, 4, 1, "", "fromkeys"], [52, 4, 1, "", "get"], [52, 4, 1, "", "items"], [52, 4, 1, "", "keys"], [52, 4, 1, "", "pop"], [52, 4, 1, "", "popitem"], [52, 4, 1, "", "setdefault"], [52, 4, 1, "", "update"], [52, 4, 1, "", "values"]], "dataprofiler.profilers.text_column_profile": [[53, 1, 1, "", "TextColumn"]], "dataprofiler.profilers.text_column_profile.TextColumn": [[53, 3, 1, "", "bias_correction"], [53, 3, 1, "", "col_type"], [53, 2, 1, "", "data_type_ratio"], [53, 4, 1, "", "diff"], [53, 3, 1, "", "histogram_bin_method_names"], [53, 3, 1, "", "histogram_methods"], [53, 3, 1, "", "histogram_selection"], [53, 4, 1, "", "is_float"], [53, 4, 1, "", "is_int"], [53, 2, 1, "", "kurtosis"], [53, 4, 1, "", "load_from_dict"], [53, 3, 1, "", "match_count"], [53, 3, 1, "", "max"], [53, 3, 1, "", "max_histogram_bin"], [53, 2, 1, "", "mean"], [53, 2, 1, "", "median"], [53, 2, 1, "", "median_abs_deviation"], [53, 3, 1, "", "metadata"], [53, 3, 1, "", "min"], [53, 3, 1, "", "min_histogram_bin"], [53, 2, 1, "", "mode"], [53, 3, 1, "", "name"], [53, 4, 1, "", "np_type_to_type"], [53, 3, 1, "", "num_negatives"], [53, 3, 1, "", "num_zeros"], [53, 2, 1, "", "profile"], [53, 3, 1, "", "quantiles"], [53, 4, 1, "", "report"], [53, 3, 1, "", "sample_size"], [53, 2, 1, "", "skewness"], [53, 2, 1, "", "stddev"], [53, 3, 1, "", "sum"], [53, 3, 1, "", "thread_safe"], [53, 3, 1, "", "times"], [53, 3, 1, "", "type"], [53, 4, 1, "", "update"], [53, 3, 1, "", "user_set_histogram_bin"], [53, 2, 1, "", "variance"]], "dataprofiler.profilers.unstructured_labeler_profile": [[54, 1, 1, "", "UnstructuredLabelerProfile"]], "dataprofiler.profilers.unstructured_labeler_profile.UnstructuredLabelerProfile": [[54, 4, 1, "", "diff"], [54, 2, 1, "", "label_encoding"], [54, 2, 1, "", "profile"], [54, 4, 1, "", "report"], [54, 3, 1, "", "type"], [54, 4, 1, "", "update"]], "dataprofiler.profilers.unstructured_text_profile": [[55, 1, 1, "", "TextProfiler"]], "dataprofiler.profilers.unstructured_text_profile.TextProfiler": [[55, 4, 1, "", "diff"], [55, 2, 1, "", "profile"], [55, 4, 1, "", "report"], [55, 3, 1, "", "type"], [55, 4, 1, "", "update"]], "dataprofiler.reports": [[58, 0, 0, "-", "graphs"], [59, 0, 0, "-", "utils"]], "dataprofiler.reports.utils": [[59, 5, 1, "", "require_module"], [59, 5, 1, "", "warn_missing_module"]], "dataprofiler.rng_utils": [[60, 5, 1, "", "get_random_number_generator"]], "dataprofiler.validators": [[63, 0, 0, "-", "base_validators"]], "dataprofiler.validators.base_validators": [[63, 1, 1, "", "Validator"], [63, 5, 1, "", "is_in_list"], [63, 5, 1, "", "is_in_range"]], "dataprofiler.validators.base_validators.Validator": [[63, 4, 1, "", "get"], [63, 4, 1, "", "validate"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:property", "3": "py:attribute", "4": "py:method", "5": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "property", "Python property"], "3": ["py", "attribute", "Python attribute"], "4": ["py", "method", "Python method"], "5": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "content": 0, "ad": 1, "new": [1, 2, 3, 77, 78], "model": [1, 2, 3, 22, 23, 24, 26, 30, 78], "exist": [1, 2, 3, 77], "datalabel": [1, 77], "pipelin": [1, 66], "dataset": 1, "implement": 1, "charact": [1, 24], "level": [1, 24], "lstm": 
1, "integr": [1, 77], "columnnam": 2, "label": [2, 3, 20, 21, 27, 29, 38, 54, 68, 70, 77], "tutori": 2, "load": [2, 3, 4, 23, 66, 68, 70, 75, 76, 77, 79], "predict": [2, 70, 77], "us": [2, 5, 74, 77], "pre": [2, 77], "load_from_librari": 2, "column": [2, 4, 26, 35, 36, 37, 38, 39, 40, 45, 48, 49, 53], "name": [2, 26], "load_with_compon": 2, "replac": [2, 77], "paramet": 2, "save": [2, 66, 70, 75, 76, 77, 79], "data": [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 21, 27, 28, 38, 66, 67, 68, 70, 71, 73, 74, 75, 76, 77, 78, 79], "futur": [2, 77], "sensit": [3, 70], "identifi": 3, "entiti": 3, "structur": [3, 17, 70, 73, 75, 76, 77], "unstructur": [3, 54, 55, 68, 70, 73, 75, 79], "train": [3, 70], "an": [3, 5, 67], "extend": 3, "transfer": [3, 70], "learn": [3, 70], "build": [3, 69, 70, 77], "your": [3, 67, 68, 73, 75, 77], "own": [3, 77], "compon": 3, "preprocessor": 3, "postprocessor": [3, 77], "intro": 4, "reader": [4, 5, 7, 73, 78], "automat": 4, "read": 4, "detect": [4, 70], "specifi": [4, 68, 75], "option": [4, 51, 73, 75, 76, 79], "panda": [4, 68, 75], "datafram": [4, 68, 75], "access": 4, "attribut": 4, "check": 4, "file": [4, 68, 75], "type": [4, 76, 79], "is_match": 4, "reload": 4, "after": 4, "alter": 4, "A": 4, "deeper": 4, "dive": 4, "csvdata": [4, 5], "delimit": [4, 68, 75], "quotechar": 4, "header": 4, "data_format": 4, "select": 4, "graphdata": [4, 5], "jsondata": 5, "avrodata": 5, "parquetdata": 5, "textdata": 5, "url": 5, "aw": 5, "s3": 5, "uri": 5, "dataprofil": [6, 72, 74], "modul": [6, 7, 20, 32, 34, 42, 57, 62], "avro": 8, "base": [9, 21, 22, 35, 63], "csv": 10, "util": [12, 25, 29, 31, 44, 52, 56, 59, 60], "filepath": 13, "Or": 13, "buffer": 13, "graph": [14, 41, 58, 66, 67, 68, 75, 76], "json": [15, 46, 47], "parquet": 16, "mixin": 17, "text": [18, 53, 55], "dp": 19, "log": 19, "char": 23, "tf": 23, "cnn": 24, "classif": 25, "report": [25, 43, 57, 74, 75, 76, 78, 79], "process": 28, "regex": [30, 77], "plugin": 32, "decor": 33, "profil": [34, 35, 36, 37, 38, 39, 40, 41, 45, 49, 50, 51, 52, 53, 54, 55, 66, 68, 71, 73, 75, 76, 77, 78, 79], "categor": 36, "compil": 37, "datetim": 39, "float": 40, "helper": [42, 43], "histogram": [44, 67], "int": 45, "decod": 46, "encod": 47, "numer": 48, "stat": 48, "order": [49, 75], "builder": 50, "rng": 60, "set": [61, 75, 77], "valid": [62, 63], "version": [64, 68, 78], "exampl": [65, 67, 74], "basic": [65, 73], "demo": 66, "differ": [66, 75, 76, 79], "conclus": [66, 73], "what": [67, 68, 70, 73], "we": 67, "need": 67, "import": [67, 71], "plot": 67, "from": [67, 69, 70], "structuredprofil": [67, 77], "class": [67, 73], "individu": 67, "intcolumn": 67, "floatcolumn": 67, "miss": 67, "valu": 67, "matrix": 67, "s": [68, 73, 74, 77], "purpos": 68, "support": 68, "format": 68, "get": 68, "start": 68, "updat": [68, 73, 75, 76, 78, 79], "merg": [68, 71, 73, 75, 76, 79], "filetyp": [68, 75], "instal": [69, 74], "snappi": 69, "scratch": [69, 70], "test": [69, 78], "ground": 70, "up": 70, "list": 71, "setup": 71, "run": 71, "usag": [73, 74], "vs": [73, 75], "dataload": 74, "popmon": 74, "how": 74, "comparison": 74, "both": 74, "sampl": 75, "size": 75, "seed": 75, "statist": 75, "descript": 75, "depend": 75, "w": 77, "rule": 77, "adjust": 77, "properti": 77, "roadmap": 78, "histor": 78, "condit": 78, "metric": 78, "space": 78, "time": 78, "suit": 78, "upgrad": 78, "miscellan": 78}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 6, "sphinx.domains.index": 1, 
"sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "nbsphinx": 3, "sphinx": 56}}) \ No newline at end of file diff --git a/docs/0.12.0/html/unstructured_profiler_example.html b/docs/0.12.0/html/unstructured_profiler_example.html new file mode 100644 index 000000000..881deb117 --- /dev/null +++ b/docs/0.12.0/html/unstructured_profiler_example.html @@ -0,0 +1,837 @@ + + + + + + + + + Unstructured Profilers - Data Profiler v0.12.0 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Unstructured Profilers

+

Data profiling is the process of examining a dataset and collecting statistical or informational summaries about that dataset.

+

The Profiler class inside the DataProfiler is designed to generate data profiles, ingesting either a Data class or a Pandas DataFrame.

+

Currently, the Data class supports loading the following file formats:

+
    +
  • Any delimited (CSV, TSV, etc.)

  • +
  • JSON object

  • +
  • Avro

  • +
  • Parquet

  • +
  • Text files

  • +
  • Pandas Series/Dataframe

  • +
+

Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (CSV) or key-value pair (JSON), as well as dataset-wide information such as the number of nulls, duplicates, etc.

+

This example looks specifically at the unstructured data types for unstructured profiling. This means that only text files, lists of strings, single-column pandas dataframes/series, or DataProfiler Data objects in string format will work with the unstructured profiler.
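For instance, a plain list of strings can be profiled directly, provided the unstructured profiler is requested explicitly. A minimal sketch (the sample strings below are purely illustrative):

import json
import dataprofiler as dp

# Illustrative free-form text samples
text_samples = [
    "This is a short free-form text sample.",
    "Unstructured profiling also accepts text files, pandas Series, and Data objects.",
]

# Explicitly request the unstructured profiler for non-file input
profile = dp.Profiler(text_samples, profiler_type='unstructured')

report = profile.report(report_options={"output_format": "compact"})
print(json.dumps(report, indent=4))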

+
+

Reporting

+

One of the primary purposes of the Profiler is to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or for determining which columns could be useful for a given purpose.

+

In terms of reporting, there are multiple reporting options:

+
    +
  • Pretty: Floats are rounded to four decimal places, and lists are shortened.

  • +
  • Compact: Similar to Pretty, but removes detailed statistics.

  • +
  • Serializable: Output is JSON serializable and not prettified.

  • +
  • Flat: Nested output is returned as a flattened dictionary.

  • +
+

The Pretty and Compact reports are the two most commonly used reports and include global_stats and data_stats for the given dataset. global_stats contains overall properties of the data such as samples used and file encoding. data_stats contains specific properties and statistics for each text sample.

+

For unstructured profiles, the report looks like this:

+
"global_stats": {
+    "samples_used": int,
+    "empty_line_count": int,
+    "file_type": string,
+    "encoding": string
+},
+"data_stats": {
+    "data_label": {
+        "entity_counts": {
+            "word_level": dict(int),
+            "true_char_level": dict(int),
+            "postprocess_char_level": dict(int)
+        },
+        "times": dict(float)
+    },
+    "statistics": {
+        "vocab": list(char),
+        "words": list(string),
+        "word_count": dict(int),
+        "times": dict(float)
+    }
+}
+
+
+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+
+try:
+    sys.path.insert(0, '..')
+    import dataprofiler as dp
+except ImportError:
+    import dataprofiler as dp
+
+data_path = "../dataprofiler/tests/data"
+
+# remove extra tf logging
+import tensorflow as tf
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+
+
+
+
+
[ ]:
+
+
+
+data = dp.Data(os.path.join(data_path, "txt/discussion_reddit.txt"))
+profile = dp.Profiler(data)
+
+report  = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
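The example above requested the Pretty report; the other formats are selected the same way through report_options. A brief sketch, assuming the lowercase names "flat" and "serializable" are the accepted values, mirroring "pretty" and "compact":

# Reuses `profile` and `json` from the cells above
flat_report = profile.report(report_options={"output_format": "flat"})  # one flattened dictionary

serializable_report = profile.report(report_options={"output_format": "serializable"})
with open("profile_report.json", "w") as fp:
    json.dump(serializable_report, fp)  # plain JSON output, not prettified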
+
+

Profiler Type

+

It should be noted that, in addition to reading the input data from text files, the DataProfiler also accepts a pandas dataframe, a pandas series, a list, or a Data object (when an unstructured format is selected), provided the Profiler is explicitly chosen as unstructured.

+
+
[ ]:
+
+
+
+# Run data profiler and get the report
+import pandas as pd
+data = dp.Data(os.path.join(data_path, "csv/SchoolDataSmall.csv"), options={"data_format": "records"})
+profile = dp.Profiler(data, profiler_type='unstructured')
+
+report  = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Profiler options

+

The DataProfiler can turn components on and off as needed. This is accomplished via the ProfilerOptions class.

+

For example, a user who doesn’t require vocab count information may want to turn that functionality off.

+

Below, let’s remove the vocab count and set the stop words.

+

A full list of options can be found in the Profiler section of the DataProfiler documentation.

+
+
[ ]:
+
+
+
+data = dp.Data(os.path.join(data_path, "txt/discussion_reddit.txt"))
+
+profile_options = dp.ProfilerOptions()
+
+# Setting multiple options via set
+profile_options.set({ "*.vocab.is_enabled": False, "*.is_case_sensitive": True })
+
+# Set options by assigning them directly
+profile_options.unstructured_options.text.stop_words = ["These", "are", "stop", "words"]
+
+profile = dp.Profiler(data, options=profile_options)
+report  = profile.report(report_options={"output_format": "compact"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Updating Profiles

+

Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update existing profiles. For an update to work, the schema (columns / keys) of the new data must match that of the original profile.

+
+
[ ]:
+
+
+
+# Load and profile a text file
+data = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+profile = dp.Profiler(data)
+
+# Update the profile with new data:
+new_data = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+profile.update_profile(new_data)
+
+# Take a peek at the data
+print(data.data)
+print(new_data.data)
+
+# Report the compact version of the profile
+report  = profile.report(report_options={"output_format": "compact"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Merging Profiles

+

Merging profiles is an alternative method for updating profiles. In particular, multiple profiles can be generated separately, then added together with a simple + operation: profile3 = profile1 + profile2

+
+
[ ]:
+
+
+
+# Load a text file
+data1 = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+profile1 = dp.Profiler(data1)
+
+# Load another text file with the same structure
+data2 = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+profile2 = dp.Profiler(data2)
+
+# Merge the profiles
+profile3 = profile1 + profile2
+
+# Report the compact version of the profile
+report  = profile3.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+

As you can see, the update_profile function and the + operator behave similarly. The + operator is especially important because it’s possible to save and load profiles, which we cover next.

+
+
+

Differences in Data

+

Profile differencing can be applied to both structured and unstructured datasets.

+

Such reports can provide details on the differences between training and validation data, as in this pseudo example:

+
profiler_training = dp.Profiler(training_data)
+profiler_testing = dp.Profiler(testing_data)
+
+validation_report = profiler_training.diff(profiler_testing)
+
+
+
+
[ ]:
+
+
+
+from pprint import pprint
+
+# unstructured differences example
+data_split_differences = profile1.diff(profile2)
+pprint(data_split_differences)
+
+
+
+
+
+

Saving and Loading a Profile

+

Not only can the Profiler create and update profiles, it’s also possible to save, load, and then manipulate them.

+
+
[ ]:
+
+
+
+# Load data
+data = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+
+# Generate a profile
+profile = dp.Profiler(data)
+
+# Save a profile to disk for later (saves as a pickle file)
+profile.save(filepath="my_profile.pkl")
+
+# Load a profile from disk
+loaded_profile = dp.Profiler.load("my_profile.pkl")
+
+# Report the compact version of the profile
+report = profile.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+

With the ability to save and load profiles, profiles can be generated on multiple machines and then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more.

+
+
[ ]:
+
+
+
+# Load multiple files via the Data class
+filenames = ["txt/sentence-3x.txt",
+             "txt/sentence.txt"]
+data_objects = []
+for filename in filenames:
+    data_objects.append(dp.Data(os.path.join(data_path, filename)))
+
+print(data_objects)
+# Generate and save profiles
+for i in range(len(data_objects)):
+    profile = dp.Profiler(data_objects[i])
+    report = profile.report(report_options={"output_format":"compact"})
+    print(json.dumps(report, indent=4))
+    profile.save(filepath="data-"+str(i)+".pkl")
+
+
+# Load profiles and add them together
+profile = None
+for i in range(len(data_objects)):
+    if profile is None:
+        profile = dp.Profiler.load("data-"+str(i)+".pkl")
+    else:
+        profile += dp.Profiler.load("data-"+str(i)+".pkl")
+
+
+# Report the compact version of the profile
+report = profile.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.12.0/html/unstructured_profiler_example.ipynb b/docs/0.12.0/html/unstructured_profiler_example.ipynb new file mode 100644 index 000000000..9ab754cc7 --- /dev/null +++ b/docs/0.12.0/html/unstructured_profiler_example.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37ca393", + "metadata": {}, + "source": [ + "# Unstructured Profilers" + ] + }, + { + "cell_type": "markdown", + "id": "ff9bd095", + "metadata": {}, + "source": [ + "**Data profiling** - *is the process of examining a dataset and collecting statistical or informational summaries about said dataset.*\n", + "\n", + "The Profiler class inside the DataProfiler is designed to generate *data profiles* via the Profiler class, which ingests either a Data class or a Pandas DataFrame. \n", + "\n", + "Currently, the Data class supports loading the following file formats:\n", + "\n", + "* Any delimited (CSV, TSV, etc.)\n", + "* JSON object\n", + "* Avro\n", + "* Parquet\n", + "* Text files\n", + "* Pandas Series/Dataframe\n", + "\n", + "Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value (JSON) store as well as dataset wide information, such as the number of nulls, duplicates, etc.\n", + "\n", + "This example will look at specifically the unstructured data types for unstructured profiling. This means that only text files, lists of strings, single column pandas dataframes/series, or DataProfile Data objects in string format will work with the unstructured profiler. " + ] + }, + { + "cell_type": "markdown", + "id": "de58b9c4", + "metadata": {}, + "source": [ + "## Reporting" + ] + }, + { + "cell_type": "markdown", + "id": "8001185a", + "metadata": {}, + "source": [ + "One of the primary purposes of the Profiler are to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.\n", + "\n", + "In terms of reporting, there are multiple reporting options:\n", + "\n", + "* **Pretty**: Floats are rounded to four decimal places, and lists are shortened.\n", + "* **Compact**: Similar to pretty, but removes detailed statistics\n", + "* **Serializable**: Output is json serializable and not prettified\n", + "* **Flat**: Nested Output is returned as a flattened dictionary\n", + "\n", + "The **Pretty** and **Compact** reports are the two most commonly used reports and includes `global_stats` and `data_stats` for the given dataset. `global_stats` contains overall properties of the data such as samples used and file encoding. 
`data_stats` contains specific properties and statistics for each text sample.\n", + "\n", + "For unstructured profiles, the report looks like this:\n", + "\n", + "```\n", + "\"global_stats\": {\n", + " \"samples_used\": int,\n", + " \"empty_line_count\": int,\n", + " \"file_type\": string,\n", + " \"encoding\": string\n", + "},\n", + "\"data_stats\": {\n", + " \"data_label\": {\n", + " \"entity_counts\": {\n", + " \"word_level\": dict(int),\n", + " \"true_char_level\": dict(int),\n", + " \"postprocess_char_level\": dict(int)\n", + " },\n", + " \"times\": dict(float)\n", + " },\n", + " \"statistics\": {\n", + " \"vocab\": list(char),\n", + " \"words\": list(string),\n", + " \"word_count\": dict(int),\n", + " \"times\": dict(float)\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcb5447", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"\n", + "\n", + "# remove extra tf loggin\n", + "import tensorflow as tf\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7fc2df6", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"txt/discussion_reddit.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "4d183992", + "metadata": {}, + "source": [ + "## Profiler Type" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec39d2", + "metadata": {}, + "source": [ + "It should be noted, in addition to reading the input data from text files, DataProfiler allows the input data as a pandas dataframe, a pandas series, a list, and Data objects (when an unstructured format is selected) if the Profiler is explicitly chosen as unstructured." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29737f25", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "import pandas as pd\n", + "data = dp.Data(os.path.join(data_path, \"csv/SchoolDataSmall.csv\"), options={\"data_format\": \"records\"})\n", + "profile = dp.Profiler(data, profiler_type='unstructured')\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "fe02ad64", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "40804cc9", + "metadata": {}, + "source": [ + "The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the `ProfilerOptions` class.\n", + "\n", + "For example, if a user doesn't require vocab count information they may desire to turn off the word count functionality.\n", + "\n", + "Below, let's remove the vocab count and set the stop words. \n", + "\n", + "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler/profile_options.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d25d899", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"txt/discussion_reddit.txt\"))\n", + "\n", + "profile_options = dp.ProfilerOptions()\n", + "\n", + "# Set multiple options via set\n", + "profile_options.set({ \"*.vocab.is_enabled\": False, \"*.is_case_sensitive\": True })\n", + "\n", + "# Set options by assigning them directly\n", + "profile_options.unstructured_options.text.stop_words = [\"These\", \"are\", \"stop\", \"words\"]\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "2052415a", + "metadata": {}, + "source": [ + "## Updating Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "7e02f746", + "metadata": {}, + "source": [ + "Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update profiles. For an update to work, the schema (columns / keys) must match." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab8022f", + "metadata": {}, + "outputs": [], + "source": [ + "# Load and profile a text file\n", + "data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Update the profile with new data:\n", + "new_data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile.update_profile(new_data)\n", + "\n", + "# Take a peek at the data\n", + "print(data.data)\n", + "print(new_data.data)\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "66ec6dc5", + "metadata": {}, + "source": [ + "## Merging Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "e2265fe9", + "metadata": {}, + "source": [ + "Merging profiles is an alternative method for updating profiles. In particular, multiple profiles can be generated separately, then added together with a simple `+` command: `profile3 = profile1 + profile2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc68ca07", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a text file\n", + "data1 = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile1 = dp.Profiler(data1)\n", + "\n", + "# Load another text file\n", + "data2 = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile2 = dp.Profiler(data2)\n", + "\n", + "# Merge the profiles\n", + "profile3 = profile1 + profile2\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile3.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "7ea07dc6", + "metadata": {}, + "source": [ + "As you can see, the `update_profile` function and the `+` operator function similarly. The reason the `+` operator is important is that it's possible to *save and load profiles*, which we cover next." + ] + }, + { + "cell_type": "markdown", + "id": "4704961a", + "metadata": {}, + "source": [ + "## Differences in Data\n", + "Difference reports can be generated for both structured and unstructured datasets. 
\n", + "\n", + "Such reports can provide details on the differences between training and validation data like in this pseudo example:\n", + "```python\n", + "profiler_training = dp.Profiler(training_data)\n", + "profiler_testing = dp.Profiler(testing_data)\n", + "\n", + "validation_report = profiler_training.diff(profiler_testing)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58f92c1b", + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "# unstructured differences example\n", + "data_split_differences = profile1.diff(profile2)\n", + "pprint(data_split_differences)" + ] + }, + { + "cell_type": "markdown", + "id": "30868000", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "f2858072", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load then manipulate profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad9ca57", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "8f9859c2", + "metadata": {}, + "source": [ + "With the ability to save and load profiles, profiles can be generated via multiple machines then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3571f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a multiple files via the Data class\n", + "filenames = [\"txt/sentence-3x.txt\",\n", + " \"txt/sentence.txt\"]\n", + "data_objects = []\n", + "for filename in filenames:\n", + " data_objects.append(dp.Data(os.path.join(data_path, filename)))\n", + "\n", + "print(data_objects)\n", + "# Generate and save profiles\n", + "for i in range(len(data_objects)):\n", + " profile = dp.Profiler(data_objects[i])\n", + " report = profile.report(report_options={\"output_format\":\"compact\"})\n", + " print(json.dumps(report, indent=4))\n", + " profile.save(filepath=\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Load profiles and add them together\n", + "profile = None\n", + "for i in range(len(data_objects)):\n", + " if profile is None:\n", + " profile = dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + " else:\n", + " profile += dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/index.rst b/docs/source/index.rst index 5002a8449..a20aa5ff0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -476,6 +476,7 @@ In addition, it utilizes only the first 10,000 rows. Versions ======== +* `0.12.0`_ * `0.11.0`_ * `0.10.9`_ * `0.10.8`_ @@ -600,3 +601,5 @@ Versions .. _0.11.0: ../../0.11.0/html/index.html +.. _0.12.0: ../../0.12.0/html/index.html + diff --git a/index.html b/index.html index 09860e3a9..fb51eaca9 100644 --- a/index.html +++ b/index.html @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/profiler_options.html b/profiler_options.html index b9817566e..831f653ff 100644 --- a/profiler_options.html +++ b/profiler_options.html @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file