From b60fb74e777a5ec62b5fc4621f24b865a455a336 Mon Sep 17 00:00:00 2001 From: "Xue, Chendi" Date: Thu, 21 Dec 2023 23:29:15 +0000 Subject: [PATCH 1/2] add copyright to all new components Signed-off-by: Xue, Chendi --- RecDP/pyrecdp/LLM/TextPipeline.py | 16 ++++ RecDP/pyrecdp/LLM/__init__.py | 16 ++++ RecDP/pyrecdp/__init__.py | 16 ++++ RecDP/pyrecdp/autofe/AutoFE.py | 16 ++++ RecDP/pyrecdp/autofe/FeatureEstimator.py | 16 ++++ RecDP/pyrecdp/autofe/FeatureProfiler.py | 16 ++++ RecDP/pyrecdp/autofe/FeatureWrangler.py | 16 ++++ RecDP/pyrecdp/autofe/RelationalBuilder.py | 16 ++++ RecDP/pyrecdp/autofe/TabularPipeline.py | 16 ++++ RecDP/pyrecdp/autofe/__init__.py | 16 ++++ RecDP/pyrecdp/core/__init__.py | 16 ++++ RecDP/pyrecdp/core/cache_utils.py | 16 ++++ RecDP/pyrecdp/core/class_utils.py | 16 ++++ RecDP/pyrecdp/core/dataframe.py | 16 ++++ RecDP/pyrecdp/core/di_graph.py | 16 ++++ RecDP/pyrecdp/core/import_utils.py | 16 ++++ RecDP/pyrecdp/core/model_utils.py | 16 ++++ RecDP/pyrecdp/core/parallel_iterator.py | 16 ++++ RecDP/pyrecdp/core/pipeline.py | 16 ++++ RecDP/pyrecdp/core/registry.py | 16 ++++ RecDP/pyrecdp/core/schema.py | 16 ++++ RecDP/pyrecdp/core/utils.py | 16 ++++ RecDP/pyrecdp/datasets/CESM_breast_cancer.py | 79 ------------------- RecDP/pyrecdp/datasets/__init__.py | 16 ++++ .../pyrecdp/datasets/amazon_product_review.py | 19 ----- RecDP/pyrecdp/datasets/base_api.py | 16 ++++ RecDP/pyrecdp/datasets/download.py | 16 ++++ RecDP/pyrecdp/datasets/ibm_fraud_detect.py | 17 ---- RecDP/pyrecdp/datasets/nyc_taxi.py | 30 ------- RecDP/pyrecdp/datasets/outbrain.py | 22 ------ RecDP/pyrecdp/datasets/pretrained.py | 12 --- RecDP/pyrecdp/datasets/twitter_recsys.py | 13 --- RecDP/pyrecdp/primitives/estimators/base.py | 16 ++++ .../pyrecdp/primitives/estimators/lightgbm.py | 16 ++++ .../pyrecdp/primitives/generators/__init__.py | 16 ++++ RecDP/pyrecdp/primitives/generators/base.py | 16 ++++ RecDP/pyrecdp/primitives/generators/binned.py | 16 ++++ .../pyrecdp/primitives/generators/category.py | 16 ++++ .../pyrecdp/primitives/generators/datetime.py | 16 ++++ RecDP/pyrecdp/primitives/generators/drop.py | 16 ++++ RecDP/pyrecdp/primitives/generators/encode.py | 16 ++++ .../generators/feature_transform.py | 16 ++++ .../generators/featuretools_adaptor.py | 16 ++++ RecDP/pyrecdp/primitives/generators/fillna.py | 16 ++++ .../pyrecdp/primitives/generators/geograph.py | 16 ++++ .../primitives/generators/group_category.py | 16 ++++ RecDP/pyrecdp/primitives/generators/name.py | 16 ++++ RecDP/pyrecdp/primitives/generators/nlp.py | 16 ++++ .../pyrecdp/primitives/generators/relation.py | 16 ++++ RecDP/pyrecdp/primitives/generators/type.py | 16 ++++ RecDP/pyrecdp/primitives/llmutils/classify.py | 16 ++++ RecDP/pyrecdp/primitives/llmutils/convert.py | 16 ++++ .../primitives/llmutils/decontaminate.py | 16 ++++ .../primitives/llmutils/diversity_analysis.py | 16 ++++ .../primitives/llmutils/document/extractor.py | 16 ++++ .../primitives/llmutils/document/reader.py | 16 ++++ .../primitives/llmutils/document/schema.py | 16 ++++ .../primitives/llmutils/document/writer.py | 16 ++++ .../primitives/llmutils/document_extractor.py | 16 ++++ RecDP/pyrecdp/primitives/llmutils/filter.py | 16 ++++ .../primitives/llmutils/global_dedup.py | 16 ++++ .../primitives/llmutils/global_hash.py | 16 ++++ .../llmutils/index_based_reduction.py | 16 ++++ .../primitives/llmutils/language_identify.py | 16 ++++ .../pyrecdp/primitives/llmutils/near_dedup.py | 16 ++++ .../primitives/llmutils/perplexity_score.py | 16 ++++ .../pii/detect/name_password_detection.py | 16 ++++ .../llmutils/pii/detect/phones_detection.py | 16 ++++ .../primitives/llmutils/pii/detect/utils.py | 16 ++++ .../primitives/llmutils/pii/pii_detection.py | 16 ++++ .../primitives/llmutils/pii/pii_redaction.py | 16 ++++ .../pyrecdp/primitives/llmutils/pii_remove.py | 16 ++++ .../primitives/llmutils/pipeline_hpo.py | 16 ++++ .../primitives/llmutils/profanity_filter.py | 16 ++++ .../primitives/llmutils/qa_generate.py | 16 ++++ .../primitives/llmutils/quality_classifier.py | 16 ++++ .../primitives/llmutils/rag_data_extractor.py | 16 ++++ .../primitives/llmutils/rouge_score_dedup.py | 16 ++++ .../primitives/llmutils/sentence_split.py | 16 ++++ .../primitives/llmutils/shrink_jsonl.py | 16 ++++ .../pyrecdp/primitives/llmutils/text_fixer.py | 16 ++++ .../primitives/llmutils/text_normalization.py | 16 ++++ .../primitives/llmutils/text_to_jsonl.py | 16 ++++ .../tokenize_and_save/count_tokens.py | 16 ++++ .../tokenize_and_save/merge_datasets.py | 16 ++++ .../llmutils/tokenize_and_save/run-dp.sh | 14 ++++ .../tokenize_and_save/tokenize_and_save.py | 16 ++++ .../primitives/llmutils/toxicity_score.py | 16 ++++ RecDP/pyrecdp/primitives/llmutils/utils.py | 16 ++++ RecDP/pyrecdp/primitives/operations/base.py | 16 ++++ .../pyrecdp/primitives/operations/category.py | 16 ++++ .../operations/constant/SPECIAL_CHARACTERS.py | 16 ++++ .../operations/constant/__init__.py | 16 ++++ RecDP/pyrecdp/primitives/operations/custom.py | 16 ++++ RecDP/pyrecdp/primitives/operations/data.py | 16 ++++ .../primitives/operations/dataframe.py | 16 ++++ .../primitives/operations/doc_loader.py | 16 ++++ RecDP/pyrecdp/primitives/operations/drop.py | 16 ++++ RecDP/pyrecdp/primitives/operations/encode.py | 16 ++++ .../operations/featuretools_adaptor.py | 16 ++++ RecDP/pyrecdp/primitives/operations/fillna.py | 16 ++++ .../operations/filter/alphanumeric_filter.py | 16 ++++ .../filter/average_line_length_filter.py | 16 ++++ .../operations/filter/badwords_filter.py | 16 ++++ .../primitives/operations/filter/base.py | 16 ++++ .../operations/filter/length_filter.py | 16 ++++ .../filter/maximum_line_length_filter.py | 16 ++++ .../operations/filter/perplexity_filter.py | 16 ++++ .../operations/filter/profanity_filter.py | 16 ++++ .../filter/special_characters_filter.py | 16 ++++ .../filter/text_gopherqualityfilter.py | 16 ++++ .../operations/filter/token_num_filter.py | 16 ++++ .../operations/filter/url_filter.py | 16 ++++ .../operations/filter/word_num_filter.py | 16 ++++ .../filter/word_repetition_filter.py | 16 ++++ .../pyrecdp/primitives/operations/geograph.py | 16 ++++ .../primitives/operations/logging_utils.py | 16 ++++ RecDP/pyrecdp/primitives/operations/merge.py | 16 ++++ RecDP/pyrecdp/primitives/operations/name.py | 16 ++++ .../primitives/operations/random_select.py | 16 ++++ .../primitives/operations/table_summary.py | 16 ++++ .../primitives/operations/text_bytesize.py | 16 ++++ .../operations/text_compare_dedup.py | 16 ++++ .../operations/text_contraction_remove.py | 16 ++++ .../primitives/operations/text_custom.py | 16 ++++ .../operations/text_deduplication.py | 16 ++++ .../operations/text_diversityindicate.py | 16 ++++ .../primitives/operations/text_fixer.py | 16 ++++ .../primitives/operations/text_ingestion.py | 16 ++++ .../operations/text_language_identify.py | 16 ++++ .../primitives/operations/text_normalize.py | 16 ++++ .../operations/text_perplexity_score.py | 16 ++++ .../primitives/operations/text_pii_remove.py | 16 ++++ .../primitives/operations/text_prompt.py | 16 ++++ .../operations/text_qualityscorer.py | 16 ++++ .../primitives/operations/text_reader.py | 16 ++++ .../operations/text_spell_correct.py | 16 ++++ .../primitives/operations/text_split.py | 16 ++++ .../primitives/operations/text_to_qa.py | 16 ++++ .../primitives/operations/text_toxicity.py | 16 ++++ .../primitives/operations/text_writer.py | 16 ++++ RecDP/pyrecdp/primitives/operations/tuple.py | 16 ++++ RecDP/pyrecdp/primitives/operations/type.py | 16 ++++ RecDP/pyrecdp/primitives/operations/utils.py | 16 ++++ .../primitives/profilers/cluster_infer.py | 16 ++++ .../profilers/distribution_infer.py | 16 ++++ RecDP/pyrecdp/primitives/profilers/statics.py | 16 ++++ .../primitives/profilers/time_series_infer.py | 16 ++++ .../primitives/profilers/type_infer.py | 16 ++++ .../spark_data_processor/data_processor.py | 16 ++++ .../spark_data_processor/encoder.py | 16 ++++ .../start-spark-standalone.sh | 14 ++++ .../primitives/spark_data_processor/utils.py | 16 ++++ .../primitives/tabutils/data_preprocess.py | 16 ++++ .../tabutils/feature_normal_transform.py | 16 ++++ .../primitives/tabutils/segmentation.py | 16 ++++ RecDP/pyrecdp/primitives/tabutils/utils.py | 16 ++++ RecDP/pyrecdp/widgets/BaseWidget.py | 16 ++++ RecDP/pyrecdp/widgets/TabWidget.py | 16 ++++ RecDP/pyrecdp/widgets/TableViewWidget.py | 16 ++++ RecDP/setup.py | 64 +++++++-------- e2eAIOK/deltatuner/deltatuner/deltatuner.py | 16 ++++ .../deltatuner/deltatuner/deltatuner_args.py | 16 ++++ .../deltatuner/deltatuner/deltatuner_model.py | 16 ++++ e2eAIOK/deltatuner/deltatuner/mapping.py | 16 ++++ .../deltatuner/scores/compute_de_score.py | 16 ++++ .../deltatuner/scores/transformer_proxy.py | 16 ++++ .../scores/transformer_proxy_ssf.py | 16 ++++ e2eAIOK/deltatuner/deltatuner/scores/utils.py | 16 ++++ .../deltatuner/search/BaseSearchEngine.py | 16 ++++ .../search/EvolutionarySearchEngine.py | 16 ++++ .../deltatuner/search/SearchEngineFactory.py | 16 ++++ e2eAIOK/deltatuner/deltatuner/search/utils.py | 16 ++++ .../deltatuner/tuner/deltatuner_lora.py | 16 ++++ .../deltatuner/tuner/deltatuner_ssf.py | 16 ++++ e2eAIOK/deltatuner/deltatuner/utils/config.py | 16 ++++ 176 files changed, 2717 insertions(+), 223 deletions(-) delete mode 100644 RecDP/pyrecdp/datasets/CESM_breast_cancer.py delete mode 100644 RecDP/pyrecdp/datasets/amazon_product_review.py delete mode 100644 RecDP/pyrecdp/datasets/ibm_fraud_detect.py delete mode 100644 RecDP/pyrecdp/datasets/nyc_taxi.py delete mode 100644 RecDP/pyrecdp/datasets/outbrain.py delete mode 100644 RecDP/pyrecdp/datasets/pretrained.py delete mode 100644 RecDP/pyrecdp/datasets/twitter_recsys.py diff --git a/RecDP/pyrecdp/LLM/TextPipeline.py b/RecDP/pyrecdp/LLM/TextPipeline.py index abf4ac947..9be961a0a 100644 --- a/RecDP/pyrecdp/LLM/TextPipeline.py +++ b/RecDP/pyrecdp/LLM/TextPipeline.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.di_graph import DiGraph from pyrecdp.core.pipeline import BasePipeline from pyrecdp.primitives.operations import Operation, BaseOperation diff --git a/RecDP/pyrecdp/LLM/__init__.py b/RecDP/pyrecdp/LLM/__init__.py index f2aa37fd0..b8e21cd9c 100644 --- a/RecDP/pyrecdp/LLM/__init__.py +++ b/RecDP/pyrecdp/LLM/__init__.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.import_utils import check_availability_and_install, list_requirements import os, pathlib diff --git a/RecDP/pyrecdp/__init__.py b/RecDP/pyrecdp/__init__.py index 7bdf9a1a2..148c926ba 100644 --- a/RecDP/pyrecdp/__init__.py +++ b/RecDP/pyrecdp/__init__.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os, sys from pathlib import Path try: diff --git a/RecDP/pyrecdp/autofe/AutoFE.py b/RecDP/pyrecdp/autofe/AutoFE.py index 1746825b3..617018ee9 100644 --- a/RecDP/pyrecdp/autofe/AutoFE.py +++ b/RecDP/pyrecdp/autofe/AutoFE.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import logging from pyrecdp.core.utils import Timer, infer_problem_type from pyrecdp.core.dataframe import DataFrameAPI diff --git a/RecDP/pyrecdp/autofe/FeatureEstimator.py b/RecDP/pyrecdp/autofe/FeatureEstimator.py index ecbe20104..3386176cc 100644 --- a/RecDP/pyrecdp/autofe/FeatureEstimator.py +++ b/RecDP/pyrecdp/autofe/FeatureEstimator.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.generators import * from pyrecdp.autofe.TabularPipeline import TabularPipeline import logging diff --git a/RecDP/pyrecdp/autofe/FeatureProfiler.py b/RecDP/pyrecdp/autofe/FeatureProfiler.py index 0a373aaae..f4c5703af 100644 --- a/RecDP/pyrecdp/autofe/FeatureProfiler.py +++ b/RecDP/pyrecdp/autofe/FeatureProfiler.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from jinja2 import Environment, PackageLoader from pyrecdp.primitives.profilers import * from pyrecdp.primitives.generators import * diff --git a/RecDP/pyrecdp/autofe/FeatureWrangler.py b/RecDP/pyrecdp/autofe/FeatureWrangler.py index 9b235a6a0..54d46d72f 100644 --- a/RecDP/pyrecdp/autofe/FeatureWrangler.py +++ b/RecDP/pyrecdp/autofe/FeatureWrangler.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.generators import * from pyrecdp.primitives.profilers import * from pyrecdp.autofe.TabularPipeline import TabularPipeline diff --git a/RecDP/pyrecdp/autofe/RelationalBuilder.py b/RecDP/pyrecdp/autofe/RelationalBuilder.py index ed6454157..d001a6821 100644 --- a/RecDP/pyrecdp/autofe/RelationalBuilder.py +++ b/RecDP/pyrecdp/autofe/RelationalBuilder.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.generators import * from pyrecdp.primitives.profilers import * from pyrecdp.autofe.TabularPipeline import TabularPipeline diff --git a/RecDP/pyrecdp/autofe/TabularPipeline.py b/RecDP/pyrecdp/autofe/TabularPipeline.py index 49668b7dc..aeb9f3105 100644 --- a/RecDP/pyrecdp/autofe/TabularPipeline.py +++ b/RecDP/pyrecdp/autofe/TabularPipeline.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.generators import * from pyrecdp.core.schema import DataFrameSchema from pyrecdp.core.di_graph import DiGraph diff --git a/RecDP/pyrecdp/autofe/__init__.py b/RecDP/pyrecdp/autofe/__init__.py index 824b8d32f..ec2892f1f 100644 --- a/RecDP/pyrecdp/autofe/__init__.py +++ b/RecDP/pyrecdp/autofe/__init__.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.import_utils import check_availability_and_install, list_requirements import os, pathlib diff --git a/RecDP/pyrecdp/core/__init__.py b/RecDP/pyrecdp/core/__init__.py index ace8ff5fc..9c50b39e0 100644 --- a/RecDP/pyrecdp/core/__init__.py +++ b/RecDP/pyrecdp/core/__init__.py @@ -1 +1,17 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.data_processor import DataProcessor as SparkDataProcessor \ No newline at end of file diff --git a/RecDP/pyrecdp/core/cache_utils.py b/RecDP/pyrecdp/core/cache_utils.py index 91a048ce7..b95c8b0bd 100644 --- a/RecDP/pyrecdp/core/cache_utils.py +++ b/RecDP/pyrecdp/core/cache_utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os # Default cache location diff --git a/RecDP/pyrecdp/core/class_utils.py b/RecDP/pyrecdp/core/class_utils.py index 5791412f8..cd740cd6a 100644 --- a/RecDP/pyrecdp/core/class_utils.py +++ b/RecDP/pyrecdp/core/class_utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + def new_instance(module, clazz, **clazz_kwargs): import importlib diff --git a/RecDP/pyrecdp/core/dataframe.py b/RecDP/pyrecdp/core/dataframe.py index 31bcebdfe..997f4a3e5 100644 --- a/RecDP/pyrecdp/core/dataframe.py +++ b/RecDP/pyrecdp/core/dataframe.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import pandas as pd class DataFrameAPI: diff --git a/RecDP/pyrecdp/core/di_graph.py b/RecDP/pyrecdp/core/di_graph.py index 5fd16596f..87c2554d1 100644 --- a/RecDP/pyrecdp/core/di_graph.py +++ b/RecDP/pyrecdp/core/di_graph.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from collections import defaultdict class Graph: diff --git a/RecDP/pyrecdp/core/import_utils.py b/RecDP/pyrecdp/core/import_utils.py index 37ce26428..df0dbe834 100644 --- a/RecDP/pyrecdp/core/import_utils.py +++ b/RecDP/pyrecdp/core/import_utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os from typing import Optional import pip diff --git a/RecDP/pyrecdp/core/model_utils.py b/RecDP/pyrecdp/core/model_utils.py index c95070deb..7cd0731f3 100644 --- a/RecDP/pyrecdp/core/model_utils.py +++ b/RecDP/pyrecdp/core/model_utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + """ This code is adapted from Alibaba data-juicer https://github.com/alibaba/data-juicer/blob/main/data_juicer/utils/model_utils.py """ diff --git a/RecDP/pyrecdp/core/parallel_iterator.py b/RecDP/pyrecdp/core/parallel_iterator.py index dec15fd4a..9b36bd63a 100644 --- a/RecDP/pyrecdp/core/parallel_iterator.py +++ b/RecDP/pyrecdp/core/parallel_iterator.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor import multiprocessing from multiprocessing.pool import ThreadPool diff --git a/RecDP/pyrecdp/core/pipeline.py b/RecDP/pyrecdp/core/pipeline.py index 1816f6406..4e9081823 100644 --- a/RecDP/pyrecdp/core/pipeline.py +++ b/RecDP/pyrecdp/core/pipeline.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.di_graph import DiGraph from pyrecdp.primitives.operations import Operation import logging diff --git a/RecDP/pyrecdp/core/registry.py b/RecDP/pyrecdp/core/registry.py index 87a35c037..993e653f3 100644 --- a/RecDP/pyrecdp/core/registry.py +++ b/RecDP/pyrecdp/core/registry.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + class Registry(object): def __init__(self, registry_name): self._name = registry_name diff --git a/RecDP/pyrecdp/core/schema.py b/RecDP/pyrecdp/core/schema.py index 815e50025..4f764cf42 100644 --- a/RecDP/pyrecdp/core/schema.py +++ b/RecDP/pyrecdp/core/schema.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from woodwork.column_schema import ColumnSchema from pandas import StringDtype from pyrecdp.core.utils import is_text_series, is_tuple, is_integer_convertable diff --git a/RecDP/pyrecdp/core/utils.py b/RecDP/pyrecdp/core/utils.py index e278841c4..34338bcda 100644 --- a/RecDP/pyrecdp/core/utils.py +++ b/RecDP/pyrecdp/core/utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import numpy as np import pandas as pd import copy diff --git a/RecDP/pyrecdp/datasets/CESM_breast_cancer.py b/RecDP/pyrecdp/datasets/CESM_breast_cancer.py deleted file mode 100644 index f4ab6bfe7..000000000 --- a/RecDP/pyrecdp/datasets/CESM_breast_cancer.py +++ /dev/null @@ -1,79 +0,0 @@ -from .base_api import base_api - -def try_finall(regex, input_str): - import re - ret = re.findall(regex, input_str) - if len(ret) == 0: - return None - else: - return ret[0] - -def extract_info_from_report(report_content): - method = None - side = None - p_id = None - ret = {} - lines = [i for i in report_content.split('\n') if i != ''] - for t in lines: - if 'PATIENT NO.' in t: - p_id = try_finall("\d+", t) - method = None - side = None - elif 'SOFT TISSUE MAMMOGRAPHY REVEALED:' in t: - method = 'DM' - elif 'OPINION:' in t: - method = 'OP' - elif 'CONTRAST ENHANCED SPECTRAL MAMMOGRAPHY REVEALED:' in t: - method = 'CESM' - elif 'Right Breast' in t: - side = "R" - elif 'Left Breast' in t: - side = "L" - else: - if side is None or method is None: - continue - cur_key = f"{side}_{method}" - if cur_key not in ret: - ret[cur_key] = {} - ret[cur_key]['Side'] = side - ret[cur_key]['Patient_ID'] = int(p_id) - ret[cur_key]['Type'] = method - ret[cur_key]['symptoms'] = t - - return ret - -class CESM_breast_cancer(base_api): - def __init__(self, scale = 'full'): - super().__init__() - file_list = { - 'medical_report': "https://wiki.cancerimagingarchive.net/download/attachments/109379611/Medical%20reports%20for%20cases%20.zip?api=v2", - 'manual_annotations': "https://wiki.cancerimagingarchive.net/download/attachments/109379611/Radiology%20manual%20annotations.xlsx?api=v2" - } - - self.saved_path = dict() - self.saved_path['medical_report'] = self.download_url("Medical reports for cases", file_list['medical_report'], unzip = True) - self.saved_path['manual_annotations'] = self.download_url("radiology_manual_annotations.xlsx", file_list['manual_annotations']) - - def to_pandas(self, nrows = None): - import pandas as pd - import os, docx2txt - ret = {} - ret['manual_annotations'] = pd.read_excel(self.saved_path['manual_annotations'], sheet_name="all") - #ret['medical_report'] = {} - medical_report_content = [] - medical_extracted = [] - for f in os.listdir(self.saved_path['medical_report']): - try: - file_content = docx2txt.process(os.path.join(self.saved_path['medical_report'], f)) - except Exception as e: - file_content = "" - Warning(e) - medical_report_content.append(file_content) - ext = extract_info_from_report(file_content) - for side, line in ext.items(): - medical_extracted.append(line) - - ret['medical_report'] = pd.DataFrame.from_records(medical_extracted) - - return ret - \ No newline at end of file diff --git a/RecDP/pyrecdp/datasets/__init__.py b/RecDP/pyrecdp/datasets/__init__.py index 840337d3b..3204bd45b 100644 --- a/RecDP/pyrecdp/datasets/__init__.py +++ b/RecDP/pyrecdp/datasets/__init__.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.datasets.download import * from pyrecdp.datasets.nyc_taxi import * from pyrecdp.datasets.twitter_recsys import * diff --git a/RecDP/pyrecdp/datasets/amazon_product_review.py b/RecDP/pyrecdp/datasets/amazon_product_review.py deleted file mode 100644 index 239adea89..000000000 --- a/RecDP/pyrecdp/datasets/amazon_product_review.py +++ /dev/null @@ -1,19 +0,0 @@ -from .base_api import base_api - -class amazon_product_review(base_api): - def __init__(self): - super().__init__() - name = "amazon_reviews_us_Books.tsv" - url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_00.tsv.gz" - self.saved_path = self.download_url(name, url, unzip = True) - - def to_pandas(self, nrows = None): - import pandas as pd - df = pd.read_table(self.saved_path, on_bad_lines='skip') - - # fix train - df = df.loc[df['star_rating'].apply(lambda x: len(str(x)) <= 3)] - df['star_rating'] = df['star_rating'].astype(float) - - return df - \ No newline at end of file diff --git a/RecDP/pyrecdp/datasets/base_api.py b/RecDP/pyrecdp/datasets/base_api.py index c9a859f69..f379badd2 100644 --- a/RecDP/pyrecdp/datasets/base_api.py +++ b/RecDP/pyrecdp/datasets/base_api.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os, requests from tqdm import tqdm import shutil diff --git a/RecDP/pyrecdp/datasets/download.py b/RecDP/pyrecdp/datasets/download.py index 074f46ddf..21a43b12f 100644 --- a/RecDP/pyrecdp/datasets/download.py +++ b/RecDP/pyrecdp/datasets/download.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base_api import base_api class download(base_api): diff --git a/RecDP/pyrecdp/datasets/ibm_fraud_detect.py b/RecDP/pyrecdp/datasets/ibm_fraud_detect.py deleted file mode 100644 index 125a18bae..000000000 --- a/RecDP/pyrecdp/datasets/ibm_fraud_detect.py +++ /dev/null @@ -1,17 +0,0 @@ -from .base_api import base_api - -class ibm_fraud_detect(base_api): - def __init__(self, scale = 'full'): - super().__init__() - if scale == 'test': - raise NotImplementedError("ibm_fraud_detect test dataset is not created yet") - else: - url = "https://huggingface.co/datasets/Chendi/ibm_transactions/resolve/main/transactions.tgz" - self.saved_path = self.download_url("card_transaction.v1.csv", url, unzip = True) - - def to_pandas(self, scale = 'full'): - import pandas as pd - if scale == 'test': - return pd.read_csv(self.saved_path, nrows = 100000) - return pd.read_csv(self.saved_path) - \ No newline at end of file diff --git a/RecDP/pyrecdp/datasets/nyc_taxi.py b/RecDP/pyrecdp/datasets/nyc_taxi.py deleted file mode 100644 index 9856f1689..000000000 --- a/RecDP/pyrecdp/datasets/nyc_taxi.py +++ /dev/null @@ -1,30 +0,0 @@ -from .base_api import base_api - -class nyc_taxi(base_api): - def __init__(self, scale = 'full'): - super().__init__() - self.scale = scale - if scale == 'test': - name = "test_nyc_taxi_fare.parquet" - url = f"https://pyrecdp-testdata.s3.us-west-2.amazonaws.com/{name}" - elif scale == 'test_large': - name = "nyc_taxi_fare_1M.csv" - url = f"https://pyrecdp-testdata.s3.us-west-2.amazonaws.com/{name}" - elif scale == 'full': - name = "nyc_taxi_fare_cleaned.csv" - url = "https://huggingface.co/datasets/Chendi/NYC_TAXI_FARE_CLEANED/resolve/main/nyc_taxi_fare_cleaned.csv" - - self.saved_path = self.download_url(name, url) - - def to_pandas(self, nrows = None): - import pandas as pd - if self.scale == 'test': - return pd.read_parquet(self.saved_path) - elif self.scale == 'test_large': - return pd.read_csv(self.saved_path) - elif self.scale == 'full': - if nrows: - return pd.read_csv(self.saved_path, nrows = nrows) - else: - return pd.read_csv(self.saved_path) - \ No newline at end of file diff --git a/RecDP/pyrecdp/datasets/outbrain.py b/RecDP/pyrecdp/datasets/outbrain.py deleted file mode 100644 index ec12ec473..000000000 --- a/RecDP/pyrecdp/datasets/outbrain.py +++ /dev/null @@ -1,22 +0,0 @@ -from .base_api import base_api - -class outbrain(base_api): - def __init__(self): - super().__init__() - file_list = { - 'clicks': "clicks_train.csv", - 'documents_categories': "documents_categories.csv", - 'documents_entities': "documents_entities.csv", - 'documents_meta': "documents_meta.csv", - 'documents_topics': "documents_topics.csv", - 'events': "events.csv", - 'page_views': "page_views_sample.csv", - 'promoted_content': "promoted_content.csv" - } - - self.saved_path = dict((f_name, self.download_url(f_path, f"https://outbrain-sampled.s3.us-west-2.amazonaws.com/{f_path}")) for f_name, f_path in file_list.items()) - - def to_pandas(self, nrows = None): - import pandas as pd - return dict((f_name, pd.read_csv(f_path)) for f_name, f_path in self.saved_path.items()) - \ No newline at end of file diff --git a/RecDP/pyrecdp/datasets/pretrained.py b/RecDP/pyrecdp/datasets/pretrained.py deleted file mode 100644 index 486e9552c..000000000 --- a/RecDP/pyrecdp/datasets/pretrained.py +++ /dev/null @@ -1,12 +0,0 @@ -from .base_api import base_api - -class pretrained(base_api): - def __init__(self): - super().__init__() - - def download(self, model_name): - if model_name == "nyc_taxi_fare": - name = "lightgbm_regression_nyc_taxi_fare_amount.mdl" - url = f"https://pyrecdp-testdata.s3.us-west-2.amazonaws.com/{name}" - return self.download_url(name, url) - \ No newline at end of file diff --git a/RecDP/pyrecdp/datasets/twitter_recsys.py b/RecDP/pyrecdp/datasets/twitter_recsys.py deleted file mode 100644 index bb5b9a694..000000000 --- a/RecDP/pyrecdp/datasets/twitter_recsys.py +++ /dev/null @@ -1,13 +0,0 @@ -from .base_api import base_api - -class twitter_recsys(base_api): - def __init__(self): - super().__init__() - name = "test_twitter_recsys.parquet" - url = f"https://pyrecdp-testdata.s3.us-west-2.amazonaws.com/{name}" - self.saved_path = self.download_url(name, url) - - def to_pandas(self, nrows = None): - import pandas as pd - return pd.read_parquet(self.saved_path) - \ No newline at end of file diff --git a/RecDP/pyrecdp/primitives/estimators/base.py b/RecDP/pyrecdp/primitives/estimators/base.py index fa629ad77..4b47db0a2 100644 --- a/RecDP/pyrecdp/primitives/estimators/base.py +++ b/RecDP/pyrecdp/primitives/estimators/base.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.operations.base import BaseOperation import numpy as np from sklearn.metrics import mean_squared_error, roc_auc_score diff --git a/RecDP/pyrecdp/primitives/estimators/lightgbm.py b/RecDP/pyrecdp/primitives/estimators/lightgbm.py index be2000904..a3acd7cd3 100644 --- a/RecDP/pyrecdp/primitives/estimators/lightgbm.py +++ b/RecDP/pyrecdp/primitives/estimators/lightgbm.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseEstimator import pandas as pd import lightgbm as lgbm diff --git a/RecDP/pyrecdp/primitives/generators/__init__.py b/RecDP/pyrecdp/primitives/generators/__init__.py index c9922629a..2b4fe637e 100644 --- a/RecDP/pyrecdp/primitives/generators/__init__.py +++ b/RecDP/pyrecdp/primitives/generators/__init__.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .binned import BinnedFeatureGenerator from .category import CategoryFeatureGenerator from .group_category import GroupCategoryFeatureGenerator diff --git a/RecDP/pyrecdp/primitives/generators/base.py b/RecDP/pyrecdp/primitives/generators/base.py index 0e873a84a..dd49ade75 100644 --- a/RecDP/pyrecdp/primitives/generators/base.py +++ b/RecDP/pyrecdp/primitives/generators/base.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.schema import SeriesSchema from typing import List diff --git a/RecDP/pyrecdp/primitives/generators/binned.py b/RecDP/pyrecdp/primitives/generators/binned.py index 6017efd23..fab1b0717 100644 --- a/RecDP/pyrecdp/primitives/generators/binned.py +++ b/RecDP/pyrecdp/primitives/generators/binned.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class class BinnedFeatureGenerator(super_class): diff --git a/RecDP/pyrecdp/primitives/generators/category.py b/RecDP/pyrecdp/primitives/generators/category.py index 8b958501e..3095a361f 100644 --- a/RecDP/pyrecdp/primitives/generators/category.py +++ b/RecDP/pyrecdp/primitives/generators/category.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class import pandas as pd from pyrecdp.core.schema import SeriesSchema diff --git a/RecDP/pyrecdp/primitives/generators/datetime.py b/RecDP/pyrecdp/primitives/generators/datetime.py index 25e054aeb..9c62ac5ab 100644 --- a/RecDP/pyrecdp/primitives/generators/datetime.py +++ b/RecDP/pyrecdp/primitives/generators/datetime.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .featuretools_adaptor import FeaturetoolsBasedFeatureGenerator from featuretools.primitives import ( Day, diff --git a/RecDP/pyrecdp/primitives/generators/drop.py b/RecDP/pyrecdp/primitives/generators/drop.py index da6ed44d3..ef2fd7642 100644 --- a/RecDP/pyrecdp/primitives/generators/drop.py +++ b/RecDP/pyrecdp/primitives/generators/drop.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class from pyrecdp.primitives.operations import Operation from pyspark.sql import DataFrame as SparkDataFrame diff --git a/RecDP/pyrecdp/primitives/generators/encode.py b/RecDP/pyrecdp/primitives/generators/encode.py index 40a88dace..78f19fa03 100644 --- a/RecDP/pyrecdp/primitives/generators/encode.py +++ b/RecDP/pyrecdp/primitives/generators/encode.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class import pandas as pd from pyrecdp.core.schema import SeriesSchema diff --git a/RecDP/pyrecdp/primitives/generators/feature_transform.py b/RecDP/pyrecdp/primitives/generators/feature_transform.py index 99f887d76..fb178699a 100644 --- a/RecDP/pyrecdp/primitives/generators/feature_transform.py +++ b/RecDP/pyrecdp/primitives/generators/feature_transform.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .featuretools_adaptor import FeaturetoolsBasedFeatureGenerator from pyrecdp.primitives.operations import Operation from featuretools.primitives.base import TransformPrimitive diff --git a/RecDP/pyrecdp/primitives/generators/featuretools_adaptor.py b/RecDP/pyrecdp/primitives/generators/featuretools_adaptor.py index c2036305f..d257ca0ab 100644 --- a/RecDP/pyrecdp/primitives/generators/featuretools_adaptor.py +++ b/RecDP/pyrecdp/primitives/generators/featuretools_adaptor.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class from pyrecdp.core.schema import SeriesSchema from pyrecdp.primitives.operations import Operation diff --git a/RecDP/pyrecdp/primitives/generators/fillna.py b/RecDP/pyrecdp/primitives/generators/fillna.py index 8a618013a..68ff28963 100644 --- a/RecDP/pyrecdp/primitives/generators/fillna.py +++ b/RecDP/pyrecdp/primitives/generators/fillna.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class from pyrecdp.core.schema import SeriesSchema from pyrecdp.primitives.operations import Operation diff --git a/RecDP/pyrecdp/primitives/generators/geograph.py b/RecDP/pyrecdp/primitives/generators/geograph.py index 1fb2d2edc..e46e3c270 100644 --- a/RecDP/pyrecdp/primitives/generators/geograph.py +++ b/RecDP/pyrecdp/primitives/generators/geograph.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class from .featuretools_adaptor import FeaturetoolsBasedFeatureGenerator from pyrecdp.core.schema import SeriesSchema diff --git a/RecDP/pyrecdp/primitives/generators/group_category.py b/RecDP/pyrecdp/primitives/generators/group_category.py index b89d85459..871b7181a 100644 --- a/RecDP/pyrecdp/primitives/generators/group_category.py +++ b/RecDP/pyrecdp/primitives/generators/group_category.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class import pandas as pd from pyrecdp.core.schema import SeriesSchema diff --git a/RecDP/pyrecdp/primitives/generators/name.py b/RecDP/pyrecdp/primitives/generators/name.py index 95147c6c1..377f7a53e 100644 --- a/RecDP/pyrecdp/primitives/generators/name.py +++ b/RecDP/pyrecdp/primitives/generators/name.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class from pyrecdp.primitives.operations import Operation diff --git a/RecDP/pyrecdp/primitives/generators/nlp.py b/RecDP/pyrecdp/primitives/generators/nlp.py index 38fb3c654..986f54595 100644 --- a/RecDP/pyrecdp/primitives/generators/nlp.py +++ b/RecDP/pyrecdp/primitives/generators/nlp.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .featuretools_adaptor import FeaturetoolsBasedFeatureGenerator from pyrecdp.core.schema import SeriesSchema from pyrecdp.primitives.operations import Operation diff --git a/RecDP/pyrecdp/primitives/generators/relation.py b/RecDP/pyrecdp/primitives/generators/relation.py index 0987e3cc6..70eb54f25 100644 --- a/RecDP/pyrecdp/primitives/generators/relation.py +++ b/RecDP/pyrecdp/primitives/generators/relation.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class import pandas as pd from pyrecdp.core.schema import SeriesSchema, DataFrameSchema diff --git a/RecDP/pyrecdp/primitives/generators/type.py b/RecDP/pyrecdp/primitives/generators/type.py index f0d5c86ad..808426d4e 100644 --- a/RecDP/pyrecdp/primitives/generators/type.py +++ b/RecDP/pyrecdp/primitives/generators/type.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseFeatureGenerator as super_class from .featuretools_adaptor import FeaturetoolsBasedFeatureGenerator from pyrecdp.core.schema import SeriesSchema diff --git a/RecDP/pyrecdp/primitives/llmutils/classify.py b/RecDP/pyrecdp/primitives/llmutils/classify.py index 75677a3b4..34a2d0e7f 100644 --- a/RecDP/pyrecdp/primitives/llmutils/classify.py +++ b/RecDP/pyrecdp/primitives/llmutils/classify.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/convert.py b/RecDP/pyrecdp/primitives/llmutils/convert.py index 43e3a4a5c..82e0452e1 100644 --- a/RecDP/pyrecdp/primitives/llmutils/convert.py +++ b/RecDP/pyrecdp/primitives/llmutils/convert.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse import os, sys from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/decontaminate.py b/RecDP/pyrecdp/primitives/llmutils/decontaminate.py index bca19a36a..f4211eb4a 100644 --- a/RecDP/pyrecdp/primitives/llmutils/decontaminate.py +++ b/RecDP/pyrecdp/primitives/llmutils/decontaminate.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse import os from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/diversity_analysis.py b/RecDP/pyrecdp/primitives/llmutils/diversity_analysis.py index d9c83541b..04e3d5b9d 100644 --- a/RecDP/pyrecdp/primitives/llmutils/diversity_analysis.py +++ b/RecDP/pyrecdp/primitives/llmutils/diversity_analysis.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/document/extractor.py b/RecDP/pyrecdp/primitives/llmutils/document/extractor.py index c90541d12..e595d6d3e 100644 --- a/RecDP/pyrecdp/primitives/llmutils/document/extractor.py +++ b/RecDP/pyrecdp/primitives/llmutils/document/extractor.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .reader import * from .writer import DocumentWriter diff --git a/RecDP/pyrecdp/primitives/llmutils/document/reader.py b/RecDP/pyrecdp/primitives/llmutils/document/reader.py index 056937885..4200540b3 100644 --- a/RecDP/pyrecdp/primitives/llmutils/document/reader.py +++ b/RecDP/pyrecdp/primitives/llmutils/document/reader.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os from abc import abstractmethod, ABC from pathlib import Path diff --git a/RecDP/pyrecdp/primitives/llmutils/document/schema.py b/RecDP/pyrecdp/primitives/llmutils/document/schema.py index df1d4295c..24c4f985c 100644 --- a/RecDP/pyrecdp/primitives/llmutils/document/schema.py +++ b/RecDP/pyrecdp/primitives/llmutils/document/schema.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from dataclasses import dataclass, asdict import json diff --git a/RecDP/pyrecdp/primitives/llmutils/document/writer.py b/RecDP/pyrecdp/primitives/llmutils/document/writer.py index b28285268..e0fa407e5 100644 --- a/RecDP/pyrecdp/primitives/llmutils/document/writer.py +++ b/RecDP/pyrecdp/primitives/llmutils/document/writer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os from .schema import Document diff --git a/RecDP/pyrecdp/primitives/llmutils/document_extractor.py b/RecDP/pyrecdp/primitives/llmutils/document_extractor.py index 2537b4c32..40b6eb850 100644 --- a/RecDP/pyrecdp/primitives/llmutils/document_extractor.py +++ b/RecDP/pyrecdp/primitives/llmutils/document_extractor.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from typing import Optional, Union from pyrecdp.primitives.llmutils.document.extractor import DocumentExtractor diff --git a/RecDP/pyrecdp/primitives/llmutils/filter.py b/RecDP/pyrecdp/primitives/llmutils/filter.py index 0d67e86cf..8c762483b 100644 --- a/RecDP/pyrecdp/primitives/llmutils/filter.py +++ b/RecDP/pyrecdp/primitives/llmutils/filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse import os import sys diff --git a/RecDP/pyrecdp/primitives/llmutils/global_dedup.py b/RecDP/pyrecdp/primitives/llmutils/global_dedup.py index b0e6dbfe9..3e68b3e06 100644 --- a/RecDP/pyrecdp/primitives/llmutils/global_dedup.py +++ b/RecDP/pyrecdp/primitives/llmutils/global_dedup.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/global_hash.py b/RecDP/pyrecdp/primitives/llmutils/global_hash.py index b7c6a2e7f..89a3ca8ce 100644 --- a/RecDP/pyrecdp/primitives/llmutils/global_hash.py +++ b/RecDP/pyrecdp/primitives/llmutils/global_hash.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse import os, sys from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/index_based_reduction.py b/RecDP/pyrecdp/primitives/llmutils/index_based_reduction.py index e9b04e91e..0b833e7be 100644 --- a/RecDP/pyrecdp/primitives/llmutils/index_based_reduction.py +++ b/RecDP/pyrecdp/primitives/llmutils/index_based_reduction.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse import os from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/language_identify.py b/RecDP/pyrecdp/primitives/llmutils/language_identify.py index 8905ecc74..59697a9e5 100644 --- a/RecDP/pyrecdp/primitives/llmutils/language_identify.py +++ b/RecDP/pyrecdp/primitives/llmutils/language_identify.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer import os diff --git a/RecDP/pyrecdp/primitives/llmutils/near_dedup.py b/RecDP/pyrecdp/primitives/llmutils/near_dedup.py index 1c50a3ae1..1765d093e 100644 --- a/RecDP/pyrecdp/primitives/llmutils/near_dedup.py +++ b/RecDP/pyrecdp/primitives/llmutils/near_dedup.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/perplexity_score.py b/RecDP/pyrecdp/primitives/llmutils/perplexity_score.py index 1dd627afb..1dabeee27 100644 --- a/RecDP/pyrecdp/primitives/llmutils/perplexity_score.py +++ b/RecDP/pyrecdp/primitives/llmutils/perplexity_score.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/pii/detect/name_password_detection.py b/RecDP/pyrecdp/primitives/llmutils/pii/detect/name_password_detection.py index 18904a995..bc5b61e99 100644 --- a/RecDP/pyrecdp/primitives/llmutils/pii/detect/name_password_detection.py +++ b/RecDP/pyrecdp/primitives/llmutils/pii/detect/name_password_detection.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from transformers import Pipeline from .utils import PIIEntityType diff --git a/RecDP/pyrecdp/primitives/llmutils/pii/detect/phones_detection.py b/RecDP/pyrecdp/primitives/llmutils/pii/detect/phones_detection.py index c50b9b289..c6e0deaff 100644 --- a/RecDP/pyrecdp/primitives/llmutils/pii/detect/phones_detection.py +++ b/RecDP/pyrecdp/primitives/llmutils/pii/detect/phones_detection.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.import_utils import check_availability_and_install def detect_phones(text): diff --git a/RecDP/pyrecdp/primitives/llmutils/pii/detect/utils.py b/RecDP/pyrecdp/primitives/llmutils/pii/detect/utils.py index da5d6d65a..adcb5a886 100644 --- a/RecDP/pyrecdp/primitives/llmutils/pii/detect/utils.py +++ b/RecDP/pyrecdp/primitives/llmutils/pii/detect/utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from enum import Enum, auto diff --git a/RecDP/pyrecdp/primitives/llmutils/pii/pii_detection.py b/RecDP/pyrecdp/primitives/llmutils/pii/pii_detection.py index bfbb61400..7130c7e18 100644 --- a/RecDP/pyrecdp/primitives/llmutils/pii/pii_detection.py +++ b/RecDP/pyrecdp/primitives/llmutils/pii/pii_detection.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .detect.ip_detection import detect_ip from .detect.emails_detection import detect_email from .detect.phones_detection import detect_phones diff --git a/RecDP/pyrecdp/primitives/llmutils/pii/pii_redaction.py b/RecDP/pyrecdp/primitives/llmutils/pii/pii_redaction.py index b14056508..12c829405 100644 --- a/RecDP/pyrecdp/primitives/llmutils/pii/pii_redaction.py +++ b/RecDP/pyrecdp/primitives/llmutils/pii/pii_redaction.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import json import random import string diff --git a/RecDP/pyrecdp/primitives/llmutils/pii_remove.py b/RecDP/pyrecdp/primitives/llmutils/pii_remove.py index 43200d30e..f37d0972d 100644 --- a/RecDP/pyrecdp/primitives/llmutils/pii_remove.py +++ b/RecDP/pyrecdp/primitives/llmutils/pii_remove.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyspark.sql.dataframe import DataFrame diff --git a/RecDP/pyrecdp/primitives/llmutils/pipeline_hpo.py b/RecDP/pyrecdp/primitives/llmutils/pipeline_hpo.py index 1fc988d77..82b844d98 100644 --- a/RecDP/pyrecdp/primitives/llmutils/pipeline_hpo.py +++ b/RecDP/pyrecdp/primitives/llmutils/pipeline_hpo.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os from abc import abstractmethod, ABC from dataclasses import dataclass diff --git a/RecDP/pyrecdp/primitives/llmutils/profanity_filter.py b/RecDP/pyrecdp/primitives/llmutils/profanity_filter.py index 766fcae71..7b0741bca 100644 --- a/RecDP/pyrecdp/primitives/llmutils/profanity_filter.py +++ b/RecDP/pyrecdp/primitives/llmutils/profanity_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/qa_generate.py b/RecDP/pyrecdp/primitives/llmutils/qa_generate.py index e2caea3df..e8e25035e 100644 --- a/RecDP/pyrecdp/primitives/llmutils/qa_generate.py +++ b/RecDP/pyrecdp/primitives/llmutils/qa_generate.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/quality_classifier.py b/RecDP/pyrecdp/primitives/llmutils/quality_classifier.py index e0939bf3b..2e336d16d 100644 --- a/RecDP/pyrecdp/primitives/llmutils/quality_classifier.py +++ b/RecDP/pyrecdp/primitives/llmutils/quality_classifier.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/rag_data_extractor.py b/RecDP/pyrecdp/primitives/llmutils/rag_data_extractor.py index d6c97b3fb..907e9ab87 100644 --- a/RecDP/pyrecdp/primitives/llmutils/rag_data_extractor.py +++ b/RecDP/pyrecdp/primitives/llmutils/rag_data_extractor.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from typing import Optional, List diff --git a/RecDP/pyrecdp/primitives/llmutils/rouge_score_dedup.py b/RecDP/pyrecdp/primitives/llmutils/rouge_score_dedup.py index dc3d9be12..8c3df4a7f 100644 --- a/RecDP/pyrecdp/primitives/llmutils/rouge_score_dedup.py +++ b/RecDP/pyrecdp/primitives/llmutils/rouge_score_dedup.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/sentence_split.py b/RecDP/pyrecdp/primitives/llmutils/sentence_split.py index 91dedefaa..7796c5602 100644 --- a/RecDP/pyrecdp/primitives/llmutils/sentence_split.py +++ b/RecDP/pyrecdp/primitives/llmutils/sentence_split.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/llmutils/shrink_jsonl.py b/RecDP/pyrecdp/primitives/llmutils/shrink_jsonl.py index ef80a4a05..25f415417 100644 --- a/RecDP/pyrecdp/primitives/llmutils/shrink_jsonl.py +++ b/RecDP/pyrecdp/primitives/llmutils/shrink_jsonl.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse import os import pickle diff --git a/RecDP/pyrecdp/primitives/llmutils/text_fixer.py b/RecDP/pyrecdp/primitives/llmutils/text_fixer.py index 4cf9a949e..9ba5d725f 100644 --- a/RecDP/pyrecdp/primitives/llmutils/text_fixer.py +++ b/RecDP/pyrecdp/primitives/llmutils/text_fixer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/text_normalization.py b/RecDP/pyrecdp/primitives/llmutils/text_normalization.py index c9245aa22..a3972ffe2 100644 --- a/RecDP/pyrecdp/primitives/llmutils/text_normalization.py +++ b/RecDP/pyrecdp/primitives/llmutils/text_normalization.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse import os, sys from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/text_to_jsonl.py b/RecDP/pyrecdp/primitives/llmutils/text_to_jsonl.py index 5b739f16b..9dabfca0b 100644 --- a/RecDP/pyrecdp/primitives/llmutils/text_to_jsonl.py +++ b/RecDP/pyrecdp/primitives/llmutils/text_to_jsonl.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse import os from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/count_tokens.py b/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/count_tokens.py index b53b582c3..d88a89cbb 100644 --- a/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/count_tokens.py +++ b/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/count_tokens.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from indexed_dataset import MMapIndexedDataset from transformers import AutoTokenizer diff --git a/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/merge_datasets.py b/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/merge_datasets.py index 0f16bb7c4..37936252f 100644 --- a/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/merge_datasets.py +++ b/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/merge_datasets.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + """this script is for merging multiple megatron-style data files.""" import os diff --git a/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/run-dp.sh b/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/run-dp.sh index 983b0d273..d6a4afa88 100755 --- a/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/run-dp.sh +++ b/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/run-dp.sh @@ -1,3 +1,17 @@ +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + start=`date +%s` echo -e "\n distributed tokenization with ray for Book" diff --git a/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/tokenize_and_save.py b/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/tokenize_and_save.py index a1f375e1b..6b7f8a833 100644 --- a/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/tokenize_and_save.py +++ b/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save/tokenize_and_save.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + """ this script is for tokenizing various data input in json format and saving to megatron-format. """ diff --git a/RecDP/pyrecdp/primitives/llmutils/toxicity_score.py b/RecDP/pyrecdp/primitives/llmutils/toxicity_score.py index 96dcc9af3..b9b1172c6 100644 --- a/RecDP/pyrecdp/primitives/llmutils/toxicity_score.py +++ b/RecDP/pyrecdp/primitives/llmutils/toxicity_score.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import argparse from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/llmutils/utils.py b/RecDP/pyrecdp/primitives/llmutils/utils.py index f10d33b2e..a0e56d01d 100644 --- a/RecDP/pyrecdp/primitives/llmutils/utils.py +++ b/RecDP/pyrecdp/primitives/llmutils/utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os, sys import time from pyrecdp.core.import_utils import check_availability_and_install diff --git a/RecDP/pyrecdp/primitives/operations/base.py b/RecDP/pyrecdp/primitives/operations/base.py index 9f7435080..05b768f4f 100644 --- a/RecDP/pyrecdp/primitives/operations/base.py +++ b/RecDP/pyrecdp/primitives/operations/base.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import json from dataclasses import dataclass from functools import wraps diff --git a/RecDP/pyrecdp/primitives/operations/category.py b/RecDP/pyrecdp/primitives/operations/category.py index 99f84acd6..19db9c877 100644 --- a/RecDP/pyrecdp/primitives/operations/category.py +++ b/RecDP/pyrecdp/primitives/operations/category.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS import pandas as pd from pyspark.sql import DataFrame as SparkDataFrame diff --git a/RecDP/pyrecdp/primitives/operations/constant/SPECIAL_CHARACTERS.py b/RecDP/pyrecdp/primitives/operations/constant/SPECIAL_CHARACTERS.py index b535f24d4..a0368e84d 100644 --- a/RecDP/pyrecdp/primitives/operations/constant/SPECIAL_CHARACTERS.py +++ b/RecDP/pyrecdp/primitives/operations/constant/SPECIAL_CHARACTERS.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import string from pyrecdp.core.import_utils import check_availability_and_install diff --git a/RecDP/pyrecdp/primitives/operations/constant/__init__.py b/RecDP/pyrecdp/primitives/operations/constant/__init__.py index 0d75243d4..f4712f10d 100644 --- a/RecDP/pyrecdp/primitives/operations/constant/__init__.py +++ b/RecDP/pyrecdp/primitives/operations/constant/__init__.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + HF_TOKENIZER = 'EleutherAI/pythia-6.9b-deduped' VARIOUS_WHITESPACES = { diff --git a/RecDP/pyrecdp/primitives/operations/custom.py b/RecDP/pyrecdp/primitives/operations/custom.py index 41cf85ff3..b773de227 100644 --- a/RecDP/pyrecdp/primitives/operations/custom.py +++ b/RecDP/pyrecdp/primitives/operations/custom.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS import copy diff --git a/RecDP/pyrecdp/primitives/operations/data.py b/RecDP/pyrecdp/primitives/operations/data.py index 92b624d0b..d29eb70ab 100644 --- a/RecDP/pyrecdp/primitives/operations/data.py +++ b/RecDP/pyrecdp/primitives/operations/data.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS import pandas as pd diff --git a/RecDP/pyrecdp/primitives/operations/dataframe.py b/RecDP/pyrecdp/primitives/operations/dataframe.py index 75406dd6c..8848d4d25 100644 --- a/RecDP/pyrecdp/primitives/operations/dataframe.py +++ b/RecDP/pyrecdp/primitives/operations/dataframe.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import numpy as np import time import pandas as pd diff --git a/RecDP/pyrecdp/primitives/operations/doc_loader.py b/RecDP/pyrecdp/primitives/operations/doc_loader.py index 72280d09b..198bf94dd 100644 --- a/RecDP/pyrecdp/primitives/operations/doc_loader.py +++ b/RecDP/pyrecdp/primitives/operations/doc_loader.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os import re from typing import Optional, List, Callable, Union, Sequence diff --git a/RecDP/pyrecdp/primitives/operations/drop.py b/RecDP/pyrecdp/primitives/operations/drop.py index 99e9f8f16..38e3fec1e 100644 --- a/RecDP/pyrecdp/primitives/operations/drop.py +++ b/RecDP/pyrecdp/primitives/operations/drop.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS import pandas as pd from pyspark.sql import DataFrame as SparkDataFrame diff --git a/RecDP/pyrecdp/primitives/operations/encode.py b/RecDP/pyrecdp/primitives/operations/encode.py index 651cb0723..88892de03 100644 --- a/RecDP/pyrecdp/primitives/operations/encode.py +++ b/RecDP/pyrecdp/primitives/operations/encode.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS import pandas as pd from pyspark.sql import DataFrame as SparkDataFrame diff --git a/RecDP/pyrecdp/primitives/operations/featuretools_adaptor.py b/RecDP/pyrecdp/primitives/operations/featuretools_adaptor.py index b2e199655..da1efe210 100644 --- a/RecDP/pyrecdp/primitives/operations/featuretools_adaptor.py +++ b/RecDP/pyrecdp/primitives/operations/featuretools_adaptor.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation import copy from pyrecdp.core.utils import class_name_fix diff --git a/RecDP/pyrecdp/primitives/operations/fillna.py b/RecDP/pyrecdp/primitives/operations/fillna.py index c34273e74..17ea3500a 100644 --- a/RecDP/pyrecdp/primitives/operations/fillna.py +++ b/RecDP/pyrecdp/primitives/operations/fillna.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS import copy diff --git a/RecDP/pyrecdp/primitives/operations/filter/alphanumeric_filter.py b/RecDP/pyrecdp/primitives/operations/filter/alphanumeric_filter.py index 7432b97ac..ec24568e1 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/alphanumeric_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/alphanumeric_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import sys from pyrecdp.primitives.operations.base import LLMOPERATORS diff --git a/RecDP/pyrecdp/primitives/operations/filter/average_line_length_filter.py b/RecDP/pyrecdp/primitives/operations/filter/average_line_length_filter.py index 030247477..72e69f6f7 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/average_line_length_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/average_line_length_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import sys from pyrecdp.primitives.operations.base import LLMOPERATORS diff --git a/RecDP/pyrecdp/primitives/operations/filter/badwords_filter.py b/RecDP/pyrecdp/primitives/operations/filter/badwords_filter.py index 9187cf7f1..fa931906a 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/badwords_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/badwords_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.operations.base import LLMOPERATORS from pyrecdp.primitives.operations.filter.base import BaseFilter diff --git a/RecDP/pyrecdp/primitives/operations/filter/base.py b/RecDP/pyrecdp/primitives/operations/filter/base.py index 940f9348e..04a55be0a 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/base.py +++ b/RecDP/pyrecdp/primitives/operations/filter/base.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyspark.sql import DataFrame from ray.data import Dataset diff --git a/RecDP/pyrecdp/primitives/operations/filter/length_filter.py b/RecDP/pyrecdp/primitives/operations/filter/length_filter.py index 768ba8621..73bbca87a 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/length_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/length_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.operations.base import LLMOPERATORS from pyrecdp.primitives.operations.filter.base import BaseFilter diff --git a/RecDP/pyrecdp/primitives/operations/filter/maximum_line_length_filter.py b/RecDP/pyrecdp/primitives/operations/filter/maximum_line_length_filter.py index 4541f7ecd..227945e87 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/maximum_line_length_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/maximum_line_length_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import sys from pyrecdp.primitives.operations.base import LLMOPERATORS diff --git a/RecDP/pyrecdp/primitives/operations/filter/perplexity_filter.py b/RecDP/pyrecdp/primitives/operations/filter/perplexity_filter.py index 3506a1469..48f7ad95f 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/perplexity_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/perplexity_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import sys from pyrecdp.core.model_utils import get_model, prepare_model diff --git a/RecDP/pyrecdp/primitives/operations/filter/profanity_filter.py b/RecDP/pyrecdp/primitives/operations/filter/profanity_filter.py index aeba132f6..167dcbe09 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/profanity_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/profanity_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.operations.base import LLMOPERATORS from pyrecdp.primitives.operations.filter.base import BaseFilter diff --git a/RecDP/pyrecdp/primitives/operations/filter/special_characters_filter.py b/RecDP/pyrecdp/primitives/operations/filter/special_characters_filter.py index bc7b26abf..ca62b5b91 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/special_characters_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/special_characters_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.operations.base import LLMOPERATORS from pyrecdp.primitives.operations.filter.base import BaseFilter diff --git a/RecDP/pyrecdp/primitives/operations/filter/text_gopherqualityfilter.py b/RecDP/pyrecdp/primitives/operations/filter/text_gopherqualityfilter.py index 2b662a8bf..c2ca8b66f 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/text_gopherqualityfilter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/text_gopherqualityfilter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.primitives.operations.filter.base import BaseFilter from pyrecdp.primitives.operations.base import LLMOPERATORS, statistics_decorator import re diff --git a/RecDP/pyrecdp/primitives/operations/filter/token_num_filter.py b/RecDP/pyrecdp/primitives/operations/filter/token_num_filter.py index d577ee1be..04147ed0d 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/token_num_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/token_num_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import sys from pyrecdp.core.model_utils import get_model, prepare_model diff --git a/RecDP/pyrecdp/primitives/operations/filter/url_filter.py b/RecDP/pyrecdp/primitives/operations/filter/url_filter.py index 591a89d9d..5cb2a322f 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/url_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/url_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from functools import lru_cache from pyrecdp.primitives.operations.filter.base import BaseFilter diff --git a/RecDP/pyrecdp/primitives/operations/filter/word_num_filter.py b/RecDP/pyrecdp/primitives/operations/filter/word_num_filter.py index a2c34500a..6b9ea4d61 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/word_num_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/word_num_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import sys from pyrecdp.core.model_utils import get_model diff --git a/RecDP/pyrecdp/primitives/operations/filter/word_repetition_filter.py b/RecDP/pyrecdp/primitives/operations/filter/word_repetition_filter.py index a42af2f79..f618af2d4 100644 --- a/RecDP/pyrecdp/primitives/operations/filter/word_repetition_filter.py +++ b/RecDP/pyrecdp/primitives/operations/filter/word_repetition_filter.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.model_utils import get_model, prepare_model from pyrecdp.primitives.operations.utils import get_words_from_document, words_refinement from pyrecdp.primitives.operations.base import LLMOPERATORS diff --git a/RecDP/pyrecdp/primitives/operations/geograph.py b/RecDP/pyrecdp/primitives/operations/geograph.py index 8082c888b..afebee204 100644 --- a/RecDP/pyrecdp/primitives/operations/geograph.py +++ b/RecDP/pyrecdp/primitives/operations/geograph.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS from .featuretools_adaptor import FeaturetoolsOperation import copy diff --git a/RecDP/pyrecdp/primitives/operations/logging_utils.py b/RecDP/pyrecdp/primitives/operations/logging_utils.py index b3f43d273..fa130ab53 100644 --- a/RecDP/pyrecdp/primitives/operations/logging_utils.py +++ b/RecDP/pyrecdp/primitives/operations/logging_utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import logging import sys from loguru import logger diff --git a/RecDP/pyrecdp/primitives/operations/merge.py b/RecDP/pyrecdp/primitives/operations/merge.py index e5e7e868c..9e85c6b5c 100644 --- a/RecDP/pyrecdp/primitives/operations/merge.py +++ b/RecDP/pyrecdp/primitives/operations/merge.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS import pandas as pd from .dataframe import * diff --git a/RecDP/pyrecdp/primitives/operations/name.py b/RecDP/pyrecdp/primitives/operations/name.py index 023447925..246b6f93a 100644 --- a/RecDP/pyrecdp/primitives/operations/name.py +++ b/RecDP/pyrecdp/primitives/operations/name.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS import copy diff --git a/RecDP/pyrecdp/primitives/operations/random_select.py b/RecDP/pyrecdp/primitives/operations/random_select.py index e195a4c66..e540f8ace 100644 --- a/RecDP/pyrecdp/primitives/operations/random_select.py +++ b/RecDP/pyrecdp/primitives/operations/random_select.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/table_summary.py b/RecDP/pyrecdp/primitives/operations/table_summary.py index c02dfeae3..9aadb73ea 100644 --- a/RecDP/pyrecdp/primitives/operations/table_summary.py +++ b/RecDP/pyrecdp/primitives/operations/table_summary.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS import ray from ray.data import Dataset diff --git a/RecDP/pyrecdp/primitives/operations/text_bytesize.py b/RecDP/pyrecdp/primitives/operations/text_bytesize.py index ee3673a0d..4fe189f00 100644 --- a/RecDP/pyrecdp/primitives/operations/text_bytesize.py +++ b/RecDP/pyrecdp/primitives/operations/text_bytesize.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_compare_dedup.py b/RecDP/pyrecdp/primitives/operations/text_compare_dedup.py index cab269e40..22daaae29 100644 --- a/RecDP/pyrecdp/primitives/operations/text_compare_dedup.py +++ b/RecDP/pyrecdp/primitives/operations/text_compare_dedup.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS, statistics_decorator from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_contraction_remove.py b/RecDP/pyrecdp/primitives/operations/text_contraction_remove.py index fb91822cf..c91ddb469 100644 --- a/RecDP/pyrecdp/primitives/operations/text_contraction_remove.py +++ b/RecDP/pyrecdp/primitives/operations/text_contraction_remove.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_custom.py b/RecDP/pyrecdp/primitives/operations/text_custom.py index 382c5dbb7..9db0960a9 100644 --- a/RecDP/pyrecdp/primitives/operations/text_custom.py +++ b/RecDP/pyrecdp/primitives/operations/text_custom.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from functools import partial from typing import Optional, Dict diff --git a/RecDP/pyrecdp/primitives/operations/text_deduplication.py b/RecDP/pyrecdp/primitives/operations/text_deduplication.py index 0c86c12c7..57a32f9b2 100644 --- a/RecDP/pyrecdp/primitives/operations/text_deduplication.py +++ b/RecDP/pyrecdp/primitives/operations/text_deduplication.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS, statistics_decorator from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_diversityindicate.py b/RecDP/pyrecdp/primitives/operations/text_diversityindicate.py index 4d2494ce7..c720a9a48 100644 --- a/RecDP/pyrecdp/primitives/operations/text_diversityindicate.py +++ b/RecDP/pyrecdp/primitives/operations/text_diversityindicate.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + # This tool is referred from alibaba data juicer project and used for # analyzing the verb-noun structure of the SFT dataset and plots its diversity in sunburst format. diff --git a/RecDP/pyrecdp/primitives/operations/text_fixer.py b/RecDP/pyrecdp/primitives/operations/text_fixer.py index 1e33fb353..fc795b565 100644 --- a/RecDP/pyrecdp/primitives/operations/text_fixer.py +++ b/RecDP/pyrecdp/primitives/operations/text_fixer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_ingestion.py b/RecDP/pyrecdp/primitives/operations/text_ingestion.py index 026775041..309db1a46 100644 --- a/RecDP/pyrecdp/primitives/operations/text_ingestion.py +++ b/RecDP/pyrecdp/primitives/operations/text_ingestion.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os.path from abc import ABC, abstractmethod from typing import Optional, Dict, Union, Iterable, Any, cast diff --git a/RecDP/pyrecdp/primitives/operations/text_language_identify.py b/RecDP/pyrecdp/primitives/operations/text_language_identify.py index 247bb943e..3ace00fc8 100644 --- a/RecDP/pyrecdp/primitives/operations/text_language_identify.py +++ b/RecDP/pyrecdp/primitives/operations/text_language_identify.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_normalize.py b/RecDP/pyrecdp/primitives/operations/text_normalize.py index 5e9eda805..fde21f5e1 100644 --- a/RecDP/pyrecdp/primitives/operations/text_normalize.py +++ b/RecDP/pyrecdp/primitives/operations/text_normalize.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_perplexity_score.py b/RecDP/pyrecdp/primitives/operations/text_perplexity_score.py index 586cd46a3..4f68e1b94 100644 --- a/RecDP/pyrecdp/primitives/operations/text_perplexity_score.py +++ b/RecDP/pyrecdp/primitives/operations/text_perplexity_score.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, statistics_decorator from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_pii_remove.py b/RecDP/pyrecdp/primitives/operations/text_pii_remove.py index a14bd15f6..723c594b2 100644 --- a/RecDP/pyrecdp/primitives/operations/text_pii_remove.py +++ b/RecDP/pyrecdp/primitives/operations/text_pii_remove.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_prompt.py b/RecDP/pyrecdp/primitives/operations/text_prompt.py index 147551389..6680fb739 100644 --- a/RecDP/pyrecdp/primitives/operations/text_prompt.py +++ b/RecDP/pyrecdp/primitives/operations/text_prompt.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os, shutil, subprocess, sys, inspect from .base import BaseLLMOperation, LLMOPERATORS diff --git a/RecDP/pyrecdp/primitives/operations/text_qualityscorer.py b/RecDP/pyrecdp/primitives/operations/text_qualityscorer.py index d4a9989dc..f470511b6 100644 --- a/RecDP/pyrecdp/primitives/operations/text_qualityscorer.py +++ b/RecDP/pyrecdp/primitives/operations/text_qualityscorer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS, statistics_decorator from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_reader.py b/RecDP/pyrecdp/primitives/operations/text_reader.py index fd2d76293..6da04dc64 100644 --- a/RecDP/pyrecdp/primitives/operations/text_reader.py +++ b/RecDP/pyrecdp/primitives/operations/text_reader.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_spell_correct.py b/RecDP/pyrecdp/primitives/operations/text_spell_correct.py index 76aa838da..629e5ab98 100644 --- a/RecDP/pyrecdp/primitives/operations/text_spell_correct.py +++ b/RecDP/pyrecdp/primitives/operations/text_spell_correct.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_split.py b/RecDP/pyrecdp/primitives/operations/text_split.py index 13a64855d..29067685b 100644 --- a/RecDP/pyrecdp/primitives/operations/text_split.py +++ b/RecDP/pyrecdp/primitives/operations/text_split.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import math from typing import List, Dict, Any, Optional, Callable, cast, Union diff --git a/RecDP/pyrecdp/primitives/operations/text_to_qa.py b/RecDP/pyrecdp/primitives/operations/text_to_qa.py index d598aaa85..1ec2a420a 100644 --- a/RecDP/pyrecdp/primitives/operations/text_to_qa.py +++ b/RecDP/pyrecdp/primitives/operations/text_to_qa.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS, statistics_decorator import ray from ray.data import Dataset diff --git a/RecDP/pyrecdp/primitives/operations/text_toxicity.py b/RecDP/pyrecdp/primitives/operations/text_toxicity.py index 905b31fee..fbb9e6cd6 100644 --- a/RecDP/pyrecdp/primitives/operations/text_toxicity.py +++ b/RecDP/pyrecdp/primitives/operations/text_toxicity.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS, statistics_decorator from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/text_writer.py b/RecDP/pyrecdp/primitives/operations/text_writer.py index b16285064..07d6f3eae 100644 --- a/RecDP/pyrecdp/primitives/operations/text_writer.py +++ b/RecDP/pyrecdp/primitives/operations/text_writer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseLLMOperation, LLMOPERATORS from ray.data import Dataset from pyspark.sql import DataFrame diff --git a/RecDP/pyrecdp/primitives/operations/tuple.py b/RecDP/pyrecdp/primitives/operations/tuple.py index aa1d4dee6..e91d318b2 100644 --- a/RecDP/pyrecdp/primitives/operations/tuple.py +++ b/RecDP/pyrecdp/primitives/operations/tuple.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS class TupleOperation(BaseOperation): diff --git a/RecDP/pyrecdp/primitives/operations/type.py b/RecDP/pyrecdp/primitives/operations/type.py index 1e60d47fa..2cc11c10d 100644 --- a/RecDP/pyrecdp/primitives/operations/type.py +++ b/RecDP/pyrecdp/primitives/operations/type.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .base import BaseOperation, AUTOFEOPERATORS from pyrecdp.core.schema import SeriesSchema import pandas as pd diff --git a/RecDP/pyrecdp/primitives/operations/utils.py b/RecDP/pyrecdp/primitives/operations/utils.py index 016604a3c..2cbdba083 100644 --- a/RecDP/pyrecdp/primitives/operations/utils.py +++ b/RecDP/pyrecdp/primitives/operations/utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import re diff --git a/RecDP/pyrecdp/primitives/profilers/cluster_infer.py b/RecDP/pyrecdp/primitives/profilers/cluster_infer.py index f82ee7986..1c168cb6f 100644 --- a/RecDP/pyrecdp/primitives/profilers/cluster_infer.py +++ b/RecDP/pyrecdp/primitives/profilers/cluster_infer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.schema import SeriesSchema from pyrecdp.primitives.operations import Operation import pandas as pd diff --git a/RecDP/pyrecdp/primitives/profilers/distribution_infer.py b/RecDP/pyrecdp/primitives/profilers/distribution_infer.py index f82ee7986..1c168cb6f 100644 --- a/RecDP/pyrecdp/primitives/profilers/distribution_infer.py +++ b/RecDP/pyrecdp/primitives/profilers/distribution_infer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.schema import SeriesSchema from pyrecdp.primitives.operations import Operation import pandas as pd diff --git a/RecDP/pyrecdp/primitives/profilers/statics.py b/RecDP/pyrecdp/primitives/profilers/statics.py index 19d82b187..4a84ff910 100644 --- a/RecDP/pyrecdp/primitives/profilers/statics.py +++ b/RecDP/pyrecdp/primitives/profilers/statics.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.schema import SeriesSchema from pyrecdp.core.utils import is_text_series from pyrecdp.core.utils import Timer diff --git a/RecDP/pyrecdp/primitives/profilers/time_series_infer.py b/RecDP/pyrecdp/primitives/profilers/time_series_infer.py index 887c9a3b7..09f7b96cc 100644 --- a/RecDP/pyrecdp/primitives/profilers/time_series_infer.py +++ b/RecDP/pyrecdp/primitives/profilers/time_series_infer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.schema import SeriesSchema from pyrecdp.primitives.operations import Operation import pandas as pd diff --git a/RecDP/pyrecdp/primitives/profilers/type_infer.py b/RecDP/pyrecdp/primitives/profilers/type_infer.py index b7cb6da76..ac9121f3d 100644 --- a/RecDP/pyrecdp/primitives/profilers/type_infer.py +++ b/RecDP/pyrecdp/primitives/profilers/type_infer.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.core.schema import SeriesSchema from pyrecdp.core.utils import is_unique from pyrecdp.primitives.operations import Operation diff --git a/RecDP/pyrecdp/primitives/spark_data_processor/data_processor.py b/RecDP/pyrecdp/primitives/spark_data_processor/data_processor.py index b2e8089a0..611cc30a4 100644 --- a/RecDP/pyrecdp/primitives/spark_data_processor/data_processor.py +++ b/RecDP/pyrecdp/primitives/spark_data_processor/data_processor.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.utils import create_spark_context import uuid diff --git a/RecDP/pyrecdp/primitives/spark_data_processor/encoder.py b/RecDP/pyrecdp/primitives/spark_data_processor/encoder.py index 44aa5b985..c6610ab55 100644 --- a/RecDP/pyrecdp/primitives/spark_data_processor/encoder.py +++ b/RecDP/pyrecdp/primitives/spark_data_processor/encoder.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import uuid from pyrecdp.utils import * from pyspark.ml.feature import * diff --git a/RecDP/pyrecdp/primitives/spark_data_processor/start-spark-standalone.sh b/RecDP/pyrecdp/primitives/spark_data_processor/start-spark-standalone.sh index 88b6c8b79..36c6198b9 100755 --- a/RecDP/pyrecdp/primitives/spark_data_processor/start-spark-standalone.sh +++ b/RecDP/pyrecdp/primitives/spark_data_processor/start-spark-standalone.sh @@ -1,3 +1,17 @@ +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + if [ "$#" -ne 1 ]; then echo "Please provide master node, ex: " $0 "127.0.0.1" fi diff --git a/RecDP/pyrecdp/primitives/spark_data_processor/utils.py b/RecDP/pyrecdp/primitives/spark_data_processor/utils.py index ac833f33c..e928edaf9 100644 --- a/RecDP/pyrecdp/primitives/spark_data_processor/utils.py +++ b/RecDP/pyrecdp/primitives/spark_data_processor/utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os import re from pyspark import * diff --git a/RecDP/pyrecdp/primitives/tabutils/data_preprocess.py b/RecDP/pyrecdp/primitives/tabutils/data_preprocess.py index e48595300..75fcccfb3 100644 --- a/RecDP/pyrecdp/primitives/tabutils/data_preprocess.py +++ b/RecDP/pyrecdp/primitives/tabutils/data_preprocess.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import numpy as np import seaborn as sns from scipy.cluster import hierarchy diff --git a/RecDP/pyrecdp/primitives/tabutils/feature_normal_transform.py b/RecDP/pyrecdp/primitives/tabutils/feature_normal_transform.py index cda995e26..cd73e7be8 100644 --- a/RecDP/pyrecdp/primitives/tabutils/feature_normal_transform.py +++ b/RecDP/pyrecdp/primitives/tabutils/feature_normal_transform.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import numpy as np from scipy.stats import yeojohnson from scipy.stats import kstest diff --git a/RecDP/pyrecdp/primitives/tabutils/segmentation.py b/RecDP/pyrecdp/primitives/tabutils/segmentation.py index ee5004850..cbe292d95 100644 --- a/RecDP/pyrecdp/primitives/tabutils/segmentation.py +++ b/RecDP/pyrecdp/primitives/tabutils/segmentation.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from collections import defaultdict import pandas as pd import numpy as np diff --git a/RecDP/pyrecdp/primitives/tabutils/utils.py b/RecDP/pyrecdp/primitives/tabutils/utils.py index cc569b117..0124e018c 100644 --- a/RecDP/pyrecdp/primitives/tabutils/utils.py +++ b/RecDP/pyrecdp/primitives/tabutils/utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import pandas as pd import numpy as np import numba diff --git a/RecDP/pyrecdp/widgets/BaseWidget.py b/RecDP/pyrecdp/widgets/BaseWidget.py index 1f563758b..8bff817cb 100644 --- a/RecDP/pyrecdp/widgets/BaseWidget.py +++ b/RecDP/pyrecdp/widgets/BaseWidget.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import ipywidgets as widgets from IPython.display import display, clear_output diff --git a/RecDP/pyrecdp/widgets/TabWidget.py b/RecDP/pyrecdp/widgets/TabWidget.py index 020751fb5..9b72f2a4c 100644 --- a/RecDP/pyrecdp/widgets/TabWidget.py +++ b/RecDP/pyrecdp/widgets/TabWidget.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import ipywidgets as widgets from IPython.display import display diff --git a/RecDP/pyrecdp/widgets/TableViewWidget.py b/RecDP/pyrecdp/widgets/TableViewWidget.py index 9c6021cd2..089940e86 100644 --- a/RecDP/pyrecdp/widgets/TableViewWidget.py +++ b/RecDP/pyrecdp/widgets/TableViewWidget.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from pyrecdp.widgets import BaseWidget class TableViewWidget(BaseWidget): diff --git a/RecDP/setup.py b/RecDP/setup.py index 5e274bb2a..16ddc2150 100644 --- a/RecDP/setup.py +++ b/RecDP/setup.py @@ -59,34 +59,36 @@ def get_packages(self): setup_spec = SetupSpec() -setuptools.setup( - name="pyrecdp", - version=setup_spec.version, - author="INTEL BDF AIOK", - author_email="bdf.aiok@intel.com", - description= - "A data processing bundle for spark based recommender system operations", - long_description=long_description, - long_description_content_type="text/markdown", - url = "https://github.com/intel/e2eAIOK/", - project_urls={ - "Bug Tracker": "https://github.com/intel/e2eAIOK/", - }, - keywords=( - "pyrecdp recdp distributed parallel auto-feature-engineering autofe LLM python" - ), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - include_package_data=True, - package_dir={}, - packages=setup_spec.get_packages(), - package_data={"": ["*.jar"], "pyrecdp": ["version"]}, - python_requires=">=3.6", - #cmdclass={'install': post_install}, - zip_safe=False, - install_requires=setup_spec.install_requires, - extras_require=setup_spec.extras, -) +name_list = ["pyrecdp", "e2eAIOK-pyrecdp"] +for name in name_list: + setuptools.setup( + name=name, + version=setup_spec.version, + author="INTEL BDF AIOK", + author_email="bdf.aiok@intel.com", + description= + "A data processing bundle for spark based recommender system operations", + long_description=long_description, + long_description_content_type="text/markdown", + url = "https://github.com/intel/e2eAIOK/", + project_urls={ + "Bug Tracker": "https://github.com/intel/e2eAIOK/", + }, + keywords=( + "pyrecdp recdp distributed parallel auto-feature-engineering autofe LLM python" + ), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], + include_package_data=True, + package_dir={}, + packages=setup_spec.get_packages(), + package_data={"": ["*.jar"], "pyrecdp": ["version"]}, + python_requires=">=3.6", + #cmdclass={'install': post_install}, + zip_safe=False, + install_requires=setup_spec.install_requires, + extras_require=setup_spec.extras, + ) diff --git a/e2eAIOK/deltatuner/deltatuner/deltatuner.py b/e2eAIOK/deltatuner/deltatuner/deltatuner.py index c99163465..b31720c2c 100644 --- a/e2eAIOK/deltatuner/deltatuner/deltatuner.py +++ b/e2eAIOK/deltatuner/deltatuner/deltatuner.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import logging from peft import LoraConfig, PeftModel, LoraModel from .tuner import SSFConfig diff --git a/e2eAIOK/deltatuner/deltatuner/deltatuner_args.py b/e2eAIOK/deltatuner/deltatuner/deltatuner_args.py index d7f007c1a..0aabdb84b 100644 --- a/e2eAIOK/deltatuner/deltatuner/deltatuner_args.py +++ b/e2eAIOK/deltatuner/deltatuner/deltatuner_args.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from typing import List from dataclasses import dataclass, field from transformers import AutoTokenizer diff --git a/e2eAIOK/deltatuner/deltatuner/deltatuner_model.py b/e2eAIOK/deltatuner/deltatuner/deltatuner_model.py index 596bfa9fb..da81b40da 100644 --- a/e2eAIOK/deltatuner/deltatuner/deltatuner_model.py +++ b/e2eAIOK/deltatuner/deltatuner/deltatuner_model.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os import torch import json diff --git a/e2eAIOK/deltatuner/deltatuner/mapping.py b/e2eAIOK/deltatuner/deltatuner/mapping.py index 18f7a5de3..db1d9f23e 100644 --- a/e2eAIOK/deltatuner/deltatuner/mapping.py +++ b/e2eAIOK/deltatuner/deltatuner/mapping.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .tuner import SSFConfig from typing import TYPE_CHECKING, Any, Dict from .deltatuner_model import DeltaTunerModel, DelatunerModelForCausalLM diff --git a/e2eAIOK/deltatuner/deltatuner/scores/compute_de_score.py b/e2eAIOK/deltatuner/deltatuner/scores/compute_de_score.py index 4985a8ad0..e532126c8 100644 --- a/e2eAIOK/deltatuner/deltatuner/scores/compute_de_score.py +++ b/e2eAIOK/deltatuner/deltatuner/scores/compute_de_score.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os, sys, time, logging diff --git a/e2eAIOK/deltatuner/deltatuner/scores/transformer_proxy.py b/e2eAIOK/deltatuner/deltatuner/scores/transformer_proxy.py index 6e9cae148..3aa0758ee 100644 --- a/e2eAIOK/deltatuner/deltatuner/scores/transformer_proxy.py +++ b/e2eAIOK/deltatuner/deltatuner/scores/transformer_proxy.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os, sys, time, math import torch from torch import nn diff --git a/e2eAIOK/deltatuner/deltatuner/scores/transformer_proxy_ssf.py b/e2eAIOK/deltatuner/deltatuner/scores/transformer_proxy_ssf.py index e7a0a580a..bfbd90e60 100644 --- a/e2eAIOK/deltatuner/deltatuner/scores/transformer_proxy_ssf.py +++ b/e2eAIOK/deltatuner/deltatuner/scores/transformer_proxy_ssf.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os, sys, time, math import torch from torch import nn diff --git a/e2eAIOK/deltatuner/deltatuner/scores/utils.py b/e2eAIOK/deltatuner/deltatuner/scores/utils.py index ef935aa45..445ee36af 100644 --- a/e2eAIOK/deltatuner/deltatuner/scores/utils.py +++ b/e2eAIOK/deltatuner/deltatuner/scores/utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import torch from transformers import pytorch_utils from peft.tuners.lora import Linear, Conv1D, LoraLayer diff --git a/e2eAIOK/deltatuner/deltatuner/search/BaseSearchEngine.py b/e2eAIOK/deltatuner/deltatuner/search/BaseSearchEngine.py index 718a7e79c..8d72abcb9 100644 --- a/e2eAIOK/deltatuner/deltatuner/search/BaseSearchEngine.py +++ b/e2eAIOK/deltatuner/deltatuner/search/BaseSearchEngine.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import logging import gc import json diff --git a/e2eAIOK/deltatuner/deltatuner/search/EvolutionarySearchEngine.py b/e2eAIOK/deltatuner/deltatuner/search/EvolutionarySearchEngine.py index 67bac46a0..d125ff6f7 100644 --- a/e2eAIOK/deltatuner/deltatuner/search/EvolutionarySearchEngine.py +++ b/e2eAIOK/deltatuner/deltatuner/search/EvolutionarySearchEngine.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os import sys import traceback diff --git a/e2eAIOK/deltatuner/deltatuner/search/SearchEngineFactory.py b/e2eAIOK/deltatuner/deltatuner/search/SearchEngineFactory.py index 6725685d1..a63cfd909 100644 --- a/e2eAIOK/deltatuner/deltatuner/search/SearchEngineFactory.py +++ b/e2eAIOK/deltatuner/deltatuner/search/SearchEngineFactory.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + from .EvolutionarySearchEngine import EvolutionarySearchEngine SEARCHER_TYPES = { diff --git a/e2eAIOK/deltatuner/deltatuner/search/utils.py b/e2eAIOK/deltatuner/deltatuner/search/utils.py index b25ee56de..999bf5bd9 100644 --- a/e2eAIOK/deltatuner/deltatuner/search/utils.py +++ b/e2eAIOK/deltatuner/deltatuner/search/utils.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import sys import os import json diff --git a/e2eAIOK/deltatuner/deltatuner/tuner/deltatuner_lora.py b/e2eAIOK/deltatuner/deltatuner/tuner/deltatuner_lora.py index 82ca8acb3..98d4169eb 100644 --- a/e2eAIOK/deltatuner/deltatuner/tuner/deltatuner_lora.py +++ b/e2eAIOK/deltatuner/deltatuner/tuner/deltatuner_lora.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import torch from peft import LoraModel from peft.import_utils import is_bnb_available, is_bnb_4bit_available diff --git a/e2eAIOK/deltatuner/deltatuner/tuner/deltatuner_ssf.py b/e2eAIOK/deltatuner/deltatuner/tuner/deltatuner_ssf.py index 2a8716695..39785a6eb 100644 --- a/e2eAIOK/deltatuner/deltatuner/tuner/deltatuner_ssf.py +++ b/e2eAIOK/deltatuner/deltatuner/tuner/deltatuner_ssf.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import os import math import re diff --git a/e2eAIOK/deltatuner/deltatuner/utils/config.py b/e2eAIOK/deltatuner/deltatuner/utils/config.py index f3657b42b..73e29f62e 100644 --- a/e2eAIOK/deltatuner/deltatuner/utils/config.py +++ b/e2eAIOK/deltatuner/deltatuner/utils/config.py @@ -1,3 +1,19 @@ +""" + Copyright 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import enum from dataclasses import field from typing import List, Optional, Tuple, Union From ae95394338424717c965685d5a8ff525d2a7da95 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 21 Dec 2023 18:12:40 -0600 Subject: [PATCH 2/2] Update setup.py --- RecDP/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecDP/setup.py b/RecDP/setup.py index 16ddc2150..d61f01cb4 100644 --- a/RecDP/setup.py +++ b/RecDP/setup.py @@ -59,7 +59,7 @@ def get_packages(self): setup_spec = SetupSpec() -name_list = ["pyrecdp", "e2eAIOK-pyrecdp"] +name_list = ["pyrecdp", "e2eAIOK-recdp"] for name in name_list: setuptools.setup( name=name,