From faef72958028775a03a1c292bf90764dda8431ea Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 10 Jan 2024 15:25:48 +0000 Subject: [PATCH 01/32] Updated data reading procedure --- radarpipeline/common/utils.py | 17 ++++++ radarpipeline/io/reader.py | 99 ++++++++++++++++++++++------------- 2 files changed, 80 insertions(+), 36 deletions(-) diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index afd8f75..d109073 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -250,3 +250,20 @@ def get_write_file_attr(feature_name, output_dir, data_format, compression): raise ValueError(f"Invalid data format {data_format} specified \ for spark writer") return file_path + + +def get_hash(array : List) -> int: + """ + Returns the hash of the array + + Parameters + ---------- + array : list + List of values + + Returns + ------- + str + Hash of the array + """ + return hash(tuple(array)) diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index 5defb82..bbde6ee 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -2,17 +2,19 @@ import logging import os from glob import glob +import gzip import re from typing import Any, Dict, List, Optional, Union import pyspark.sql as ps -from pyspark.sql import SparkSession +from pyspark.sql import SparkSession, DataFrame from pyspark.sql.types import StructField, StructType from pyspark.sql.utils import IllegalArgumentException from radarpipeline.common import constants from radarpipeline.datalib import RadarData, RadarUserData, RadarVariableData from radarpipeline.io.abc import DataReader, SchemaReader +from radarpipeline.common.utils import get_hash import avro from avro.datafile import DataFileReader, DataFileWriter @@ -24,6 +26,7 @@ from datetime import datetime from collections import Counter +from functools import reduce, partial logger = logging.getLogger(__name__) @@ -33,27 +36,29 @@ class Schemas(object): def __init__(self, original_schema, original_schema_keys): self.original_schema = original_schema self.original_schema_hash = self._get_schema_hash(original_schema_keys) - self.counterdict = Counter({self.original_schema_hash: 1}) self.hashdict = {self.original_schema_hash: original_schema} def _get_schema_hash(self, schema_keys): - return hash(frozenset(schema_keys)) + return get_hash(schema_keys) def is_original_schema(self, schema_keys): return self._get_schema_hash(schema_keys) == self.original_schema_hash - def get_schema(self): - most_freq_schema_hash = self.counterdict.most_common(1)[0][0] - return self.hashdict[most_freq_schema_hash] + def is_schema_present(self, schema_keys): + return self._get_schema_hash(schema_keys) in self.hashdict + + def is_schema_hash_present(self, schema_hash): + return schema_hash in self.hashdict + + def get_schema(self, schema_keys): + return self.hashdict[self._get_schema_hash(schema_keys)] + + def get_schema_by_hash(self, schema_hash): + return self.hashdict[schema_hash] def add_schema(self, schema_keys, schema): schema_hash = self._get_schema_hash(schema_keys) - if schema_hash not in self.hashdict: - self.hashdict[schema_hash] = schema - self.counterdict[schema_hash] += 1 - - def update_schema_counter(self, schema_keys): - self.counterdict[self._get_schema_hash(schema_keys)] += 1 + self.hashdict[schema_hash] = schema class SparkCSVDataReader(DataReader): @@ -88,6 +93,7 @@ def __init__(self, config: Dict, required_data: List[str], df_type: str = "panda if spark_config is not None: self.spark_config.update(spark_config) 
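+        # Note: the unionByName partial added below is what later lets frames read
+        # from files with differing header sets be merged via reduce();
+        # allowMissingColumns=True fills columns absent on one side with nulls.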
self.spark = self._initialize_spark_session() + self.unionByName = partial(DataFrame.unionByName, allowMissingColumns=True) def _initialize_spark_session(self) -> ps.SparkSession: """ @@ -220,15 +226,48 @@ def _read_variable_data_files( RadarVariableData A RadarVariableData object containing all the read data """ + """ + New approach: If schema is present, use it to lazily read the data without enforcing schema + Check if schema is present in the schema dict by matching schema keys + If it is present read it using the schema + Else infer schema and add it to the schema directory + """ if schema: - df = self.spark.read.load( - data_files, - format="csv", - header=True, - schema=schema.get_schema(), - enforceSchema="false", - encoding=constants.ENCODING, - ) + file_dict = {} + for file in data_files: + with gzip.open(file, 'rb') as f: + columns = f.readline().decode("utf-8").split(",") + f.close() + column_hash = get_hash(columns) + if column_hash in file_dict: + file_dict[column_hash].append(file) + else: + file_dict[column_hash] = [file] + dfs = [] + for column_hash in file_dict.keys(): + if schema.is_schema_hash_present(column_hash): + df = self.spark.read.load( + file_dict[column_hash], + format="csv", + header=True, + schema=schema.get_schema_by_hash(column_hash), + enforceSchema="false", + encoding=constants.ENCODING, + ) + dfs.append(df) + else: + df = self.spark.read.load( + file_dict[column_hash], + format="csv", + header=True, + inferSchema="true", + encoding=constants.ENCODING, + ) + inferred_schema = df.schema + schema.add_schema(df.columns, inferred_schema) + dfs.append(df) + # Spark Join all the dfs + df = reduce(self.unionByName, dfs) else: df = self.spark.read.load( data_files, @@ -241,21 +280,9 @@ def _read_variable_data_files( if self.df_type == "pandas": try: df = df.toPandas() - schema.update_schema_counter(df.columns) except Exception: logger.warning("Failed to convert to pandas dataframe. 
" "inferring schema") - df = self.spark.read.load( - data_files, - format="csv", - header=True, - inferSchema="true", - encoding=constants.ENCODING, - ) - inferred_schema = df.schema - schema.add_schema(df.columns, inferred_schema) - df = df.toPandas() - variable_data = RadarVariableData(df, self.df_type) return variable_data @@ -387,17 +414,17 @@ def _get_schema(self, schema_dir, schema_dir_base) -> StructType: schema_file = os.path.join( schema_dir, f"schema-{schema_dir_base}.json" ) - schema_dict = json.load( + schema_content = json.load( open( schema_file, "r", encoding=constants.ENCODING, ) ) - avro_schema = avro.schema.parse(json.dumps(schema_dict)) - schema_dict = self._recursive_schema_loader(avro_schema) + avro_schema = avro.schema.parse(json.dumps(schema_content)) + schema_content_dict = self._recursive_schema_loader(avro_schema) - schema, schema_keys = self._to_structtype(schema_dict) + schema, schema_keys = self._to_structtype(schema_content_dict) return schema, schema_keys def _add_new_schema(self, schema_dir_base, schema): From 5ae2baefeeda7104ea6ac7f779d48f95be63d5ce Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Thu, 11 Jan 2024 13:23:05 +0000 Subject: [PATCH 02/32] refactoring --- radarpipeline/io/reader.py | 51 ++++++++++++++++++++++---------------- requirements.txt | 2 +- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index bbde6ee..0d65e13 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -50,7 +50,9 @@ def is_schema_present(self, schema_keys): def is_schema_hash_present(self, schema_hash): return schema_hash in self.hashdict - def get_schema(self, schema_keys): + def get_schema(self, schema_keys=None): + if schema_keys is None: + return self.original_schema return self.hashdict[self._get_schema_hash(schema_keys)] def get_schema_by_hash(self, schema_hash): @@ -205,6 +207,19 @@ def read_data(self) -> RadarData: source_path_item, user_data_dict) return radar_data + def _filter_files_by_headers(self, data_files): + file_dict = {} + for file in data_files: + with gzip.open(file, 'rb') as f: + columns = f.readline().decode("utf-8").split(",") + f.close() + column_hash = get_hash(columns) + if column_hash in file_dict: + file_dict[column_hash].append(file) + else: + file_dict[column_hash] = [file] + return file_dict + def _read_variable_data_files( self, data_files: List[str], @@ -232,18 +247,9 @@ def _read_variable_data_files( If it is present read it using the schema Else infer schema and add it to the schema directory """ + dfs = [] + file_dict = self._filter_files_by_headers(data_files) if schema: - file_dict = {} - for file in data_files: - with gzip.open(file, 'rb') as f: - columns = f.readline().decode("utf-8").split(",") - f.close() - column_hash = get_hash(columns) - if column_hash in file_dict: - file_dict[column_hash].append(file) - else: - file_dict[column_hash] = [file] - dfs = [] for column_hash in file_dict.keys(): if schema.is_schema_hash_present(column_hash): df = self.spark.read.load( @@ -266,16 +272,19 @@ def _read_variable_data_files( inferred_schema = df.schema schema.add_schema(df.columns, inferred_schema) dfs.append(df) - # Spark Join all the dfs - df = reduce(self.unionByName, dfs) else: - df = self.spark.read.load( - data_files, - format="csv", - header=True, - inferSchema="true", - encoding=constants.ENCODING, - ) + for column_hash in file_dict: + df = self.spark.read.load( + file_dict[column_hash], + format="csv", + header=True, + 
inferSchema="true", + encoding=constants.ENCODING, + ) + dfs.append(df) + + # Spark Join all the dfs + df = reduce(self.unionByName, dfs) if self.df_type == "pandas": try: diff --git a/requirements.txt b/requirements.txt index 2d5b859..f4b20d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ pyspark[sql]==3.3.0 GitPython>=3.1.32 strictyaml==1.7.3 pyspark-test==0.2.0 -paramiko==3.1.0 +paramiko==3.4.0 pre-commit pytest pytest-cov From fba2d7552e0cb5c447f33128c10f4fbd71a73643 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Mon, 15 Jan 2024 14:33:06 +0000 Subject: [PATCH 03/32] refactor to make all modules more spark dependent --- radarpipeline/datalib/radar_data.py | 7 ++- radarpipeline/datalib/radar_variable_data.py | 47 ++++++-------------- radarpipeline/io/reader.py | 19 +++----- 3 files changed, 22 insertions(+), 51 deletions(-) diff --git a/radarpipeline/datalib/radar_data.py b/radarpipeline/datalib/radar_data.py index b628d3d..d60bf01 100644 --- a/radarpipeline/datalib/radar_data.py +++ b/radarpipeline/datalib/radar_data.py @@ -94,10 +94,9 @@ def get_combined_data_by_variable( # Combine the all data for each variable for var in variable_dict: if len(variable_dict[var]) > 0: - if self.df_type == "spark": - combined_df = utils.combine_pyspark_dfs(variable_dict[var]) - else: - combined_df = pd.concat(variable_dict[var], ignore_index=True) + combined_df = utils.combine_pyspark_dfs(variable_dict[var]) + if self.df_type == "pandas": + combined_df = combined_df.toPandas() variable_data_list.append(combined_df) if is_only_one_var: diff --git a/radarpipeline/datalib/radar_variable_data.py b/radarpipeline/datalib/radar_variable_data.py index 0cdb032..8f5c59e 100644 --- a/radarpipeline/datalib/radar_variable_data.py +++ b/radarpipeline/datalib/radar_variable_data.py @@ -2,6 +2,7 @@ import logging import pandas as pd import pyspark.sql.functions as f +from pyspark.sql.types import TimestampType from radarpipeline.datalib.abc import Data from radarpipeline.datatypes import DataType @@ -20,6 +21,7 @@ def __init__(self, data: DataType, df_type: str = "pandas") -> None: self._data = data self.df_type = df_type self._preprocess_data() + print(self._data) def get_data(self) -> DataType: return self._data @@ -31,42 +33,19 @@ def get_data_keys(self) -> List[str]: return list(self._data.columns) def get_data_size(self) -> int: - if self.df_type == "pandas": - return len(self._data.index) - else: - return int(self._data.count()) + return int(self._data.count()) def _preprocess_data(self) -> None: """ Converts all time value columns to datetime format """ - - if self.df_type == "spark": - if "value.time" in self.get_data_keys(): - self._data = self._data.withColumn( - "value.time", f.to_date(self._data["`value.time`"]) - ) - if "value.timeReceived" in self.get_data_keys(): - self._data = self._data.withColumn( - "value.timeReceived", f.to_date(self._data["`value.timeReceived`"]) - ) - if "value.dateTime" in self.get_data_keys(): - self._data = self._data.withColumn( - "value.dateTime", f.to_date(self._data["`value.dateTime`"]) - ) - else: - try: - if "value.time" in self.get_data_keys(): - self._data["value.time"] = pd.to_datetime( - self._data["value.time"].astype(str), unit="s" - ) - if "value.timeReceived" in self.get_data_keys(): - self._data["value.timeReceived"] = pd.to_datetime( - self._data["value.timeReceived"].astype(str), unit="s" - ) - if "value.dateTime" in self.get_data_keys(): - self._data["value.dateTime"] = pd.to_datetime( - self._data["value.dateTime"].astype(str), 
unit="s" - ) - except ValueError: - logger.warning("Unable to convert time columns to datetime format") + try: + time_cols = ["value.time", "value.timeReceived", "value.dateTime"] + for i, col in enumerate(time_cols): + if col in self._data.columns: + self._data = self._data.withColumn(col, self._data[f"`{col}`"] + .cast(TimestampType())) + self._data.withColumn(col, f.from_unixtime( + f.unix_timestamp(f"`{col}`"))) + except ValueError: + logger.warning("Unable to convert time columns to datetime format") diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index 0d65e13..d85b359 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -22,7 +22,6 @@ from avro.schema import RecordSchema, Field, PrimitiveSchema, UnionSchema, Schema from multiprocessing import Pool -from functools import partial from datetime import datetime from collections import Counter @@ -241,12 +240,6 @@ def _read_variable_data_files( RadarVariableData A RadarVariableData object containing all the read data """ - """ - New approach: If schema is present, use it to lazily read the data without enforcing schema - Check if schema is present in the schema dict by matching schema keys - If it is present read it using the schema - Else infer schema and add it to the schema directory - """ dfs = [] file_dict = self._filter_files_by_headers(data_files) if schema: @@ -286,12 +279,12 @@ def _read_variable_data_files( # Spark Join all the dfs df = reduce(self.unionByName, dfs) - if self.df_type == "pandas": - try: - df = df.toPandas() - except Exception: - logger.warning("Failed to convert to pandas dataframe. " - "inferring schema") + #if self.df_type == "pandas": + # try: + # df = df.toPandas() + # except Exception: + # logger.warning("Failed to convert to pandas dataframe. 
" + # "inferring schema") variable_data = RadarVariableData(df, self.df_type) return variable_data From 6f217912eea6782a78688cbdba4c0f6151a1503b Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Tue, 16 Jan 2024 10:50:17 +0000 Subject: [PATCH 04/32] Begin refactoring pipeline --- radarpipeline/datalib/radar_variable_data.py | 1 - radarpipeline/features/feature_group.py | 2 +- radarpipeline/io/downloader.py | 7 +- radarpipeline/io/reader.py | 14 +-- radarpipeline/project/project.py | 13 ++- radarpipeline/project/sparkengine.py | 97 ++++++++++++++++++++ radarpipeline/radarpipeline.py | 2 + tests/tests_common/test_utils.py | 2 - tests/tests_project/test_project.py | 2 - 9 files changed, 114 insertions(+), 26 deletions(-) create mode 100644 radarpipeline/project/sparkengine.py diff --git a/radarpipeline/datalib/radar_variable_data.py b/radarpipeline/datalib/radar_variable_data.py index 8f5c59e..c32cfcf 100644 --- a/radarpipeline/datalib/radar_variable_data.py +++ b/radarpipeline/datalib/radar_variable_data.py @@ -21,7 +21,6 @@ def __init__(self, data: DataType, df_type: str = "pandas") -> None: self._data = data self.df_type = df_type self._preprocess_data() - print(self._data) def get_data(self) -> DataType: return self._data diff --git a/radarpipeline/features/feature_group.py b/radarpipeline/features/feature_group.py index 60d2f3e..551a84e 100644 --- a/radarpipeline/features/feature_group.py +++ b/radarpipeline/features/feature_group.py @@ -65,7 +65,7 @@ def get_all_features(self, data: RadarData) -> Tuple[List[str], List[DataType]]: feature_values = [] preprocessed_data = self.preprocess(data) for feature in self.features: - print(feature.name) + logger.info(f"Computing feature {feature.name}") feature_names.append(feature.name) preprocessed_feature = feature.preprocess(preprocessed_data) feature_values.append(feature.calculate(preprocessed_feature)) diff --git a/radarpipeline/io/downloader.py b/radarpipeline/io/downloader.py index e128320..6b8b220 100644 --- a/radarpipeline/io/downloader.py +++ b/radarpipeline/io/downloader.py @@ -101,13 +101,14 @@ def _fetch_data(self, root_path, sftp_source_path, included_var_cat, uid): src_file), preserve_mtime=True) except FileNotFoundError: - print("Folder not found: " + dir_path + "/" + src_file) + logger.warning("Folder not found: " + dir_path + + "/" + src_file) continue except EOFError: - print("EOFError: " + dir_path + "/" + src_file) + logger.warning("EOFError: " + dir_path + "/" + src_file) continue except FileNotFoundError: - print("Folder not found: " + uid) + logger.warning("Folder not found: " + uid) return sftp.close() diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index d85b359..f31e86d 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -67,8 +67,7 @@ class SparkCSVDataReader(DataReader): Read CSV data from local directory using pySpark """ - def __init__(self, config: Dict, required_data: List[str], df_type: str = "pandas", - spark_config: Dict = {}): + def __init__(self, spark_session: ps.SparkSession, config: Dict, required_data: List[str], df_type: str = "pandas"): super().__init__(config) self.source_formats = { # RADAR_OLD: uid/variable/yyyymmdd_hh00.csv.gz @@ -91,9 +90,7 @@ def __init__(self, config: Dict, required_data: List[str], df_type: str = "panda self.source_path = self.config['config'].get("source_path", "") self.spark_config = default_spark_config self.schema_reader = AvroSchemaReader() - if spark_config is not None: - self.spark_config.update(spark_config) - self.spark = 
self._initialize_spark_session() + self.spark = spark_session self.unionByName = partial(DataFrame.unionByName, allowMissingColumns=True) def _initialize_spark_session(self) -> ps.SparkSession: @@ -278,13 +275,6 @@ def _read_variable_data_files( # Spark Join all the dfs df = reduce(self.unionByName, dfs) - - #if self.df_type == "pandas": - # try: - # df = df.toPandas() - # except Exception: - # logger.warning("Failed to convert to pandas dataframe. " - # "inferring schema") variable_data = RadarVariableData(df, self.df_type) return variable_data diff --git a/radarpipeline/project/project.py b/radarpipeline/project/project.py index d9dd666..b221079 100644 --- a/radarpipeline/project/project.py +++ b/radarpipeline/project/project.py @@ -14,6 +14,7 @@ from radarpipeline.io import PandasDataWriter, SparkCSVDataReader, SparkDataWriter from radarpipeline.io import SftpDataReader from radarpipeline.project.validations import ConfigValidator +from radarpipeline.project.sparkengine import SparkEngine from strictyaml import load, YAMLError logger = logging.getLogger(__name__) @@ -43,6 +44,12 @@ def __init__(self, input_data: Union[str, dict]) -> None: self.validator.validate() self.feature_groups = self._get_feature_groups() self.total_required_data = self._get_total_required_data() + ## Initialize spark session + self.spark_engine = SparkEngine(self.config['spark_config']) + self.spark_session = self.spark_engine.initialize_spark_session() + + def close_spark_session(self): + self.spark_engine.close_spark_session() def _resolve_input_data(self, input_data) -> str: """ @@ -224,13 +231,12 @@ def fetch_data(self) -> None: if self.config["input"]["data_type"] == "local": if self.config["input"]["data_format"] in self.valid_input_formats: sparkcsvdatareader = SparkCSVDataReader( + self.spark_session, self.config["input"], self.total_required_data, self.config["configurations"]["df_type"], - self.config['spark_config'] ) self.data = sparkcsvdatareader.read_data() - sparkcsvdatareader.close_spark_session() else: raise ValueError("Wrong data format") @@ -251,7 +257,6 @@ def fetch_data(self) -> None: spark_config=self.config['spark_config'] ) self.data = sparkcsvdatareader.read_data() - sparkcsvdatareader.close_spark_session() elif self.config["input"]["data_type"] == "sftp": sftp_data_reader = SftpDataReader(self.config["input"]["config"], @@ -271,7 +276,6 @@ def fetch_data(self) -> None: self.config['spark_config'] ).read_data() self.data = sparkcsvdatareader.read_data() - sparkcsvdatareader.close_spark_session() else: raise ValueError("Wrong data location") @@ -299,7 +303,6 @@ def export_data(self) -> None: """ df_type = self.config["configurations"]["df_type"] output_config = self.config["output"] - print(output_config) if output_config['output_location'] == "local": if df_type == "pandas": writer = PandasDataWriter( diff --git a/radarpipeline/project/sparkengine.py b/radarpipeline/project/sparkengine.py new file mode 100644 index 0000000..bb33c3c --- /dev/null +++ b/radarpipeline/project/sparkengine.py @@ -0,0 +1,97 @@ +import pyspark.sql as ps +from pyspark.sql import SparkSession, DataFrame +from pyspark.sql.types import StructField, StructType +from pyspark.sql.utils import IllegalArgumentException +from typing import Any, Dict, List, Optional, Union +import logging + +logger = logging.getLogger(__name__) + + +class SparkEngine(): + """ + Read CSV data from local directory using pySpark + """ + + def __init__(self, spark_config: Dict = {}): + default_spark_config = {'spark.executor.instances': 6, + 
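+                                # defaults mirror those previously set in the CSV reader;
+                                # any keys passed via spark_config override them in update() below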
'spark.driver.memory': '10G', + 'spark.executor.cores': 4, + 'spark.executor.memory': '10g', + 'spark.memory.offHeap.enabled': True, + 'spark.memory.offHeap.size': '20g', + 'spark.driver.maxResultSize': '0', + 'spark.log.level': "OFF"} + self.spark_config = default_spark_config + if spark_config is not None: + self.spark_config.update(spark_config) + + def initialize_spark_session(self) -> ps.SparkSession: + """ + Initializes and returns a SparkSession + + Returns + ------- + SparkSession + A SparkSession object + """ + + """ + Spark configuration documentation: + https://spark.apache.org/docs/latest/configuration.html + + `spark.executor.instances` is the number of executors to + launch for an application. + + `spark.executor.cores` is the number of cores to = + use on each executor. + + `spark.executor.memory` is the amount of memory to + use per executor process. + + `spark.driver.memory` is the amount of memory to use for the driver process, + i.e. where SparkContext is initialized, in MiB unless otherwise specified. + + `spark.memory.offHeap.enabled` is to enable off-heap memory allocation + + `spark.memory.offHeap.size` is the absolute amount of memory which can be used + for off-heap allocation, in bytes unless otherwise specified. + + `spark.driver.maxResultSize` is the limit of total size of serialized results of + all partitions for each Spark action (e.g. collect) in bytes. + Should be at least 1M, or 0 for unlimited. + """ + self.spark = ( + SparkSession.builder.master("local").appName("radarpipeline") + .config('spark.executor.instances', + self.spark_config['spark.executor.instances']) + .config('spark.executor.cores', + self.spark_config['spark.executor.cores']) + .config('spark.executor.memory', + self.spark_config['spark.executor.memory']) + .config('spark.driver.memory', + self.spark_config['spark.driver.memory']) + .config('spark.memory.offHeap.enabled', + self.spark_config['spark.memory.offHeap.enabled']) + .config('spark.memory.offHeap.size', + self.spark_config['spark.memory.offHeap.size']) + .config('spark.driver.maxResultSize', + self.spark_config['spark.driver.maxResultSize']) + .config('spark.log.level', + self.spark_config['spark.log.level']) + .getOrCreate() + ) + self.spark._jsc.setLogLevel(self.spark_config['spark.log.level']) + self.spark.sparkContext.setLogLevel("OFF") + # Enable Apache Arrow for optimizations in Spark to Pandas conversion + self.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") + # Fallback to use non-Arrow conversion in case of errors + self.spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", + "true") + # For further reading: + # https://spark.apache.org/docs/3.0.1/sql-pyspark-pandas-with-arrow.html + logger.info("Spark Session created") + return self.spark + + def close_spark_session(self): + self.spark.stop() diff --git a/radarpipeline/radarpipeline.py b/radarpipeline/radarpipeline.py index 8f1d580..c2465ad 100644 --- a/radarpipeline/radarpipeline.py +++ b/radarpipeline/radarpipeline.py @@ -26,6 +26,8 @@ def run(config_path: str = "config.yaml"): project.compute_features() logger.info("Exporting the features data...") project.export_data() + logger.info("Data exported successfully. 
Closing Spark Engine") + project.close_spark_session() logger.info("Pipeline run completed successfully") except KeyboardInterrupt: logger.info("Pipeline run interrupted by user") diff --git a/tests/tests_common/test_utils.py b/tests/tests_common/test_utils.py index d173e19..daf5b32 100644 --- a/tests/tests_common/test_utils.py +++ b/tests/tests_common/test_utils.py @@ -83,7 +83,6 @@ def setUp(self): def test_read_correct_yaml(self): config = read_yaml(self.TESTDATA_FILENAME) - print(config) expected_config = { 'project': { 'project_name': 'mock_project', @@ -114,7 +113,6 @@ def test_read_unavailable_yaml(self): def test_read_yaml_with_spark_config(self): config = read_yaml(self.TESTDATA_FILENAME_SPARK) - print(config) expected_config = { 'project': { 'project_name': 'mock_project', diff --git a/tests/tests_project/test_project.py b/tests/tests_project/test_project.py index 41edbc8..e471056 100644 --- a/tests/tests_project/test_project.py +++ b/tests/tests_project/test_project.py @@ -94,7 +94,5 @@ def test_get_config(self): project = Project(self.remotelink) project_config = project._get_config() schema = utils.get_yaml_schema() - print(project_config) - print(self.expected_config) expected_config_updated = as_document(self.expected_config, schema).data self.assertDictEqual(project_config, expected_config_updated) From a8b0582c5ee315d476ba5304d45329cfd4de4935 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Tue, 16 Jan 2024 11:46:45 +0000 Subject: [PATCH 05/32] refactor Spark Reader mechanism --- radarpipeline/io/__init__.py | 2 +- radarpipeline/io/reader.py | 168 +++++++++---------------------- radarpipeline/project/project.py | 42 ++++---- 3 files changed, 66 insertions(+), 146 deletions(-) diff --git a/radarpipeline/io/__init__.py b/radarpipeline/io/__init__.py index dade1ac..ed5178c 100644 --- a/radarpipeline/io/__init__.py +++ b/radarpipeline/io/__init__.py @@ -1,4 +1,4 @@ from radarpipeline.io.abc import DataReader, SchemaReader -from radarpipeline.io.reader import AvroSchemaReader, SparkCSVDataReader, Reader +from radarpipeline.io.reader import AvroSchemaReader, Reader from radarpipeline.io.downloader import SftpDataReader from radarpipeline.io.writer import * diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index f31e86d..b46d574 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -62,12 +62,55 @@ def add_schema(self, schema_keys, schema): self.hashdict[schema_hash] = schema +class Reader(): + ''' + Class for reading data from a file + Reader(data_type : str, data_path: str, variables: Union[str, List]) + reader = Reader(...) + reader.get_data(variables=Union[List, str]) + reader.get_user_data(user_id=..) + ''' + def __init__(self, spark_session: ps.SparkSession, + config: Dict, required_data: List[str], df_type: str = "pandas"): + """_summary_ + + Args: + spark_session (ps.SparkSession): _description_ + config (Dict): _description_ + required_data (List[str]): _description_ + df_type (str, optional): _description_. Defaults to "pandas". 
+ + Raises: + NotImplementedError: _description_ + """ + self.config = config + self.data_type = self.config["input"]["data_format"] + self.required_data = required_data + self.df_type = df_type + if self.data_type in ['csv', 'csv.gz']: + self.reader_class = SparkCSVDataReader(spark_session, config, + required_data, df_type) + else: + raise NotImplementedError("Only csv data type is supported for now") + + def read_data(self): + self.data = self.reader_class.read_data() + return self.data + + def get_data(self, variables: Union[List, str]) -> RadarData: + return self.data.get_combined_data_by_variable(variables) + + def get_user_data(self, user_id: str) -> RadarData: + return self.data.get_data_by_user_id(user_id) + + class SparkCSVDataReader(DataReader): """ Read CSV data from local directory using pySpark """ - def __init__(self, spark_session: ps.SparkSession, config: Dict, required_data: List[str], df_type: str = "pandas"): + def __init__(self, spark_session: ps.SparkSession, + config: Dict, required_data: List[str], df_type: str = "pandas"): super().__init__(config) self.source_formats = { # RADAR_OLD: uid/variable/yyyymmdd_hh00.csv.gz @@ -77,91 +120,13 @@ def __init__(self, spark_session: ps.SparkSession, config: Dict, required_data: "RADAR_NEW": re.compile(r"""^[\w-]+/([\w]+)/ [\d]+/([\d]+.csv.gz$|schema-\1.json$)""", re.X), } - default_spark_config = {'spark.executor.instances': 6, - 'spark.driver.memory': '10G', - 'spark.executor.cores': 4, - 'spark.executor.memory': '10g', - 'spark.memory.offHeap.enabled': True, - 'spark.memory.offHeap.size': '20g', - 'spark.driver.maxResultSize': '0', - 'spark.log.level': "OFF"} self.required_data = required_data self.df_type = df_type - self.source_path = self.config['config'].get("source_path", "") - self.spark_config = default_spark_config + self.source_path = self.config['input']['config'].get("source_path", "") self.schema_reader = AvroSchemaReader() self.spark = spark_session self.unionByName = partial(DataFrame.unionByName, allowMissingColumns=True) - def _initialize_spark_session(self) -> ps.SparkSession: - """ - Initializes and returns a SparkSession - - Returns - ------- - SparkSession - A SparkSession object - """ - - """ - Spark configuration documentation: - https://spark.apache.org/docs/latest/configuration.html - - `spark.executor.instances` is the number of executors to - launch for an application. - - `spark.executor.cores` is the number of cores to = - use on each executor. - - `spark.executor.memory` is the amount of memory to - use per executor process. - - `spark.driver.memory` is the amount of memory to use for the driver process, - i.e. where SparkContext is initialized, in MiB unless otherwise specified. - - `spark.memory.offHeap.enabled` is to enable off-heap memory allocation - - `spark.memory.offHeap.size` is the absolute amount of memory which can be used - for off-heap allocation, in bytes unless otherwise specified. - - `spark.driver.maxResultSize` is the limit of total size of serialized results of - all partitions for each Spark action (e.g. collect) in bytes. - Should be at least 1M, or 0 for unlimited. 
- """ - spark = ( - SparkSession.builder.master("local").appName("radarpipeline") - .config('spark.executor.instances', - self.spark_config['spark.executor.instances']) - .config('spark.executor.cores', - self.spark_config['spark.executor.cores']) - .config('spark.executor.memory', - self.spark_config['spark.executor.memory']) - .config('spark.driver.memory', - self.spark_config['spark.driver.memory']) - .config('spark.memory.offHeap.enabled', - self.spark_config['spark.memory.offHeap.enabled']) - .config('spark.memory.offHeap.size', - self.spark_config['spark.memory.offHeap.size']) - .config('spark.driver.maxResultSize', - self.spark_config['spark.driver.maxResultSize']) - .config('spark.log.level', - self.spark_config['spark.log.level']) - .getOrCreate() - ) - spark._jsc.setLogLevel(self.spark_config['spark.log.level']) - spark.sparkContext.setLogLevel("OFF") - # Enable Apache Arrow for optimizations in Spark to Pandas conversion - spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") - # Fallback to use non-Arrow conversion in case of errors - spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") - # For further reading: - # https://spark.apache.org/docs/3.0.1/sql-pyspark-pandas-with-arrow.html - logger.info("Spark Session created") - return spark - - def close_spark_session(self): - self.spark.stop() - def _get_source_type(self, source_path): """ Returns the source type of the data @@ -698,46 +663,3 @@ def _get_superior_spark_type(self, spark_data_type_list: List[Any]) -> Any: f"Conflicting types: {spark_data_type_list}. Returning String type." ) return constants.STRING_TYPE - - -class Reader(): - ''' - Class for reading data from a file - Reader(data_type : str, data_path: str, variables: Union[str, List]) - reader = Reader(...) - reader.get_data(variables=Union[List, str]) - reader.get_user_data(user_id=..) 
- ''' - def __init__(self, data_type: str, data_path: str, variables: Union[str, List]): - ''' - Parameters : data_type : str, data_path: str, variables: Union[str, List] - data_type : str - Type of data to be read - Only supports csv for now - data_path : str - Path to the data directory - variables : Union[str, List] - List of variables to be read - ''' - self.data_type = data_type - self.data_path = data_path - # check if variables is a str - # If so, convert it to a list - if isinstance(variables, str): - variables = [variables] - self.variables = variables - config_dict = {"local_directory": self.data_path} - # check if data_type is csv - if self.data_type == 'csv': - self.reader_class = SparkCSVDataReader(config_dict, self.variables) - else: - raise NotImplementedError("Only csv data type is supported for now") - - def read_data(self): - self.data = self.reader_class.read_data() - - def get_data(self, variables: Union[List, str]) -> RadarData: - return self.data.get_combined_data_by_variable(variables) - - def get_user_data(self, user_id: str) -> RadarData: - return self.data.get_data_by_user_id(user_id) diff --git a/radarpipeline/project/project.py b/radarpipeline/project/project.py index b221079..e89c11f 100644 --- a/radarpipeline/project/project.py +++ b/radarpipeline/project/project.py @@ -11,7 +11,7 @@ from radarpipeline.common import utils from radarpipeline.features import Feature, FeatureGroup -from radarpipeline.io import PandasDataWriter, SparkCSVDataReader, SparkDataWriter +from radarpipeline.io import PandasDataWriter, SparkDataWriter, Reader from radarpipeline.io import SftpDataReader from radarpipeline.project.validations import ConfigValidator from radarpipeline.project.sparkengine import SparkEngine @@ -44,7 +44,8 @@ def __init__(self, input_data: Union[str, dict]) -> None: self.validator.validate() self.feature_groups = self._get_feature_groups() self.total_required_data = self._get_total_required_data() - ## Initialize spark session + if "spark_config" not in self.config: + self.config["spark_config"] = {} self.spark_engine = SparkEngine(self.config['spark_config']) self.spark_session = self.spark_engine.initialize_spark_session() @@ -225,20 +226,15 @@ def fetch_data(self) -> None: """ Fetches the data from the data source """ - if 'spark_config' not in self.config: - self.config['spark_config'] = {} if self.config["input"]["data_type"] == "local": - if self.config["input"]["data_format"] in self.valid_input_formats: - sparkcsvdatareader = SparkCSVDataReader( - self.spark_session, - self.config["input"], - self.total_required_data, - self.config["configurations"]["df_type"], - ) - self.data = sparkcsvdatareader.read_data() - else: - raise ValueError("Wrong data format") + datareader= Reader( + self.spark_session, + self.config, + self.total_required_data, + self.config["configurations"]["df_type"], + ) + self.data = datareader.read_data() elif self.config["input"]["data_type"] == "mock": MOCK_URL = "https://github.com/RADAR-base-Analytics/mockdata" @@ -252,11 +248,13 @@ def fetch_data(self) -> None: "source_path": mock_data_directory } } - sparkcsvdatareader = SparkCSVDataReader( - mock_config_input, self.total_required_data, - spark_config=self.config['spark_config'] + datareader = Reader( + self.spark_session, + mock_config_input, + self.total_required_data, + self.config["configurations"]["df_type"], ) - self.data = sparkcsvdatareader.read_data() + self.data = datareader.read_data() elif self.config["input"]["data_type"] == "sftp": sftp_data_reader = 
SftpDataReader(self.config["input"]["config"], @@ -269,13 +267,13 @@ def fetch_data(self) -> None: "source_path": root_dir } } - sparkcsvdatareader = SparkCSVDataReader( + datareader = Reader( + self.spark_session, sftp_local_config, self.total_required_data, self.config["configurations"]["df_type"], - self.config['spark_config'] - ).read_data() - self.data = sparkcsvdatareader.read_data() + ) + self.data = datareader.read_data() else: raise ValueError("Wrong data location") From 26fc86c1c9c70793070eb6d2acb893b34694f894 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Tue, 16 Jan 2024 16:26:15 +0000 Subject: [PATCH 06/32] minor refactor in reader.py --- radarpipeline/io/__init__.py | 2 +- radarpipeline/io/reader.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/radarpipeline/io/__init__.py b/radarpipeline/io/__init__.py index ed5178c..58d8b7d 100644 --- a/radarpipeline/io/__init__.py +++ b/radarpipeline/io/__init__.py @@ -1,4 +1,4 @@ from radarpipeline.io.abc import DataReader, SchemaReader -from radarpipeline.io.reader import AvroSchemaReader, Reader +from radarpipeline.io.reader import AvroSchemaReader, Reader, SparkCSVDataReader from radarpipeline.io.downloader import SftpDataReader from radarpipeline.io.writer import * diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index b46d574..0fc45c4 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -20,14 +20,11 @@ from avro.datafile import DataFileReader, DataFileWriter from avro.io import DatumReader, DatumWriter from avro.schema import RecordSchema, Field, PrimitiveSchema, UnionSchema, Schema - -from multiprocessing import Pool from datetime import datetime from collections import Counter from functools import reduce, partial - logger = logging.getLogger(__name__) @@ -75,13 +72,10 @@ def __init__(self, spark_session: ps.SparkSession, """_summary_ Args: - spark_session (ps.SparkSession): _description_ - config (Dict): _description_ - required_data (List[str]): _description_ - df_type (str, optional): _description_. Defaults to "pandas". - - Raises: - NotImplementedError: _description_ + spark_session (ps.SparkSession): spark session instance + config (Dict): Configuration data from the config.yaml file + required_data (List[str]): List of required data + df_type (str, optional): Type of dataframe format. Defaults to "pandas". 
""" self.config = config self.data_type = self.config["input"]["data_format"] From 25dd2374671dea801ceb145a216d9015640f241e Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 17 Jan 2024 14:45:19 +0000 Subject: [PATCH 07/32] Added and updated tests + upgraded spark version to 3.5.0 --- radarpipeline/common/utils.py | 24 +++++++ radarpipeline/project/__init__.py | 1 + radarpipeline/project/project.py | 12 ++-- radarpipeline/project/sparkengine.py | 2 +- requirements.txt | 2 +- .../phone_battery_charging_duration.csv | 70 +++++++++--------- .../expected_output/step_count_per_day.csv | 4 +- tests/tests_datalib/test_radar_user_data.py | 13 ++-- .../tests_datalib/test_radar_variable_data.py | 27 ++++--- tests/tests_datalib/test_radardata.py | 18 +++-- tests/tests_io/test_reader.py | 72 ++++++------------- tests/tests_project/test_project.py | 1 + tests/tests_project/test_sparkengine.py | 61 ++++++++++++++++ 13 files changed, 193 insertions(+), 114 deletions(-) create mode 100644 tests/tests_project/test_sparkengine.py diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index d109073..6335eaf 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -15,6 +15,10 @@ import posixpath from radarpipeline.common import constants +import unittest +from radarpipeline.project.sparkengine import SparkEngine +import pyspark.sql.functions as f +from pyspark.sql.types import TimestampType def read_yaml(yaml_file_path: str) -> Dict[str, Any]: @@ -267,3 +271,23 @@ def get_hash(array : List) -> int: Hash of the array """ return hash(tuple(array)) + + +class PySparkTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.spark_engine = SparkEngine() + cls.spark = cls.spark_engine.initialize_spark_session() + + @classmethod + def tearDownClass(cls): + cls.spark_engine.close_spark_session() + + def preprocess_data(self, data): + time_cols = ["value.time", "value.timeReceived", "value.dateTime"] + for i, col in enumerate(time_cols): + if col in data.columns: + data = data.withColumn(col, data[f"`{col}`"].cast(TimestampType())) + data.withColumn(col, f.from_unixtime( + f.unix_timestamp(f"`{col}`"))) + return data diff --git a/radarpipeline/project/__init__.py b/radarpipeline/project/__init__.py index e6bb37c..9afa79d 100644 --- a/radarpipeline/project/__init__.py +++ b/radarpipeline/project/__init__.py @@ -1 +1,2 @@ from radarpipeline.project.project import Project +from radarpipeline.project.sparkengine import SparkEngine diff --git a/radarpipeline/project/project.py b/radarpipeline/project/project.py index e89c11f..45f6f8f 100644 --- a/radarpipeline/project/project.py +++ b/radarpipeline/project/project.py @@ -243,14 +243,14 @@ def fetch_data(self) -> None: if not os.path.exists(cache_dir): Repo.clone_from(MOCK_URL, cache_dir) mock_data_directory = os.path.join(cache_dir, "mockdata") - mock_config_input = { - "config": { - "source_path": mock_data_directory - } - } + mock_config = { + "input": { + "config": { + "source_path": mock_data_directory}, + "data_format": "csv"}} datareader = Reader( self.spark_session, - mock_config_input, + mock_config, self.total_required_data, self.config["configurations"]["df_type"], ) diff --git a/radarpipeline/project/sparkengine.py b/radarpipeline/project/sparkengine.py index bb33c3c..c3635bf 100644 --- a/radarpipeline/project/sparkengine.py +++ b/radarpipeline/project/sparkengine.py @@ -13,7 +13,7 @@ class SparkEngine(): Read CSV data from local directory using pySpark """ - def __init__(self, spark_config: 
Dict = {}): + def __init__(self, spark_config: Dict = None): default_spark_config = {'spark.executor.instances': 6, 'spark.driver.memory': '10G', 'spark.executor.cores': 4, diff --git a/requirements.txt b/requirements.txt index f4b20d5..6fd2442 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ pandas==1.4.1 matplotlib==3.5.1 seaborn==0.11.2 scipy==1.10.0 -pyspark[sql]==3.3.0 +pyspark[sql]==3.5.0 GitPython>=3.1.32 strictyaml==1.7.3 pyspark-test==0.2.0 diff --git a/tests/resources/expected_output/phone_battery_charging_duration.csv b/tests/resources/expected_output/phone_battery_charging_duration.csv index a13b994..566158e 100644 --- a/tests/resources/expected_output/phone_battery_charging_duration.csv +++ b/tests/resources/expected_output/phone_battery_charging_duration.csv @@ -1,37 +1,37 @@ key.userId,date,value.status,value.statusTime,value.statusTimeInSeconds -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,CHARGING,0 days 02:21:00.209999800,141.00349999666668 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-01,CHARGING,2 days 02:45:02.355999947,3045.0392666657835 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-03,CHARGING,0 days 20:42:59.004000187,1242.9834000031167 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-04,CHARGING,3 days 00:26:06.661000014,4346.1110166669005 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-07,CHARGING,0 days 01:11:28.029999971,71.46716666618335 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-08,CHARGING,0 days 04:17:37.694000007,257.62823333345 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-09,CHARGING,0 days 06:29:13.859999896,389.23099999826667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-10,CHARGING,0 days 01:10:49.706000089,70.82843333481668 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-23,CHARGING,0 days 00:02:31.164000034,2.519400000566667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,CHARGING,0 days 01:05:19.040999889,65.31734999815001 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,CHARGING,0 days 01:36:19.865000010,96.3310833335 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,CHARGING,0 days 02:03:21.999000072,123.36665000120001 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-13,CHARGING,0 days 00:40:04.375999927,40.07293333211667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-14,CHARGING,0 days 01:34:26.461999893,94.44103333155 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,CHARGING,0 days 00:14:06.765000104,14.112750001733334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-29,CHARGING,0 days 03:15:18.118000031,195.30196666718336 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-13,CHARGING,0 days 00:05:05.431999920,5.090533332000001 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-27,CHARGING,0 days 01:10:28.252999784,70.47088332973334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-30,CHARGING,0 days 00:02:42.644000053,2.710733334216667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-02-04,CHARGING,0 days 01:42:59.499999999,102.99166666665 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-04,CHARGING,0 days 02:12:53.666999817,132.89444999695002 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-05,CHARGING,146 days 21:23:05.388999938,211523.08981666566 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,CHARGING,0 days 00:42:19.325000048,42.32208333413334 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,CHARGING,0 days 02:12:54.634000063,132.91056666771667 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,CHARGING,0 days 01:11:10.625000237,71.17708333728335 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,CHARGING,0 days 02:10:00.172000170,130.0028666695 
-2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,CHARGING,0 days 03:38:00.754999876,218.01258333126668 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,CHARGING,0 days 03:11:40.501999856,191.67503333093333 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,CHARGING,0 days 01:45:50.284999848,105.8380833308 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,CHARGING,0 days 02:02:29.530000209,122.49216667015001 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,CHARGING,0 days 01:16:54.997999668,76.9166333278 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,CHARGING,0 days 02:16:37.223000050,136.62038333416666 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-08,CHARGING,0 days 20:06:12.864000082,1206.2144000013666 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,CHARGING,0 days 00:09:59.984000206,9.999733336766667 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,CHARGING,41 days 18:10:47.425999880,60130.79043333134 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,CHARGING,0 days 02:21:00.210000,141.0035 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-01,CHARGING,2 days 02:45:02.356000,3045.0392666666667 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-03,CHARGING,0 days 20:42:59.004000,1242.9834 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-04,CHARGING,3 days 00:26:06.661000,4346.111016666667 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-07,CHARGING,0 days 01:11:28.030000,71.46716666666667 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-08,CHARGING,0 days 04:17:37.694000,257.62823333333336 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-09,CHARGING,0 days 06:29:13.860000,389.231 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-10,CHARGING,0 days 01:10:49.706000,70.82843333333334 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-23,CHARGING,0 days 00:02:31.164000,2.5194 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,CHARGING,0 days 01:05:19.041000,65.31735 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,CHARGING,0 days 01:36:19.865000,96.33108333333334 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,CHARGING,0 days 02:03:21.999000,123.36665 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-13,CHARGING,0 days 00:40:04.376000,40.07293333333334 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-14,CHARGING,0 days 01:34:26.462000,94.44103333333334 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,CHARGING,0 days 00:14:06.765000,14.112750000000002 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-29,CHARGING,0 days 03:15:18.118000,195.3019666666667 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-13,CHARGING,0 days 00:05:05.432000,5.090533333333334 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-27,CHARGING,0 days 01:10:28.253000,70.47088333333335 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-30,CHARGING,0 days 00:02:42.644000,2.710733333333333 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-02-04,CHARGING,0 days 01:42:59.500000,102.99166666666666 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-04,CHARGING,0 days 02:12:53.667000,132.89445 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-05,CHARGING,146 days 22:23:05.389000,211583.08981666667 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,CHARGING,0 days 00:42:19.325000,42.32208333333334 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,CHARGING,0 days 02:12:54.634000,132.91056666666668 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,CHARGING,0 days 01:11:10.625000,71.17708333333333 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,CHARGING,0 days 02:10:00.172000,130.00286666666668 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,CHARGING,0 days 03:38:00.755000,218.01258333333334 
+2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,CHARGING,0 days 03:11:40.502000,191.67503333333335 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,CHARGING,0 days 01:45:50.285000,105.83808333333334 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,CHARGING,0 days 02:02:29.530000,122.49216666666668 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,CHARGING,0 days 01:16:54.998000,76.91663333333334 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,CHARGING,0 days 02:16:37.223000,136.62038333333334 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-08,CHARGING,0 days 20:06:12.864000,1206.2144 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,CHARGING,0 days 00:09:59.984000,9.999733333333333 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,CHARGING,41 days 18:10:47.426000,60130.79043333334 5c0e2ec7-6f85-4041-9669-7145075d1754,2019-01-14,CHARGING,0 days 00:00:00,0.0 diff --git a/tests/resources/expected_output/step_count_per_day.csv b/tests/resources/expected_output/step_count_per_day.csv index 0d22073..04eadf5 100644 --- a/tests/resources/expected_output/step_count_per_day.csv +++ b/tests/resources/expected_output/step_count_per_day.csv @@ -1,7 +1,7 @@ key.userId,date,value.steps 072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-25,5705 -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,15412 -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-27,3548 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,14474 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-27,4486 07a69f47-1923-4cfc-b89b-0eefad483f43,2018-09-14,9 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-09-27,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-01,2 diff --git a/tests/tests_datalib/test_radar_user_data.py b/tests/tests_datalib/test_radar_user_data.py index 666065b..85c74cc 100644 --- a/tests/tests_datalib/test_radar_user_data.py +++ b/tests/tests_datalib/test_radar_user_data.py @@ -1,16 +1,19 @@ from radarpipeline.datalib import RadarVariableData, RadarUserData +from radarpipeline.common.utils import PySparkTestCase import unittest import os import pandas as pd -from pandas.testing import assert_frame_equal +from pyspark.testing import assertDataFrameEqual -class TestRadarUserData(unittest.TestCase): +class TestRadarUserData(PySparkTestCase): def setUp(self): - PANDAS_MOCK_PATH = ("tests/resources/test_data/test_participant/" + MOCK_PATH = ("tests/resources/test_data/test_participant/" "android_phone_step_count/0000_11.csv.gz") - self.mock_pandas = pd.read_csv(PANDAS_MOCK_PATH) - self.radar_variable_data = RadarVariableData(self.mock_pandas) + self.mock_df = self.spark.read.csv(MOCK_PATH, + header=True, + inferSchema=True) + self.radar_variable_data = RadarVariableData(self.mock_df) self.radar_user_data = RadarUserData({"test_variable_data": self.radar_variable_data}) diff --git a/tests/tests_datalib/test_radar_variable_data.py b/tests/tests_datalib/test_radar_variable_data.py index c4c3b04..bf1de78 100644 --- a/tests/tests_datalib/test_radar_variable_data.py +++ b/tests/tests_datalib/test_radar_variable_data.py @@ -1,24 +1,33 @@ from radarpipeline.datalib import RadarVariableData +from radarpipeline.common.utils import PySparkTestCase import unittest import os import pandas as pd -from pandas.testing import assert_frame_equal +from pyspark.sql.types import TimestampType +import pyspark.sql.functions as f +from pyspark.testing import assertDataFrameEqual -class TestRadarVariableData(unittest.TestCase): +class TestRadarVariableData(PySparkTestCase): + def setUp(self): - PANDAS_MOCK_PATH = ("tests/resources/test_data/test_participant/" - 
"android_phone_step_count/0000_11.csv.gz") - self.mock_pandas = pd.read_csv(PANDAS_MOCK_PATH) - self.radar_variable_data = RadarVariableData(self.mock_pandas) + MOCK_PATH = ("tests/resources/test_data/test_participant/" + "android_phone_step_count/0000_11.csv.gz") + self.mock_df = self.spark.read.csv(MOCK_PATH, + header=True, + inferSchema=True) + self.mock_df = self.preprocess_data(self.mock_df) + self.radar_variable_data = RadarVariableData(self.mock_df) def test_get_data(self): - assert_frame_equal(self.radar_variable_data.get_data(), self.mock_pandas) + assertDataFrameEqual(self.radar_variable_data.get_data(), self.mock_df, + checkRowOrder=True) def test_get_data_keys(self): self.assertEqual(self.radar_variable_data.get_data_keys(), - list(self.mock_pandas.columns)) + list(self.mock_df.columns)) def test_get_data_sizes(self): self.assertEqual(self.radar_variable_data.get_data_size(), - len(self.mock_pandas.index)) + int(self.mock_df.count())) + diff --git a/tests/tests_datalib/test_radardata.py b/tests/tests_datalib/test_radardata.py index 3888595..ba18781 100644 --- a/tests/tests_datalib/test_radardata.py +++ b/tests/tests_datalib/test_radardata.py @@ -2,15 +2,21 @@ import unittest import os import pandas as pd +from radarpipeline.common.utils import PySparkTestCase +import pyspark.sql.functions as f +from pyspark.sql.types import TimestampType from pandas.testing import assert_frame_equal -class TestRadarData(unittest.TestCase): +class TestRadarData(PySparkTestCase): + def setUp(self): - PANDAS_MOCK_PATH = ("tests/resources/test_data/test_participant/" - "android_phone_step_count/0000_11.csv.gz") - self.mock_pandas = pd.read_csv(PANDAS_MOCK_PATH) - self.radar_variable_data = RadarVariableData(self.mock_pandas) + MOCK_PATH = ("tests/resources/test_data/test_participant/" + "android_phone_step_count/0000_11.csv.gz") + self.mock_df = self.spark.read.csv(MOCK_PATH, + header=True, + inferSchema=True) + self.radar_variable_data = RadarVariableData(self.mock_df) self.radar_user_data = RadarUserData({"test_variable_data": self.radar_variable_data}) self.radar_data = RadarData({"test_user_data": self.radar_user_data}) @@ -33,6 +39,8 @@ def test_get_all_user_ids(self): self.assertEqual(self.radar_data._get_all_user_ids(), ["test_user_data"]) def test_get_combined_data_by_variable(self): + self.mock_df = self.preprocess_data(self.mock_df) + self.mock_pandas = self.mock_df.toPandas() assert_frame_equal(self.radar_data.get_combined_data_by_variable( "test_variable_data"), self.mock_pandas) diff --git a/tests/tests_io/test_reader.py b/tests/tests_io/test_reader.py index 5084c12..266a7cb 100644 --- a/tests/tests_io/test_reader.py +++ b/tests/tests_io/test_reader.py @@ -9,68 +9,40 @@ from radarpipeline.common import constants from pandas.testing import assert_frame_equal from numpy.testing import assert_array_equal - +from radarpipeline.common.utils import PySparkTestCase from pyspark.sql.types import StructField, StructType -class TestSparkCSVDataReader(unittest.TestCase): +class TestSparkCSVDataReader(PySparkTestCase): def setUp(self): - mock_config = {"config": {"source_path": "tests/resources/test_data/"}} + mock_config = { + "input": { + "config": { + "source_path": "tests/resources/test_data/"}, + "data_format": "csv"}} data_list = ['android_phone_step_count'] - self.sparkcsvdatareader = SparkCSVDataReader(mock_config, + self.sparkcsvdatareader = SparkCSVDataReader(self.spark, mock_config, required_data=data_list) - PANDAS_MOCK_PATH = ("tests/resources/test_data/test_participant/" - 
"android_phone_step_count/0000_11.csv.gz") - self.mock_pandas = pd.read_csv(PANDAS_MOCK_PATH) - self.radar_variable_data = RadarVariableData(self.mock_pandas) + MOCK_PATH = ("tests/resources/test_data/test_participant/" + "android_phone_step_count/0000_11.csv.gz") + self.mock_df = self.spark.read.csv(MOCK_PATH, + header=True, + inferSchema=True) + self.radar_variable_data = RadarVariableData(self.mock_df) self.radar_user_data = RadarUserData({"android_phone_step_count": self.radar_variable_data}) self.radar_data = RadarData({"test_participant": self.radar_user_data}) def test_read_data(self): + mock_dataframe = self.preprocess_data(self.mock_df) spark_data = self.sparkcsvdatareader.read_data() - assert_array_equal(spark_data.get_data()["test_participant"].get_data() - ["android_phone_step_count"].get_data().values, - self.mock_pandas.values) - self.assertTrue(isinstance(spark_data, RadarData)) - self.assertTrue(isinstance(spark_data.get_data()["test_participant"], - RadarUserData)) - self.assertTrue(isinstance(spark_data.get_data()["test_participant"].get_data() - ["android_phone_step_count"], RadarVariableData)) - - def tearDown(self): - self.sparkcsvdatareader.close_spark_session() - - -class TestSparkCustomConfig(unittest.TestCase): - def setUp(self): - mock_config = {"config": {"source_path": "tests/resources/test_data/"}} - self.spark_config = { - "spark.executor.instances": "3", - "spark.memory.offHeap.enabled": False, - "spark.executor.cores": 2, - "spark.executor.memory": "5g", - "spark.driver.memory": "10g", - "spark.memory.offHeap.size": "10g", - "spark.driver.maxResultSize": "0", - "spark.log.level": "ERROR"} - data_list = ['android_phone_step_count'] - self.sparkcsvdatareader = SparkCSVDataReader(mock_config, - required_data=data_list, - spark_config=self.spark_config) - - def test_spark_config(self): - spark_config_output = dict(self.sparkcsvdatareader.spark.sparkContext. 
- getConf().getAll()) - for key, value in self.spark_config.items(): - self.assertEqual(spark_config_output[key], str(value)) - - def test_spark_config_dict(self): - spark_config_output = self.sparkcsvdatareader.spark_config - self.assertDictEqual(self.spark_config, spark_config_output) - - def tearDown(self): - self.sparkcsvdatareader.close_spark_session() + self.assertEqual(set(spark_data.get_data()["test_participant"].get_data() + ["android_phone_step_count"].get_data().collect()), + set(mock_dataframe.collect())) + self.assertIsInstance(spark_data, RadarData) + self.assertIsInstance(spark_data.get_data()["test_participant"], RadarUserData) + self.assertIsInstance(spark_data.get_data()["test_participant"].get_data() + ["android_phone_step_count"], RadarVariableData) class TestAvroSchemaReader(unittest.TestCase): diff --git a/tests/tests_project/test_project.py b/tests/tests_project/test_project.py index e471056..6e2ebf8 100644 --- a/tests/tests_project/test_project.py +++ b/tests/tests_project/test_project.py @@ -71,6 +71,7 @@ def test_get_total_required_data(self): self.assertListEqual(sorted(required_data_output), sorted(expected_data)) def tearDown(self) -> None: + self.project.close_spark_session() del self.project diff --git a/tests/tests_project/test_sparkengine.py b/tests/tests_project/test_sparkengine.py new file mode 100644 index 0000000..eef0465 --- /dev/null +++ b/tests/tests_project/test_sparkengine.py @@ -0,0 +1,61 @@ +import unittest +from radarpipeline.project import SparkEngine + + +class TestSparkDefaultConfig(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.default_spark_config = {'spark.executor.instances': 6, + 'spark.driver.memory': '10G', + 'spark.executor.cores': 4, + 'spark.executor.memory': '10g', + 'spark.memory.offHeap.enabled': "true", + 'spark.memory.offHeap.size': '20g', + 'spark.driver.maxResultSize': '0', + 'spark.log.level': "OFF"} + cls.spark_engine = SparkEngine() + cls.spark = cls.spark_engine.initialize_spark_session() + + def test_spark_config(self): + spark_config_output = dict(self.spark.sparkContext. + getConf().getAll()) + for key, value in self.default_spark_config.items(): + self.assertEqual(spark_config_output[key], str(value)) + + def test_spark_config_dict(self): + spark_config_output = self.default_spark_config + self.assertDictEqual(self.default_spark_config, spark_config_output) + + @classmethod + def tearDownClass(cls): + cls.spark_engine.close_spark_session() + + +class TestSparkCustomConfig(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.spark_config = { + "spark.executor.instances": "3", + "spark.memory.offHeap.enabled": "false", + "spark.executor.cores": 2, + "spark.executor.memory": "5g", + "spark.driver.memory": "10g", + "spark.memory.offHeap.size": "10g", + "spark.driver.maxResultSize": "0", + "spark.log.level": "ERROR"} + cls.spark_engine = SparkEngine(spark_config=cls.spark_config) + cls.spark = cls.spark_engine.initialize_spark_session() + + def test_spark_config(self): + spark_config_output = dict(self.spark.sparkContext. 
+ getConf().getAll()) + for key, value in self.spark_config.items(): + self.assertEqual(spark_config_output[key], str(value)) + + def test_spark_config_dict(self): + spark_config_output = self.spark_config + self.assertDictEqual(self.spark_config, spark_config_output) + + @classmethod + def tearDownClass(cls): + cls.spark_engine.close_spark_session() From 979ae94af08f36a12a985fa1c9191bcb1571eb80 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 17 Jan 2024 14:48:08 +0000 Subject: [PATCH 08/32] resolved linting errors --- radarpipeline/project/project.py | 2 +- tests/tests_datalib/test_radar_user_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/radarpipeline/project/project.py b/radarpipeline/project/project.py index 45f6f8f..2f160ac 100644 --- a/radarpipeline/project/project.py +++ b/radarpipeline/project/project.py @@ -228,7 +228,7 @@ def fetch_data(self) -> None: """ if self.config["input"]["data_type"] == "local": - datareader= Reader( + datareader = Reader( self.spark_session, self.config, self.total_required_data, diff --git a/tests/tests_datalib/test_radar_user_data.py b/tests/tests_datalib/test_radar_user_data.py index 85c74cc..7978cdf 100644 --- a/tests/tests_datalib/test_radar_user_data.py +++ b/tests/tests_datalib/test_radar_user_data.py @@ -9,7 +9,7 @@ class TestRadarUserData(PySparkTestCase): def setUp(self): MOCK_PATH = ("tests/resources/test_data/test_participant/" - "android_phone_step_count/0000_11.csv.gz") + "android_phone_step_count/0000_11.csv.gz") self.mock_df = self.spark.read.csv(MOCK_PATH, header=True, inferSchema=True) From c0ebb67b8919832e1b57b9767df1f1584af131c6 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 17 Jan 2024 14:56:44 +0000 Subject: [PATCH 09/32] minor refactoring of the data preprocessing function --- radarpipeline/common/utils.py | 17 +++++++++++------ radarpipeline/datalib/radar_variable_data.py | 9 ++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index 6335eaf..7a8bb69 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -273,6 +273,16 @@ def get_hash(array : List) -> int: return hash(tuple(array)) +def preprocess_time_data(data): + time_cols = ["value.time", "value.timeReceived", "value.dateTime"] + for i, col in enumerate(time_cols): + if col in data.columns: + data = data.withColumn(col, data[f"`{col}`"].cast(TimestampType())) + data.withColumn(col, f.from_unixtime( + f.unix_timestamp(f"`{col}`"))) + return data + + class PySparkTestCase(unittest.TestCase): @classmethod def setUpClass(cls): @@ -284,10 +294,5 @@ def tearDownClass(cls): cls.spark_engine.close_spark_session() def preprocess_data(self, data): - time_cols = ["value.time", "value.timeReceived", "value.dateTime"] - for i, col in enumerate(time_cols): - if col in data.columns: - data = data.withColumn(col, data[f"`{col}`"].cast(TimestampType())) - data.withColumn(col, f.from_unixtime( - f.unix_timestamp(f"`{col}`"))) + preprocess_time_data(data) return data diff --git a/radarpipeline/datalib/radar_variable_data.py b/radarpipeline/datalib/radar_variable_data.py index c32cfcf..1bf9fa6 100644 --- a/radarpipeline/datalib/radar_variable_data.py +++ b/radarpipeline/datalib/radar_variable_data.py @@ -5,6 +5,7 @@ from pyspark.sql.types import TimestampType from radarpipeline.datalib.abc import Data +from radarpipeline.common.utils import preprocess_time_data from radarpipeline.datatypes import DataType logger = 
logging.getLogger(__name__) @@ -39,12 +40,6 @@ def _preprocess_data(self) -> None: Converts all time value columns to datetime format """ try: - time_cols = ["value.time", "value.timeReceived", "value.dateTime"] - for i, col in enumerate(time_cols): - if col in self._data.columns: - self._data = self._data.withColumn(col, self._data[f"`{col}`"] - .cast(TimestampType())) - self._data.withColumn(col, f.from_unixtime( - f.unix_timestamp(f"`{col}`"))) + self._data = preprocess_time_data(self._data) except ValueError: logger.warning("Unable to convert time columns to datetime format") From 85d4a1c592f7cba9be5af215dcec7ebf69a1cb3c Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 17 Jan 2024 15:02:52 +0000 Subject: [PATCH 10/32] minor error correction --- radarpipeline/common/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index 7a8bb69..14688c1 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -294,5 +294,4 @@ def tearDownClass(cls): cls.spark_engine.close_spark_session() def preprocess_data(self, data): - preprocess_time_data(data) - return data + return preprocess_time_data(data) From e443cfc27b09047f2b3e78cfb1dc330081c3223b Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 17 Jan 2024 15:04:10 +0000 Subject: [PATCH 11/32] updated setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b1de97b..1c5d7cb 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def read_file(filename): "pandas==1.4.1", "numpy==1.22.3", "scipy==1.10.0", - "pyspark[sql]==3.3.0", + "pyspark[sql]==3.5.0", "GitPython>=3.1.32", "strictyaml==1.7.3", "paramiko==3.1.0", From 38e8612449af72754bef02a80a89f80aa4aaa89d Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 17 Jan 2024 15:11:04 +0000 Subject: [PATCH 12/32] updated paramiko dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1c5d7cb..da2d270 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ def read_file(filename): "pyspark[sql]==3.5.0", "GitPython>=3.1.32", "strictyaml==1.7.3", - "paramiko==3.1.0", + "paramiko==3.4.0", "avro==1.11.2"], test_suite="tests", include_package_data=True, From d10bbfbaf151e944145f4d4342dddf5f0b26c1c1 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 17 Jan 2024 15:14:55 +0000 Subject: [PATCH 13/32] updated modules version due to security vulnerabilities. 
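For reference, a minimal sketch of how the preprocess_time_data helper factored out in PATCH 09/10 above is meant to be used on its own. This is illustrative only: it assumes a local Spark session and an importable radarpipeline package, and the column set and sample values are made up.

from pyspark.sql import SparkSession
from radarpipeline.common.utils import preprocess_time_data

# Assumption: plain local pyspark here; the pipeline itself obtains its
# session through SparkEngine instead.
spark = (SparkSession.builder
         .master("local[1]")
         .appName("time-cols-example")
         .getOrCreate())

# Hypothetical frame mimicking a RADAR export: epoch seconds in "value.time".
df = spark.createDataFrame([(1571011200.0, 7)], ["value.time", "value.steps"])

processed = preprocess_time_data(df)

# The time column is now a TimestampType rather than a raw double. The caller
# has to keep the returned frame -- withColumn never mutates in place, which
# is exactly what the one-line PATCH 10 fix to preprocess_data addresses.
print(dict(processed.dtypes)["value.time"])  # "timestamp"

spark.stop()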
--- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6fd2442..de477d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ matplotlib==3.5.1 seaborn==0.11.2 scipy==1.10.0 pyspark[sql]==3.5.0 -GitPython>=3.1.32 +GitPython>=3.1.41 strictyaml==1.7.3 pyspark-test==0.2.0 paramiko==3.4.0 diff --git a/setup.py b/setup.py index da2d270..3214ed8 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def read_file(filename): "numpy==1.22.3", "scipy==1.10.0", "pyspark[sql]==3.5.0", - "GitPython>=3.1.32", + "GitPython>=3.1.41", "strictyaml==1.7.3", "paramiko==3.4.0", "avro==1.11.2"], From d7620393df2bdefc2040326ced2d7f660c3ecb64 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 17 Jan 2024 16:01:09 +0000 Subject: [PATCH 14/32] updated pandas and numpy versions --- radarpipeline/io/writer.py | 1 - requirements.txt | 4 ++-- setup.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/radarpipeline/io/writer.py b/radarpipeline/io/writer.py index 237da8f..7d1a103 100644 --- a/radarpipeline/io/writer.py +++ b/radarpipeline/io/writer.py @@ -102,7 +102,6 @@ def write_data(self) -> None: index=False, sep=constants.CSV_DELIMITER, encoding=constants.ENCODING, - line_terminator=constants.LINESEP, compression=self.compression, ) elif self.data_format == "pickle": diff --git a/requirements.txt b/requirements.txt index de477d1..0f3a101 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ twine==3.8.0 pyYaml==6.0 -numpy==1.22.3 -pandas==1.4.1 +numpy==1.24.4 +pandas==2.0.3 matplotlib==3.5.1 seaborn==0.11.2 scipy==1.10.0 diff --git a/setup.py b/setup.py index 3214ed8..aa72458 100644 --- a/setup.py +++ b/setup.py @@ -24,8 +24,8 @@ def read_file(filename): packages=find_packages(), install_requires=[ "pyYaml==6.0", - "pandas==1.4.1", - "numpy==1.22.3", + "pandas==2.0.3", + "numpy==1.24.4", "scipy==1.10.0", "pyspark[sql]==3.5.0", "GitPython>=3.1.41", From 9cedc21c96debe86eb11e5bee2a9e8cc2143a6d9 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Mon, 22 Jan 2024 17:56:49 +0530 Subject: [PATCH 15/32] updated test expected output files --- .../phone_battery_charging_duration.csv | 44 +++++++++---------- .../expected_output/step_count_per_day.csv | 44 ++++++++++--------- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/tests/resources/expected_output/phone_battery_charging_duration.csv b/tests/resources/expected_output/phone_battery_charging_duration.csv index 566158e..4d84b9c 100644 --- a/tests/resources/expected_output/phone_battery_charging_duration.csv +++ b/tests/resources/expected_output/phone_battery_charging_duration.csv @@ -1,37 +1,37 @@ key.userId,date,value.status,value.statusTime,value.statusTimeInSeconds -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,CHARGING,0 days 02:21:00.210000,141.0035 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,CHARGING,0 days 02:21:00.210000,141.00349999999997 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-01,CHARGING,2 days 02:45:02.356000,3045.0392666666667 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-03,CHARGING,0 days 20:42:59.004000,1242.9834 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-04,CHARGING,3 days 00:26:06.661000,4346.111016666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-07,CHARGING,0 days 01:11:28.030000,71.46716666666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-08,CHARGING,0 days 04:17:37.694000,257.62823333333336 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-09,CHARGING,0 days 
06:29:13.860000,389.231 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-10,CHARGING,0 days 01:10:49.706000,70.82843333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-23,CHARGING,0 days 00:02:31.164000,2.5194 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,CHARGING,0 days 01:05:19.041000,65.31735 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,CHARGING,0 days 01:36:19.865000,96.33108333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,CHARGING,0 days 02:03:21.999000,123.36665 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-07,CHARGING,0 days 01:11:28.030000,71.46716666666666 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-08,CHARGING,0 days 04:07:36.656000,247.61093333333335 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-09,CHARGING,0 days 06:29:08.928000,389.1488 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-10,CHARGING,0 days 01:20:55.676000,80.92793333333334 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-23,CHARGING,0 days 00:02:31.164000,2.5193999999999996 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,CHARGING,0 days 00:07:42.688000,7.7114666666666665 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,CHARGING,0 days 02:33:56.218000,153.93696666666668 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,CHARGING,0 days 02:03:21.999000,123.36664999999999 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-13,CHARGING,0 days 00:40:04.376000,40.07293333333334 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-14,CHARGING,0 days 01:34:26.462000,94.44103333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,CHARGING,0 days 00:14:06.765000,14.112750000000002 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,CHARGING,0 days 00:14:06.765000,14.11275 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-29,CHARGING,0 days 03:15:18.118000,195.3019666666667 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-13,CHARGING,0 days 00:05:05.432000,5.090533333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-27,CHARGING,0 days 01:10:28.253000,70.47088333333335 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-27,CHARGING,0 days 01:10:28.253000,70.47088333333333 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-30,CHARGING,0 days 00:02:42.644000,2.710733333333333 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-02-04,CHARGING,0 days 01:42:59.500000,102.99166666666666 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-04,CHARGING,0 days 02:12:53.667000,132.89445 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-05,CHARGING,146 days 22:23:05.389000,211583.08981666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,CHARGING,0 days 00:42:19.325000,42.32208333333334 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,CHARGING,0 days 02:12:54.634000,132.91056666666668 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,CHARGING,0 days 01:11:10.625000,71.17708333333333 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,CHARGING,0 days 02:10:00.172000,130.00286666666668 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,CHARGING,0 days 03:38:00.755000,218.01258333333334 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,CHARGING,0 days 03:11:40.502000,191.67503333333335 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,CHARGING,0 days 01:45:50.285000,105.83808333333334 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,CHARGING,0 days 02:02:29.530000,122.49216666666668 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,CHARGING,0 days 01:16:54.998000,76.91663333333334 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-05,CHARGING,146 days 21:23:05.389000,211523.08981666667 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,CHARGING,0 days 00:42:19.325000,42.32208333333333 
+2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,CHARGING,0 days 01:09:38.651000,69.64418333333333 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,CHARGING,0 days 01:33:16.018000,93.26696666666666 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,CHARGING,0 days 02:21:10.660000,141.17766666666665 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,CHARGING,0 days 02:10:00.054000,130.0009 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,CHARGING,0 days 02:39:41.594000,159.69323333333332 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,CHARGING,0 days 02:29:59.711000,149.99518333333333 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,CHARGING,0 days 02:37:24.333000,157.40555 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,CHARGING,0 days 02:27:50.480000,147.84133333333332 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,CHARGING,0 days 02:16:37.223000,136.62038333333334 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-08,CHARGING,0 days 20:06:12.864000,1206.2144 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,CHARGING,0 days 00:09:59.984000,9.999733333333333 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,CHARGING,41 days 18:10:47.426000,60130.79043333334 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,CHARGING,41 days 18:10:47.426000,60130.790433333335 5c0e2ec7-6f85-4041-9669-7145075d1754,2019-01-14,CHARGING,0 days 00:00:00,0.0 diff --git a/tests/resources/expected_output/step_count_per_day.csv b/tests/resources/expected_output/step_count_per_day.csv index 04eadf5..f7fd9f2 100644 --- a/tests/resources/expected_output/step_count_per_day.csv +++ b/tests/resources/expected_output/step_count_per_day.csv @@ -1,7 +1,8 @@ key.userId,date,value.steps -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-25,5705 -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,14474 -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-27,4486 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-25,3043 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,7376 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-27,14200 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-28,46 07a69f47-1923-4cfc-b89b-0eefad483f43,2018-09-14,9 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-09-27,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-01,2 @@ -14,18 +15,19 @@ key.userId,date,value.steps 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-24,8 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-28,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-31,35 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,2579 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,23 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,2572 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,30 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-11,3260 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,111 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-13,2714 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-14,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,1 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-26,24 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-26,15 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-27,9 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-29,14 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-04,1480 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-11,2923 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-14,1 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-15,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-17,2648 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-13,1512 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-14,4116 @@ -40,21 +42,23 @@ key.userId,date,value.steps 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-19,2 
07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,2 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-21,2 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,1586 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,2014 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,2301 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,4062 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,5026 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,6409 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,4593 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,8031 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,23 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,3393 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,2202 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,2911 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,4798 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,7884 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,4358 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,4944 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-12-01,3509 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-25,1747 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-26,2209 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-29,1554 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-30,185 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,1737 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-01,1880 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-29,1548 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-30,17 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,1195 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-01,2596 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-08,34 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,3353 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,1599 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-10,1754 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,1 5c0e2ec7-6f85-4041-9669-7145075d1754,2019-01-14,2 From f2d8faa20b3bdc2ac5e7bebedddd76f65d36c6fb Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Mon, 22 Jan 2024 19:39:38 +0530 Subject: [PATCH 16/32] updated tests --- .../phone_battery_charging_duration.csv | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/resources/expected_output/phone_battery_charging_duration.csv b/tests/resources/expected_output/phone_battery_charging_duration.csv index 4d84b9c..b3cd3e8 100644 --- a/tests/resources/expected_output/phone_battery_charging_duration.csv +++ b/tests/resources/expected_output/phone_battery_charging_duration.csv @@ -1,37 +1,37 @@ key.userId,date,value.status,value.statusTime,value.statusTimeInSeconds -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,CHARGING,0 days 02:21:00.210000,141.00349999999997 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,CHARGING,0 days 02:21:00.210000,141.0035 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-01,CHARGING,2 days 02:45:02.356000,3045.0392666666667 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-03,CHARGING,0 days 20:42:59.004000,1242.9834 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-04,CHARGING,3 days 00:26:06.661000,4346.111016666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-07,CHARGING,0 days 01:11:28.030000,71.46716666666666 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-07,CHARGING,0 days 01:11:28.030000,71.46716666666667 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-08,CHARGING,0 days 04:07:36.656000,247.61093333333335 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-09,CHARGING,0 days 06:29:08.928000,389.1488 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-10,CHARGING,0 days 01:20:55.676000,80.92793333333334 
-07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-23,CHARGING,0 days 00:02:31.164000,2.5193999999999996 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,CHARGING,0 days 00:07:42.688000,7.7114666666666665 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-23,CHARGING,0 days 00:02:31.164000,2.5194 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,CHARGING,0 days 00:07:42.688000,7.711466666666667 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,CHARGING,0 days 02:33:56.218000,153.93696666666668 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,CHARGING,0 days 02:03:21.999000,123.36664999999999 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,CHARGING,0 days 02:03:21.999000,123.36665 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-13,CHARGING,0 days 00:40:04.376000,40.07293333333334 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-14,CHARGING,0 days 01:34:26.462000,94.44103333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,CHARGING,0 days 00:14:06.765000,14.11275 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,CHARGING,0 days 00:14:06.765000,14.112750000000002 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-29,CHARGING,0 days 03:15:18.118000,195.3019666666667 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-13,CHARGING,0 days 00:05:05.432000,5.090533333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-27,CHARGING,0 days 01:10:28.253000,70.47088333333333 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-27,CHARGING,0 days 01:10:28.253000,70.47088333333335 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-30,CHARGING,0 days 00:02:42.644000,2.710733333333333 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-02-04,CHARGING,0 days 01:42:59.500000,102.99166666666666 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-04,CHARGING,0 days 02:12:53.667000,132.89445 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-05,CHARGING,146 days 21:23:05.389000,211523.08981666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,CHARGING,0 days 00:42:19.325000,42.32208333333333 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,CHARGING,0 days 00:42:19.325000,42.32208333333334 2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,CHARGING,0 days 01:09:38.651000,69.64418333333333 2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,CHARGING,0 days 01:33:16.018000,93.26696666666666 2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,CHARGING,0 days 02:21:10.660000,141.17766666666665 2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,CHARGING,0 days 02:10:00.054000,130.0009 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,CHARGING,0 days 02:39:41.594000,159.69323333333332 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,CHARGING,0 days 02:29:59.711000,149.99518333333333 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,CHARGING,0 days 02:39:41.594000,159.69323333333335 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,CHARGING,0 days 02:29:59.711000,149.99518333333336 2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,CHARGING,0 days 02:37:24.333000,157.40555 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,CHARGING,0 days 02:27:50.480000,147.84133333333332 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,CHARGING,0 days 02:27:50.480000,147.84133333333335 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,CHARGING,0 days 02:16:37.223000,136.62038333333334 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-08,CHARGING,0 days 20:06:12.864000,1206.2144 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,CHARGING,0 days 00:09:59.984000,9.999733333333333 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,CHARGING,41 days 18:10:47.426000,60130.790433333335 
+5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,CHARGING,41 days 18:10:47.426000,60130.79043333334 5c0e2ec7-6f85-4041-9669-7145075d1754,2019-01-14,CHARGING,0 days 00:00:00,0.0 From a50a70e838e47dcac1bda72b0252a3fb733eacbf Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 24 Jan 2024 17:38:06 +0530 Subject: [PATCH 17/32] solved test issue caused due of timezone setting in spark --- config.yaml | 2 +- mockdata | 2 +- radarpipeline/project/sparkengine.py | 1 + .../phone_battery_charging_duration.csv | 42 +++++++++--------- .../expected_output/step_count_per_day.csv | 44 +++++++++---------- 5 files changed, 44 insertions(+), 47 deletions(-) diff --git a/config.yaml b/config.yaml index e7b9097..42faed2 100644 --- a/config.yaml +++ b/config.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: local # couldbe mock, local, sftp, s3 + data_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/mockdata b/mockdata index c791236..17e9ff9 160000 --- a/mockdata +++ b/mockdata @@ -1 +1 @@ -Subproject commit c7912366b946edc8a7e51bbde6b41ae4a248c995 +Subproject commit 17e9ff951ea125dea0c9e65a5695d707749fdc84 diff --git a/radarpipeline/project/sparkengine.py b/radarpipeline/project/sparkengine.py index c3635bf..e2f83cc 100644 --- a/radarpipeline/project/sparkengine.py +++ b/radarpipeline/project/sparkengine.py @@ -88,6 +88,7 @@ def initialize_spark_session(self) -> ps.SparkSession: # Fallback to use non-Arrow conversion in case of errors self.spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") + self.spark.conf.set("spark.sql.session.timeZone", "UTC") # For further reading: # https://spark.apache.org/docs/3.0.1/sql-pyspark-pandas-with-arrow.html logger.info("Spark Session created") diff --git a/tests/resources/expected_output/phone_battery_charging_duration.csv b/tests/resources/expected_output/phone_battery_charging_duration.csv index b3cd3e8..9d59082 100644 --- a/tests/resources/expected_output/phone_battery_charging_duration.csv +++ b/tests/resources/expected_output/phone_battery_charging_duration.csv @@ -1,37 +1,37 @@ key.userId,date,value.status,value.statusTime,value.statusTimeInSeconds -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,CHARGING,0 days 02:21:00.210000,141.0035 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,CHARGING,0 days 02:21:00.210000,141.00349999999997 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-01,CHARGING,2 days 02:45:02.356000,3045.0392666666667 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-03,CHARGING,0 days 20:42:59.004000,1242.9834 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-04,CHARGING,3 days 00:26:06.661000,4346.111016666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-07,CHARGING,0 days 01:11:28.030000,71.46716666666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-08,CHARGING,0 days 04:07:36.656000,247.61093333333335 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-09,CHARGING,0 days 06:29:08.928000,389.1488 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-10,CHARGING,0 days 01:20:55.676000,80.92793333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-23,CHARGING,0 days 00:02:31.164000,2.5194 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,CHARGING,0 days 00:07:42.688000,7.711466666666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,CHARGING,0 days 02:33:56.218000,153.93696666666668 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,CHARGING,0 days 02:03:21.999000,123.36665 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-07,CHARGING,0 days 
01:11:28.030000,71.46716666666666 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-08,CHARGING,0 days 04:17:37.694000,257.6282333333333 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-09,CHARGING,0 days 06:29:13.860000,389.231 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-10,CHARGING,0 days 01:10:49.706000,70.82843333333334 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-23,CHARGING,0 days 00:02:31.164000,2.5193999999999996 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,CHARGING,0 days 01:05:19.041000,65.31735 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,CHARGING,0 days 01:36:19.865000,96.33108333333332 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,CHARGING,0 days 02:03:21.999000,123.36664999999999 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-13,CHARGING,0 days 00:40:04.376000,40.07293333333334 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-14,CHARGING,0 days 01:34:26.462000,94.44103333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,CHARGING,0 days 00:14:06.765000,14.112750000000002 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,CHARGING,0 days 00:14:06.765000,14.11275 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-29,CHARGING,0 days 03:15:18.118000,195.3019666666667 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-13,CHARGING,0 days 00:05:05.432000,5.090533333333334 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-27,CHARGING,0 days 01:10:28.253000,70.47088333333335 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-27,CHARGING,0 days 01:10:28.253000,70.47088333333333 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-30,CHARGING,0 days 00:02:42.644000,2.710733333333333 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-02-04,CHARGING,0 days 01:42:59.500000,102.99166666666666 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-04,CHARGING,0 days 02:12:53.667000,132.89445 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-03-05,CHARGING,146 days 21:23:05.389000,211523.08981666667 -07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,CHARGING,0 days 00:42:19.325000,42.32208333333334 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,CHARGING,0 days 01:09:38.651000,69.64418333333333 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,CHARGING,0 days 01:33:16.018000,93.26696666666666 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,CHARGING,0 days 02:21:10.660000,141.17766666666665 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,CHARGING,0 days 02:10:00.054000,130.0009 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,CHARGING,0 days 02:39:41.594000,159.69323333333335 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,CHARGING,0 days 02:29:59.711000,149.99518333333336 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,CHARGING,0 days 02:37:24.333000,157.40555 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,CHARGING,0 days 02:27:50.480000,147.84133333333335 +07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,CHARGING,0 days 00:42:19.325000,42.32208333333333 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,CHARGING,0 days 02:12:54.634000,132.91056666666665 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,CHARGING,0 days 01:11:10.625000,71.17708333333333 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,CHARGING,0 days 02:10:00.172000,130.00286666666665 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,CHARGING,0 days 03:38:00.755000,218.0125833333333 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,CHARGING,0 days 03:11:40.502000,191.67503333333335 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,CHARGING,0 days 01:45:50.285000,105.83808333333333 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,CHARGING,0 days 02:02:29.530000,122.49216666666666 
+2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,CHARGING,0 days 01:16:54.998000,76.91663333333332 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,CHARGING,0 days 02:16:37.223000,136.62038333333334 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-08,CHARGING,0 days 20:06:12.864000,1206.2144 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,CHARGING,0 days 00:09:59.984000,9.999733333333333 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,CHARGING,41 days 18:10:47.426000,60130.79043333334 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,CHARGING,41 days 18:10:47.426000,60130.790433333335 5c0e2ec7-6f85-4041-9669-7145075d1754,2019-01-14,CHARGING,0 days 00:00:00,0.0 diff --git a/tests/resources/expected_output/step_count_per_day.csv b/tests/resources/expected_output/step_count_per_day.csv index f7fd9f2..0d22073 100644 --- a/tests/resources/expected_output/step_count_per_day.csv +++ b/tests/resources/expected_output/step_count_per_day.csv @@ -1,8 +1,7 @@ key.userId,date,value.steps -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-25,3043 -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,7376 -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-27,14200 -072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-28,46 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-25,5705 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-26,15412 +072ddb22-82ef-4b81-8460-41ab096b54bb,2019-10-27,3548 07a69f47-1923-4cfc-b89b-0eefad483f43,2018-09-14,9 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-09-27,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-01,2 @@ -15,19 +14,18 @@ key.userId,date,value.steps 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-24,8 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-28,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-10-31,35 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,2572 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,30 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-07,2579 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-08,23 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-11,3260 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-12,111 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-13,2714 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-14,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-19,1 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-26,15 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-27,9 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-26,24 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-11-29,14 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-04,1480 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-11,2923 -07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-15,1 +07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-14,1 07a69f47-1923-4cfc-b89b-0eefad483f43,2019-12-17,2648 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-13,1512 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-01-14,4116 @@ -42,23 +40,21 @@ key.userId,date,value.steps 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-19,2 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-20,2 07a69f47-1923-4cfc-b89b-0eefad483f43,2020-10-21,2 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,23 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,3393 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,2202 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,2911 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,4798 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,7884 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,4358 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,4944 -2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-12-01,3509 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-23,1586 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-24,2014 
+2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-25,2301 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-26,4062 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-27,5026 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-28,6409 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-29,4593 +2a02e53a-951e-4fd0-b47f-195a87096bd0,2018-11-30,8031 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-25,1747 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-26,2209 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-29,1548 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-30,17 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,1195 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-01,2596 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-29,1554 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-30,185 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-10-31,1737 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-01,1880 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-08,34 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,1599 -5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-10,1754 +5c0e2ec7-6f85-4041-9669-7145075d1754,2018-11-09,3353 5c0e2ec7-6f85-4041-9669-7145075d1754,2018-12-03,1 5c0e2ec7-6f85-4041-9669-7145075d1754,2019-01-14,2 From 111de11b7afa2c04e6e1b70eeaac8c122a2b6fa2 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Thu, 25 Jan 2024 16:49:55 +0530 Subject: [PATCH 18/32] resolved error caused when data for an user is empty --- radarpipeline/io/reader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index 0fc45c4..b185192 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -233,9 +233,14 @@ def _read_variable_data_files( dfs.append(df) # Spark Join all the dfs - df = reduce(self.unionByName, dfs) - variable_data = RadarVariableData(df, self.df_type) - + # check if dfs are empty + if len(dfs) == 0: + # creating empty spark df + df = self.spark.createDataFrame([], schema=schema.get_schema()) + variable_data = RadarVariableData(df, self.df_type) + else: + df = reduce(self.unionByName, dfs) + variable_data = RadarVariableData(df, self.df_type) return variable_data def _read_data_from_old_format(self, source_path: str, user_data_dict: dict): From 2d884cd08490f738435de06283e8b002ffa51a13 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Thu, 8 Feb 2024 16:12:57 +0530 Subject: [PATCH 19/32] Added custom data reading module that can be used independently --- radarpipeline/io/__init__.py | 1 + radarpipeline/io/ingestion.py | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 radarpipeline/io/ingestion.py diff --git a/radarpipeline/io/__init__.py b/radarpipeline/io/__init__.py index 58d8b7d..51d167b 100644 --- a/radarpipeline/io/__init__.py +++ b/radarpipeline/io/__init__.py @@ -2,3 +2,4 @@ from radarpipeline.io.reader import AvroSchemaReader, Reader, SparkCSVDataReader from radarpipeline.io.downloader import SftpDataReader from radarpipeline.io.writer import * +from radarpipeline.io.ingestion import CustomDataReader diff --git a/radarpipeline/io/ingestion.py b/radarpipeline/io/ingestion.py new file mode 100644 index 0000000..a87edfc --- /dev/null +++ b/radarpipeline/io/ingestion.py @@ -0,0 +1,36 @@ +import logging + +from radarpipeline.io.reader import Reader +from radarpipeline.project.sparkengine import SparkEngine + +from typing import Dict + +logger = logging.getLogger(__name__) + + +class CustomDataReader(): + def __init__(self, input_config, variables, data_type="local", data_format="csv", + df_type="pandas") 
-> None: + self.variables = variables + self.data_format = data_format + self.data_type = data_type + self.config = self.modify_config(input_config, data_format) + self.sparkengine = SparkEngine() + self.spark = self.sparkengine.initialize_spark_session() + self.data_reader = Reader(self.spark, self.config, variables, df_type) + + def modify_config(self, input_config, data_format) -> Dict: + """ + Modify the input configuration to include the variables of interest + """ + config = {'input': {}} + config['input'] = input_config + config['input']['data_format'] = data_format + config['input']['data_type'] = self.data_type + return config + + def read_data(self): + return self.data_reader.read_data() + + def close_session(self): + self.sparkengine.close_spark_session() From 68eabe3d788fff01b4151001670e37a0d3dfb2bc Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Thu, 8 Feb 2024 17:16:42 +0530 Subject: [PATCH 20/32] added more possible time columns to preprocess_time_data --- radarpipeline/common/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index 14688c1..11da55b 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -274,7 +274,8 @@ def get_hash(array : List) -> int: def preprocess_time_data(data): - time_cols = ["value.time", "value.timeReceived", "value.dateTime"] + time_cols = ["value.time", "value.timeReceived", "value.dateTime", + "value.timeCompleted", "value.timeNotification"] for i, col in enumerate(time_cols): if col in data.columns: data = data.withColumn(col, data[f"`{col}`"].cast(TimestampType())) From 857ea89cf6c1c3d4fe2b83da7eed6fc81ed621a3 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Fri, 9 Feb 2024 15:46:33 +0530 Subject: [PATCH 21/32] Added user sampling mechanism in radarpipeline --- config.yaml | 16 ++++++ radarpipeline/common/utils.py | 6 ++- radarpipeline/io/__init__.py | 1 + radarpipeline/io/abc.py | 9 ++++ radarpipeline/io/reader.py | 29 ++++++++--- radarpipeline/io/sampler.py | 35 +++++++++++++ radarpipeline/project/project.py | 1 + radarpipeline/project/validations.py | 73 ++++++++++++++++++++++++++++ 8 files changed, 162 insertions(+), 8 deletions(-) create mode 100644 radarpipeline/io/sampler.py diff --git a/config.yaml b/config.yaml index 42faed2..1d3e9d0 100644 --- a/config.yaml +++ b/config.yaml @@ -25,6 +25,22 @@ input: configurations: df_type: 'pandas' + #user_sampling: + ## Possible methods: fraction, count, userid + #method: fraction + #config: + # fraction: 0.3 + #method: count + #config: + # count: 2 + #method: userid + #config: + # userids: + # - 2a02e53a-951e-4fd0-b47f-195a87096bd0 + ## TODO: For future + #data_sampling: + ## Possible methods: time, count, fraction + # method: range_time features: - location: 'https://github.com/RADAR-base-Analytics/mockfeatures' diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index 11da55b..ff7ec4f 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -154,7 +154,11 @@ def get_yaml_schema() -> Map: "data_format": Str() }), "configurations": Map({ - "df_type": Str() + "df_type": Str(), + Optional("user_sampling"): Map({ + "method": Str(), + "config": MapPattern(Str(), Seq(Str()) or Str()) + }) }), "features": Seq(Map({ "location": Str(), diff --git a/radarpipeline/io/__init__.py b/radarpipeline/io/__init__.py index 51d167b..6a6de95 100644 --- a/radarpipeline/io/__init__.py +++ b/radarpipeline/io/__init__.py @@ -3,3 +3,4 @@ from 
radarpipeline.io.downloader import SftpDataReader from radarpipeline.io.writer import * from radarpipeline.io.ingestion import CustomDataReader +from radarpipeline.io.sampler import UserSampler diff --git a/radarpipeline/io/abc.py b/radarpipeline/io/abc.py index 28d176f..711e201 100644 --- a/radarpipeline/io/abc.py +++ b/radarpipeline/io/abc.py @@ -55,3 +55,12 @@ def __init__(self, features: Dict[str, DataType], output_dir: str) -> None: @abstractmethod def write_data(self) -> None: pass + + +class Sampler(ABC): + """ + Abstract class for sampling the RADAR data + """ + + def __init__(self, config) -> None: + self.config = config diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index b185192..eb40229 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -14,6 +14,7 @@ from radarpipeline.common import constants from radarpipeline.datalib import RadarData, RadarUserData, RadarVariableData from radarpipeline.io.abc import DataReader, SchemaReader +from radarpipeline.io.sampler import UserSampler from radarpipeline.common.utils import get_hash import avro @@ -81,9 +82,14 @@ def __init__(self, spark_session: ps.SparkSession, self.data_type = self.config["input"]["data_format"] self.required_data = required_data self.df_type = df_type + if self.config["configurations"]['user_sampling'] is None: + self.data_sampler = None + else: + self.data_sampler = UserSampler(self.config["configurations"]['user_sampling']) if self.data_type in ['csv', 'csv.gz']: self.reader_class = SparkCSVDataReader(spark_session, config, - required_data, df_type) + required_data, df_type, + self.data_sampler) else: raise NotImplementedError("Only csv data type is supported for now") @@ -104,7 +110,8 @@ class SparkCSVDataReader(DataReader): """ def __init__(self, spark_session: ps.SparkSession, - config: Dict, required_data: List[str], df_type: str = "pandas"): + config: Dict, required_data: List[str], df_type: str = "pandas", + sampler: UserSampler = None): super().__init__(config) self.source_formats = { # RADAR_OLD: uid/variable/yyyymmdd_hh00.csv.gz @@ -117,6 +124,7 @@ def __init__(self, spark_session: ps.SparkSession, self.required_data = required_data self.df_type = df_type self.source_path = self.config['input']['config'].get("source_path", "") + self.sampler = sampler self.schema_reader = AvroSchemaReader() self.spark = spark_session self.unionByName = partial(DataFrame.unionByName, allowMissingColumns=True) @@ -244,10 +252,11 @@ def _read_variable_data_files( return variable_data def _read_data_from_old_format(self, source_path: str, user_data_dict: dict): - for uid in os.listdir(source_path): - # Skip hidden files - if uid[0] == ".": - continue + uids = os.listdir(source_path) + uids = self._remove_hidden_dirs(uids) + if self.sampler is not None: + uids = self.sampler.sample_uids(uids) + for uid in uids: logger.info(f"Reading data for user: {uid}") variable_data_dict = {} for dirname in self.required_data: @@ -280,7 +289,11 @@ def _read_data_from_old_format(self, source_path: str, user_data_dict: dict): def _read_data_from_new_format(self, source_path: str, user_data_dict: dict): # RADAR_NEW: uid/variable/yyyymm/yyyymmdd.csv.gz - for uid in os.listdir(source_path): + uids = os.listdir(source_path) + uids = self._remove_hidden_dirs(uids) + if self.sampler is not None: + uids = self.sampler.sample_uids(uids) + for uid in uids: # Skip hidden files if uid[0] == ".": continue @@ -318,6 +331,8 @@ def _read_data_from_new_format(self, source_path: str, user_data_dict: dict): 
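For reference, a minimal sketch of how the UserSampler wired into the reader above resolves each of the three supported user_sampling methods (fraction, count, userid). The uid values below are made up for illustration and are not part of the patch.

from radarpipeline.io.sampler import UserSampler

uids = ["user-a", "user-b", "user-c", "user-d"]

# method: fraction -> keep a random ~50% of the users (2 of 4 here)
frac = UserSampler({"method": "fraction", "config": {"fraction": 0.5}})
print(frac.sample_uids(uids))

# method: count -> keep exactly two randomly chosen users
cnt = UserSampler({"method": "count", "config": {"count": 2}})
print(cnt.sample_uids(uids))

# method: userid -> keep only the listed users; ids not found in the data
# are dropped with a logged warning
byid = UserSampler({"method": "userid",
                    "config": {"userids": ["user-a", "user-x"]}})
print(byid.sample_uids(uids))  # ["user-a"], plus a warning for "user-x"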
radar_data = RadarData(user_data_dict, self.df_type) return radar_data, user_data_dict + def _remove_hidden_dirs(self, uids): + return [uid for uid in uids if uid[0] != "."] class AvroSchemaReader(SchemaReader): """ diff --git a/radarpipeline/io/sampler.py b/radarpipeline/io/sampler.py new file mode 100644 index 0000000..0db9faf --- /dev/null +++ b/radarpipeline/io/sampler.py @@ -0,0 +1,35 @@ +from radarpipeline.io.abc import Sampler +from typing import Any, Dict, List, Optional, Union +from random import sample +import logging + + +logger = logging.getLogger(__name__) + + +class UserSampler(Sampler): + + def __init__(self, config: Dict) -> None: + super().__init__(config) + + def sample_uids(self, uid_list) -> None: + if self.config['method'] == "fraction": + fraction = self.config['config']['fraction'] + # sample fraction of the uids + return self._sample_list(uid_list, round(len(uid_list) * fraction)) + elif self.config['method'] == "count": + count = self.config['config']['count'] + # sample count of the uids + return self._sample_list(uid_list, count) + elif self.config['method'] == "userid": + sampled_uids = self.config['config']['userids'] + for sampled_uid in sampled_uids: + if sampled_uid not in uid_list: + logger.warning(f"User id {sampled_uid} not found in the data") + sampled_uids.remove(sampled_uid) + return sampled_uids + else: + raise ValueError("Invalid method") + + def _sample_list(self, uid_list, number): + return sample(uid_list, number) diff --git a/radarpipeline/project/project.py b/radarpipeline/project/project.py index 2f160ac..4dd7f26 100644 --- a/radarpipeline/project/project.py +++ b/radarpipeline/project/project.py @@ -248,6 +248,7 @@ def fetch_data(self) -> None: "config": { "source_path": mock_data_directory}, "data_format": "csv"}} + mock_config["configurations"] = self.config["configurations"] datareader = Reader( self.spark_session, mock_config, diff --git a/radarpipeline/project/validations.py b/radarpipeline/project/validations.py index 1fc25de..c13e72a 100644 --- a/radarpipeline/project/validations.py +++ b/radarpipeline/project/validations.py @@ -108,11 +108,84 @@ def _validate_configurations(self) -> None: self.config["configurations"] = {} valid_df_types = ["pandas", "spark"] + valid_user_sampling_methods = ["fraction", "count", "userid"] if "df_type" not in self.config["configurations"]: self.config["configurations"]["df_type"] = "pandas" elif self.config["configurations"]["df_type"] not in valid_df_types: raise ValueError("Invalid value for the key: df_type") + if "user_sampling" in self.config["configurations"]: + if "method" not in self.config["configurations"]["user_sampling"]: + raise ValueError("Key not present in the user_sampling config: method") + elif ( + self.config["configurations"]["user_sampling"]["method"] + not in valid_user_sampling_methods + ): + raise ValueError("Invalid value for the key: method") + else: + self.config["configurations"]["user_sampling"] = self._validate_user_sampling_config(self.config["configurations"]["user_sampling"]) + else: + self.config["configurations"]["user_sampling"] = None + + def _validate_user_sampling_config(self, user_sampling_config): + """ + Validates the user_sampling config + """ + + if user_sampling_config["method"] == "fraction": + if "config" not in user_sampling_config: + raise ValueError("Key not present in the user_sampling config: config") + if "fraction" not in user_sampling_config["config"]: + raise ValueError( + "Key not present in the user_sampling config: fraction" + ) + # converting 
fraction to float + try: + user_sampling_config["config"]["fraction"] = float( + user_sampling_config["config"]["fraction"] + ) + except ValueError: + raise ValueError( + "Invalid value for the key: fraction. It should be a number" + ) + if not 0 < user_sampling_config["config"]["fraction"] <= 1: + raise ValueError( + "Invalid value for the key: fraction. It should be between 0 and 1" + ) + + elif user_sampling_config["method"] == "count": + if "config" not in user_sampling_config: + raise ValueError("Key not present in the user_sampling config: config") + if "count" not in user_sampling_config["config"]: + raise ValueError("Key not present in the user_sampling config: count") + try: + user_sampling_config["config"]["count"] = int( + user_sampling_config["config"]["count"] + ) + except ValueError: + raise ValueError( + "Invalid value for the key: count. It should be a number" + ) + if user_sampling_config["config"]["count"] <= 0: + raise ValueError( + "Invalid value for the key: count. It should be greater than 0" + ) + + elif user_sampling_config["method"] == "userid": + if "config" not in user_sampling_config: + raise ValueError("Key not present in the user_sampling config: config") + if "userids" not in user_sampling_config["config"]: + raise ValueError("Key not present in the user_sampling config: userids") + if len(user_sampling_config["config"]["userids"]) == 0: + raise ValueError( + "user_ids array cannot be empty in the user_sampling config" + ) + # check if userids are in array. If not convert to array + if not isinstance(user_sampling_config["config"]["userids"], list): + user_sampling_config["config"]["userids"] = [ + user_sampling_config["config"]["userids"] + ] + return user_sampling_config def _validate_features(self) -> None: """ From ffd69ad04942d20e577faf7705b3e7ea412f56d6 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Fri, 9 Feb 2024 17:37:31 +0530 Subject: [PATCH 22/32] Added data sampling as well --- config.yaml | 19 +++- radarpipeline/common/utils.py | 8 +- radarpipeline/io/__init__.py | 2 +- radarpipeline/io/reader.py | 30 +++-- radarpipeline/io/sampler.py | 9 ++ radarpipeline/project/validations.py | 157 ++++++++++++++++++--------- 6 files changed, 156 insertions(+), 69 deletions(-) diff --git a/config.yaml b/config.yaml index 1d3e9d0..c9cc44a 100644 --- a/config.yaml +++ b/config.yaml @@ -25,11 +25,11 @@ input: configurations: df_type: 'pandas' - #user_sampling: + user_sampling: ## Possible methods: fraction, count, userid - #method: fraction - #config: - # fraction: 0.3 + method: fraction + config: + fraction: 0.8 #method: count #config: # count: 2 @@ -40,7 +40,16 @@ configurations: ## TODO: For future #data_sampling: ## Possible methods: time, count, fraction - # method: range_time + # method: time + # config: + # starttime: + # endtime: + # method: count + # config: + # count: 100 + # method: fraction + # config: + # fraction: 0.3 features: - location: 'https://github.com/RADAR-base-Analytics/mockfeatures' diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index ff7ec4f..200b8a9 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -157,8 +157,12 @@ def get_yaml_schema() -> Map: "df_type": Str(), Optional("user_sampling"): Map({ "method": Str(), - "config": MapPattern(Str(), Seq(Str()) or Str()) - }) + "config": MapPattern(Str(), Seq(Str()) | Str()), + }), + Optional("data_sampling"): Map({ + "method": Str(), + "config": MapPattern(Str(), Str()), + }), }), "features": Seq(Map({ "location": Str(), diff --git 
a/radarpipeline/io/__init__.py b/radarpipeline/io/__init__.py index 6a6de95..cafc0ae 100644 --- a/radarpipeline/io/__init__.py +++ b/radarpipeline/io/__init__.py @@ -3,4 +3,4 @@ from radarpipeline.io.downloader import SftpDataReader from radarpipeline.io.writer import * from radarpipeline.io.ingestion import CustomDataReader -from radarpipeline.io.sampler import UserSampler +from radarpipeline.io.sampler import UserSampler, DataSampler diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index eb40229..7107adf 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -14,7 +14,7 @@ from radarpipeline.common import constants from radarpipeline.datalib import RadarData, RadarUserData, RadarVariableData from radarpipeline.io.abc import DataReader, SchemaReader -from radarpipeline.io.sampler import UserSampler +from radarpipeline.io.sampler import UserSampler, DataSampler from radarpipeline.common.utils import get_hash import avro @@ -83,13 +83,20 @@ def __init__(self, spark_session: ps.SparkSession, self.required_data = required_data self.df_type = df_type if self.config["configurations"]['user_sampling'] is None: - self.data_sampler = None + self.user_sampler = None else: - self.data_sampler = UserSampler(self.config["configurations"]['user_sampling']) + self.user_sampler = UserSampler(self.config["configurations"] + ['user_sampling']) + if self.config["configurations"]['data_sampling'] is None: + self.data_sampling = None + else: + self.data_sampler = DataSampler(self.config["configurations"] + ['data_sampling']) + if self.data_type in ['csv', 'csv.gz']: self.reader_class = SparkCSVDataReader(spark_session, config, required_data, df_type, - self.data_sampler) + self.user_sampler, self.data_sampler) else: raise NotImplementedError("Only csv data type is supported for now") @@ -111,7 +118,7 @@ class SparkCSVDataReader(DataReader): def __init__(self, spark_session: ps.SparkSession, config: Dict, required_data: List[str], df_type: str = "pandas", - sampler: UserSampler = None): + user_sampler: UserSampler = None, data_sampler: DataSampler = None): super().__init__(config) self.source_formats = { # RADAR_OLD: uid/variable/yyyymmdd_hh00.csv.gz @@ -124,7 +131,8 @@ def __init__(self, spark_session: ps.SparkSession, self.required_data = required_data self.df_type = df_type self.source_path = self.config['input']['config'].get("source_path", "") - self.sampler = sampler + self.user_sampler = user_sampler + self.data_sampler = data_sampler self.schema_reader = AvroSchemaReader() self.spark = spark_session self.unionByName = partial(DataFrame.unionByName, allowMissingColumns=True) @@ -248,14 +256,15 @@ def _read_variable_data_files( variable_data = RadarVariableData(df, self.df_type) else: df = reduce(self.unionByName, dfs) + self.data_sampler.sample_data(df) variable_data = RadarVariableData(df, self.df_type) return variable_data def _read_data_from_old_format(self, source_path: str, user_data_dict: dict): uids = os.listdir(source_path) uids = self._remove_hidden_dirs(uids) - if self.sampler is not None: - uids = self.sampler.sample_uids(uids) + if self.user_sampler is not None: + uids = self.user_sampler.sample_uids(uids) for uid in uids: logger.info(f"Reading data for user: {uid}") variable_data_dict = {} @@ -291,8 +300,8 @@ def _read_data_from_new_format(self, source_path: str, user_data_dict: dict): # RADAR_NEW: uid/variable/yyyymm/yyyymmdd.csv.gz uids = os.listdir(source_path) uids = self._remove_hidden_dirs(uids) - if self.sampler is not None: - uids = 
self.sampler.sample_uids(uids) + if self.user_sampler is not None: + uids = self.user_sampler.sample_uids(uids) for uid in uids: # Skip hidden files if uid[0] == ".": @@ -334,6 +343,7 @@ def _read_data_from_new_format(self, source_path: str, user_data_dict: dict): def _remove_hidden_dirs(self, uids): return [uid for uid in uids if uid[0] != "."] + class AvroSchemaReader(SchemaReader): """ Reads schema from local directory diff --git a/radarpipeline/io/sampler.py b/radarpipeline/io/sampler.py index 0db9faf..884edc8 100644 --- a/radarpipeline/io/sampler.py +++ b/radarpipeline/io/sampler.py @@ -33,3 +33,12 @@ def sample_uids(self, uid_list) -> None: def _sample_list(self, uid_list, number): return sample(uid_list, number) + + +class DataSampler(Sampler): + def __init__(self, config: Dict) -> None: + super().__init__(config) + + def sample_data(self, df): + # TODO: Implement this + return df diff --git a/radarpipeline/project/validations.py b/radarpipeline/project/validations.py index c13e72a..c38c74a 100644 --- a/radarpipeline/project/validations.py +++ b/radarpipeline/project/validations.py @@ -123,70 +123,57 @@ def _validate_configurations(self) -> None: ): raise ValueError("Invalid value for the key: method") else: - self.config["configurations"]["user_sampling"] = self._validate_user_sampling_config(self.config["configurations"]["user_sampling"]) + self.config["configurations"][ + "user_sampling"] = self._validate_user_sampling_config( + self.config["configurations"]["user_sampling"]) else: self.config["configurations"]["user_sampling"] = None + # Validating data sampling + valid_data_sampling_methods = ["fraction", "count", "time"] + if "data_sampling" in self.config["configurations"]: + if "method" not in self.config["configurations"]["data_sampling"]: + raise ValueError("Key not present in the data_sampling config: method") + elif ( + self.config["configurations"]["data_sampling"]["method"] + not in valid_data_sampling_methods + ): + raise ValueError("Invalid value for the key: method") + else: + self.config["configurations"][ + "data_sampling"] = self._validate_data_sampling_config( + self.config["configurations"]["data_sampling"]) + else: + self.config["configurations"]["data_sampling"] = None + def _validate_user_sampling_config(self, user_sampling_config): """ Validates the user_sampling config """ if user_sampling_config["method"] == "fraction": - if "config" not in user_sampling_config: - raise ValueError("Key not present in the user_sampling config: config") - if "fraction" not in user_sampling_config["config"]: - raise ValueError( - "Key not present in the user_sampling config: fraction" - ) - # converting fraction to float - try: - user_sampling_config["config"]["fraction"] = float( - user_sampling_config["config"]["fraction"] - ) - except ValueError: - raise ValueError( - "Invalid value for the key: fraction. It should be a number" - ) - if not 0 < user_sampling_config["config"]["fraction"] <= 1: - raise ValueError( - "Invalid value for the key: fraction. 
It should be between 0 and 1" - ) - + user_sampling_config = self._validate_sampling_fraction( + user_sampling_config) elif user_sampling_config["method"] == "count": - if "config" not in user_sampling_config: - raise ValueError("Key not present in the user_sampling config: config") - if "count" not in user_sampling_config["config"]: - raise ValueError("Key not present in the user_sampling config: count") - try: - user_sampling_config["config"]["count"] = int( - user_sampling_config["config"]["count"] - ) - except ValueError: - raise ValueError( - "Invalid value for the key: count. It should be a number" - ) - if user_sampling_config["config"]["count"] <= 0: - raise ValueError( - "Invalid value for the key: count. It should be greater than 0" - ) - + user_sampling_config = self._validate_sampling_count( + user_sampling_config) elif user_sampling_config["method"] == "userid": - if "config" not in user_sampling_config: - raise ValueError("Key not present in the user_sampling config: config") - if "userids" not in user_sampling_config["config"]: - raise ValueError("Key not present in the user_sampling config: userids") - if len(user_sampling_config["config"]["userids"]) == 0: - raise ValueError( - "user_ids array cannot be empty in the user_sampling config" - ) - # check if userids are in array. If not convert to array - if not isinstance(user_sampling_config["config"]["userids"], list): - user_sampling_config["config"]["userids"] = [ - user_sampling_config["config"]["userids"] - ] + user_sampling_config = self._validate_sampling_userids( + user_sampling_config) return user_sampling_config + def _validate_data_sampling_config(self, data_sampling_config): + if data_sampling_config["method"] == "fraction": + data_sampling_config = self._validate_sampling_fraction( + data_sampling_config) + elif data_sampling_config["method"] == "count": + data_sampling_config = self._validate_sampling_count( + data_sampling_config) + elif data_sampling_config["method"] == "time": + data_sampling_config = self._validate_sampling_time( + data_sampling_config) + return data_sampling_config + def _validate_features(self) -> None: """ Validates the features config @@ -299,3 +286,71 @@ def _validate_output(self) -> None: self.config["output"]["compress"] = True else: raise ValueError("Key not present in the config: output_location") + + def _validate_sampling_fraction(self, sampling_config): + if "config" not in sampling_config: + raise ValueError("Key not present in the user_sampling config: config") + if "fraction" not in sampling_config["config"]: + raise ValueError( + "Key not present in the user_sampling config: fraction" + ) + # converting fraction to float + try: + sampling_config["config"]["fraction"] = float( + sampling_config["config"]["fraction"] + ) + except ValueError: + raise ValueError( + "Invalid value for the key: fraction. It should be a number" + ) + if not 0 < sampling_config["config"]["fraction"] <= 1: + raise ValueError( + "Invalid value for the key: fraction. It should be between 0 and 1" + ) + return sampling_config + + def _validate_sampling_count(self, sampling_config): + if "config" not in sampling_config: + raise ValueError("Key not present in the user_sampling config: config") + if "count" not in sampling_config["config"]: + raise ValueError("Key not present in the user_sampling config: count") + try: + sampling_config["config"]["count"] = int( + sampling_config["config"]["count"] + ) + except ValueError: + raise ValueError( + "Invalid value for the key: count. 
It should be a number" + ) + if sampling_config["config"]["count"] <= 0: + raise ValueError( + "Invalid value for the key: count. It should be greater than 0" + ) + return sampling_config + + def _validate_sampling_userids(self, sampling_config): + if "config" not in sampling_config: + raise ValueError("Key not present in the user_sampling config: config") + if "userids" not in sampling_config["config"]: + raise ValueError("Key not present in the user_sampling config: userids") + if len(sampling_config["config"]["userids"]) == 0: + raise ValueError( + "user_ids array cannot be empty in the user_sampling config" + ) + # check if userids are in array. If not convert to array + if not isinstance(sampling_config["config"]["userids"], list): + sampling_config["config"]["userids"] = [ + sampling_config["config"]["userids"] + ] + return sampling_config + + def _validate_sampling_time(self, sampling_config): + if "config" not in sampling_config: + raise ValueError("Key not present in the user_sampling config: config") + if ("starttime" not in sampling_config["config"] + ) and ( + "endtime" not in sampling_config["config"]): + raise ValueError("Neither startime nor endtime present in the config") + # check if starttime and endtime are can be converted into time format + # if so, convert them + return sampling_config From bf474872ea7d9631b3f7589d6a9a89c0e4035552 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Mon, 12 Feb 2024 15:03:52 +0530 Subject: [PATCH 23/32] minor code changes --- radarpipeline/io/reader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index 7107adf..153ec4a 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -88,7 +88,7 @@ def __init__(self, spark_session: ps.SparkSession, self.user_sampler = UserSampler(self.config["configurations"] ['user_sampling']) if self.config["configurations"]['data_sampling'] is None: - self.data_sampling = None + self.data_sampler = None else: self.data_sampler = DataSampler(self.config["configurations"] ['data_sampling']) @@ -256,7 +256,8 @@ def _read_variable_data_files( variable_data = RadarVariableData(df, self.df_type) else: df = reduce(self.unionByName, dfs) - self.data_sampler.sample_data(df) + if self.data_sampler is not None: + self.data_sampler.sample_data(df) variable_data = RadarVariableData(df, self.df_type) return variable_data From b4bc9d213a26284dc979d582bebe0b4eab5ccaa4 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 14 Feb 2024 18:30:59 +0530 Subject: [PATCH 24/32] Added data_sampline methods by: time, count & fraction --- config.yaml | 30 +++++++++-------- radarpipeline/common/utils.py | 10 ++++++ radarpipeline/datalib/radar_variable_data.py | 5 ++- radarpipeline/io/reader.py | 5 ++- radarpipeline/io/sampler.py | 34 ++++++++++++++++++-- radarpipeline/project/validations.py | 10 ++++++ 6 files changed, 73 insertions(+), 21 deletions(-) diff --git a/config.yaml b/config.yaml index c9cc44a..b836d64 100644 --- a/config.yaml +++ b/config.yaml @@ -25,11 +25,11 @@ input: configurations: df_type: 'pandas' - user_sampling: + #user_sampling: ## Possible methods: fraction, count, userid - method: fraction - config: - fraction: 0.8 + # method: fraction + # config: + # fraction: 0.8 #method: count #config: # count: 2 @@ -40,16 +40,18 @@ configurations: ## TODO: For future #data_sampling: ## Possible methods: time, count, fraction - # method: time - # config: - # starttime: - # endtime: - # method: count - # config: - # count: 
100 - # method: fraction - # config: - # fraction: 0.3 + ## starttime and endtime format is dd-mm-yyyy hh:mm:ss in UTC timezone + #method: time + #config: + # starttime: 2018-11-25 00:00:00 + # endtime: 2018-11-29 00:00:00 + # time_column: value.time + #method: count + #config: + # count: 100 + #method: fraction + #config: + # fraction: 0.3 features: - location: 'https://github.com/RADAR-base-Analytics/mockfeatures' diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index 200b8a9..d8dd514 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -10,6 +10,7 @@ import yaml from strictyaml import load, Map, Int, Str, Seq, Bool, Optional from strictyaml import YAMLError, CommaSeparated, MapPattern +from dateutil import parser import ntpath import posixpath @@ -292,6 +293,15 @@ def preprocess_time_data(data): return data +def convert_str_to_time(time): + try: + return parser.parse(time) + except ValueError: + raise ValueError( + "Invalid value for the key: time. It should be a valid time format" + ) + + class PySparkTestCase(unittest.TestCase): @classmethod def setUpClass(cls): diff --git a/radarpipeline/datalib/radar_variable_data.py b/radarpipeline/datalib/radar_variable_data.py index 1bf9fa6..d02c8ae 100644 --- a/radarpipeline/datalib/radar_variable_data.py +++ b/radarpipeline/datalib/radar_variable_data.py @@ -18,10 +18,13 @@ class RadarVariableData(Data): _data: DataType - def __init__(self, data: DataType, df_type: str = "pandas") -> None: + def __init__(self, data: DataType, df_type: str = "pandas", + data_sampler=None) -> None: self._data = data self.df_type = df_type self._preprocess_data() + if data_sampler is not None: + self._data = data_sampler.sample_data(self._data) def get_data(self) -> DataType: return self._data diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index 153ec4a..5dd5dff 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -256,9 +256,8 @@ def _read_variable_data_files( variable_data = RadarVariableData(df, self.df_type) else: df = reduce(self.unionByName, dfs) - if self.data_sampler is not None: - self.data_sampler.sample_data(df) - variable_data = RadarVariableData(df, self.df_type) + variable_data = RadarVariableData(df, self.df_type, + data_sampler=self.data_sampler) return variable_data def _read_data_from_old_format(self, source_path: str, user_data_dict: dict): diff --git a/radarpipeline/io/sampler.py b/radarpipeline/io/sampler.py index 884edc8..bbaad99 100644 --- a/radarpipeline/io/sampler.py +++ b/radarpipeline/io/sampler.py @@ -39,6 +39,34 @@ class DataSampler(Sampler): def __init__(self, config: Dict) -> None: super().__init__(config) - def sample_data(self, df): - # TODO: Implement this - return df + def sample_data(self, df: ps.DataFrame) -> Optional[ps.DataFrame]: + if self.config['method'] == "fraction": + fraction = self.config['config']['fraction'] + # sample fraction of the data + return self._sample_data(df, fraction) + elif self.config['method'] == "count": + count = self.config['config']['count'] + # sample count of the data + return self._sample_data(df, count / df.count()) + elif self.config['method'] == "time": + starttime = self.config['config'].get('starttime', None) + endtime = self.config['config'].get('endtime', None) + time_col = self.config['config'].get('time_col', "value.time") + # check if time_col is present in df + if time_col not in df.columns: + raise ValueError(f"Column {time_col} not found in the dataframe") + return 
self._sample_data_by_time(df, starttime, endtime, time_col) + + else: + raise ValueError("Invalid method") + + def _sample_data(self, df, fraction): + return df.sample(fraction=fraction, withReplacement=True) + + def _sample_data_by_time(self, df, starttime, endtime, time_col): + if endtime is None: + return df.filter(df[f"`{time_col}`"] >= starttime) + elif starttime is None: + return df.filter(df[f"`{time_col}`"] < endtime) + else: + return df.filter(df[f"`{time_col}`"].between(starttime, endtime)) diff --git a/radarpipeline/project/validations.py b/radarpipeline/project/validations.py index c38c74a..5e5a39d 100644 --- a/radarpipeline/project/validations.py +++ b/radarpipeline/project/validations.py @@ -353,4 +353,14 @@ def _validate_sampling_time(self, sampling_config): raise ValueError("Neither startime nor endtime present in the config") # check if starttime and endtime are can be converted into time format # if so, convert them + if "starttime" in sampling_config["config"]: + sampling_config["config"]['starttime'] = utils.convert_str_to_time( + sampling_config["config"]['starttime']) + if "endtime" in sampling_config["config"]: + sampling_config["config"]['endtime'] = utils.convert_str_to_time( + sampling_config["config"]['endtime']) + if "time_column" not in sampling_config["config"]: + logger.warning("time_column not present in the config. \ + Using default time column: value.time") + sampling_config["config"]["time_column"] = "value.time" return sampling_config From 96597988ce9bfed0e3dc5dd78d00f4a2951b2dba Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 14 Feb 2024 18:42:48 +0530 Subject: [PATCH 25/32] minor import changes --- radarpipeline/io/sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/radarpipeline/io/sampler.py b/radarpipeline/io/sampler.py index bbaad99..bcf3b1e 100644 --- a/radarpipeline/io/sampler.py +++ b/radarpipeline/io/sampler.py @@ -1,6 +1,7 @@ from radarpipeline.io.abc import Sampler from typing import Any, Dict, List, Optional, Union from random import sample +import pyspark.sql as ps import logging From c52281c3ab5d22180584e172849d7ce0e3e96010 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Mon, 29 Apr 2024 12:23:01 +0100 Subject: [PATCH 26/32] Added feature to access spark-cluster --- config.yaml | 3 +- radarpipeline/common/utils.py | 1 + radarpipeline/io/ingestion.py | 5 ++- radarpipeline/project/sparkengine.py | 46 ++++++++++++++++------------ 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/config.yaml b/config.yaml index b836d64..2c1d2f9 100644 --- a/config.yaml +++ b/config.yaml @@ -69,5 +69,4 @@ output: compress: false spark_config: - spark.executor.instances: 2 - spark.driver.memory: 13G \ No newline at end of file + spark_master: local # Could be local, and if using a remote spark://host:port diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index d8dd514..878dd1c 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -178,6 +178,7 @@ def get_yaml_schema() -> Map: "compress": Bool() }), Optional("spark_config"): Map({ + Optional("spark_master", default="local"): Str(), Optional("spark.executor.instances", default=4): Int(), Optional("spark.executor.cores", default=4): Int(), Optional("spark.executor.memory", default='10g'): Str(), diff --git a/radarpipeline/io/ingestion.py b/radarpipeline/io/ingestion.py index a87edfc..555d961 100644 --- a/radarpipeline/io/ingestion.py +++ b/radarpipeline/io/ingestion.py @@ -23,10 +23,13 @@ def modify_config(self, input_config, 
data_format) -> Dict: """ Modify the input configuration to include the variables of interest """ - config = {'input': {}} + config = {'input': {}, "configurations": {}} config['input'] = input_config config['input']['data_format'] = data_format config['input']['data_type'] = self.data_type + config['configurations']['df_type'] = "pandas" + config['configurations']['user_sampling'] = None + config['configurations']['data_sampling'] = None return config def read_data(self): diff --git a/radarpipeline/project/sparkengine.py b/radarpipeline/project/sparkengine.py index e2f83cc..e04f90e 100644 --- a/radarpipeline/project/sparkengine.py +++ b/radarpipeline/project/sparkengine.py @@ -14,7 +14,7 @@ class SparkEngine(): """ def __init__(self, spark_config: Dict = None): - default_spark_config = {'spark.executor.instances': 6, + default_spark_config = {'spark.executor.instances': 2, 'spark.driver.memory': '10G', 'spark.executor.cores': 4, 'spark.executor.memory': '10g', @@ -61,25 +61,31 @@ def initialize_spark_session(self) -> ps.SparkSession: all partitions for each Spark action (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited. """ - self.spark = ( - SparkSession.builder.master("local").appName("radarpipeline") - .config('spark.executor.instances', - self.spark_config['spark.executor.instances']) - .config('spark.executor.cores', - self.spark_config['spark.executor.cores']) - .config('spark.executor.memory', - self.spark_config['spark.executor.memory']) - .config('spark.driver.memory', - self.spark_config['spark.driver.memory']) - .config('spark.memory.offHeap.enabled', - self.spark_config['spark.memory.offHeap.enabled']) - .config('spark.memory.offHeap.size', - self.spark_config['spark.memory.offHeap.size']) - .config('spark.driver.maxResultSize', - self.spark_config['spark.driver.maxResultSize']) - .config('spark.log.level', - self.spark_config['spark.log.level']) - .getOrCreate() + if self.spark_config['spark_master'] == "local": + self.spark = ( + SparkSession.builder.master("local[*]").appName("radarpipeline") + .config('spark.executor.instances', + self.spark_config['spark.executor.instances']) + .config('spark.executor.cores', + self.spark_config['spark.executor.cores']) + .config('spark.executor.memory', + self.spark_config['spark.executor.memory']) + .config('spark.driver.memory', + self.spark_config['spark.driver.memory']) + .config('spark.memory.offHeap.enabled', + self.spark_config['spark.memory.offHeap.enabled']) + .config('spark.memory.offHeap.size', + self.spark_config['spark.memory.offHeap.size']) + .config('spark.driver.maxResultSize', + self.spark_config['spark.driver.maxResultSize']) + .config('spark.log.level', + self.spark_config['spark.log.level']) + .getOrCreate() + ) + else: + self.spark = ( + SparkSession.builder.master(self.spark_config['spark_master']).appName("radarpipeline") + .getOrCreate() ) self.spark._jsc.setLogLevel(self.spark_config['spark.log.level']) self.spark.sparkContext.setLogLevel("OFF") From 77bd07068ca112dfc99a9fef215eddb8191d2052 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Tue, 30 Apr 2024 10:37:27 +0100 Subject: [PATCH 27/32] Updated tests --- config.yaml | 5 +---- radarpipeline/project/sparkengine.py | 3 ++- requirements.txt | 1 + tests/tests_common/test_utils.py | 1 + tests/tests_project/test_sparkengine.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config.yaml b/config.yaml index 2c1d2f9..8c06604 100644 --- a/config.yaml +++ b/config.yaml @@ -66,7 +66,4 @@ output: config: target_path: 
output/mockdata data_format: csv - compress: false - -spark_config: - spark_master: local # Could be local, and if using a remote spark://host:port + compress: false \ No newline at end of file diff --git a/radarpipeline/project/sparkengine.py b/radarpipeline/project/sparkengine.py index e04f90e..4807a28 100644 --- a/radarpipeline/project/sparkengine.py +++ b/radarpipeline/project/sparkengine.py @@ -14,7 +14,8 @@ class SparkEngine(): """ def __init__(self, spark_config: Dict = None): - default_spark_config = {'spark.executor.instances': 2, + default_spark_config = {'spark_master': 'local', + 'spark.executor.instances': 2, 'spark.driver.memory': '10G', 'spark.executor.cores': 4, 'spark.executor.memory': '10g', diff --git a/requirements.txt b/requirements.txt index 0f3a101..0a13102 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,4 @@ pytest-mock pytest-sftpserver dhp avro==1.11.2 +pyarrow==16.0.0 diff --git a/tests/tests_common/test_utils.py b/tests/tests_common/test_utils.py index daf5b32..d79bd99 100644 --- a/tests/tests_common/test_utils.py +++ b/tests/tests_common/test_utils.py @@ -135,6 +135,7 @@ def test_read_yaml_with_spark_config(self): 'data_format': 'csv', 'compress': False}, 'spark_config': { + "spark_master": "local", "spark.executor.instances": 2, "spark.memory.offHeap.enabled": False, "spark.executor.cores": 4, diff --git a/tests/tests_project/test_sparkengine.py b/tests/tests_project/test_sparkengine.py index eef0465..1d9f0b2 100644 --- a/tests/tests_project/test_sparkengine.py +++ b/tests/tests_project/test_sparkengine.py @@ -5,7 +5,7 @@ class TestSparkDefaultConfig(unittest.TestCase): @classmethod def setUpClass(cls): - cls.default_spark_config = {'spark.executor.instances': 6, + cls.default_spark_config = {'spark.executor.instances': 2, 'spark.driver.memory': '10G', 'spark.executor.cores': 4, 'spark.executor.memory': '10g', From a542b557341550d3c8233d63eff989f282c478a4 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Tue, 30 Apr 2024 15:37:18 +0100 Subject: [PATCH 28/32] added tests to test different sampling configs --- radarpipeline/radarpipeline.py | 4 +- .../test_yamls/test_config_all_sampling.yaml | 69 ++++++++++++ .../test_yamls/test_config_date_sampling.yaml | 69 ++++++++++++ .../test_yamls/test_config_user_sampling.yaml | 69 ++++++++++++ tests/tests_common/test_utils.py | 106 +++++++++++++++++- 5 files changed, 314 insertions(+), 3 deletions(-) create mode 100644 tests/resources/test_yamls/test_config_all_sampling.yaml create mode 100644 tests/resources/test_yamls/test_config_date_sampling.yaml create mode 100644 tests/resources/test_yamls/test_config_user_sampling.yaml diff --git a/radarpipeline/radarpipeline.py b/radarpipeline/radarpipeline.py index c2465ad..70c810e 100644 --- a/radarpipeline/radarpipeline.py +++ b/radarpipeline/radarpipeline.py @@ -1,7 +1,7 @@ import logging import sys import traceback - +from typing import Dict, Union from radarpipeline import Project from radarpipeline.common.logger import logger_init @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -def run(config_path: str = "config.yaml"): +def run(config_path: Union[str, Dict] = "config.yaml"): """ Pipeline entry point. 
config_path could be a local path to a configuration file diff --git a/tests/resources/test_yamls/test_config_all_sampling.yaml b/tests/resources/test_yamls/test_config_all_sampling.yaml new file mode 100644 index 0000000..901e654 --- /dev/null +++ b/tests/resources/test_yamls/test_config_all_sampling.yaml @@ -0,0 +1,69 @@ +project: + project_name: mock_project + description: mock_description + version: mock_version + +input: + data_type: mock # couldbe mock, local, sftp, s3 + config: + # In case of sftp, use the following format + # sftp_host: + # sftp_source_path: + # sftp_username: + # sftp_private_key: + # sftp_target_path: + # In case of s3, use the following format + # aws_access_key_id: + # aws_secret_access_key: + # region_name: + # s3_access_url: + # bucket: + # prefix: + # In case of local or Mock, use the following format + source_path: mockdata/mockdata + data_format: csv + +configurations: + df_type: 'pandas' + user_sampling: + ## Possible methods: fraction, count, userid + # method: fraction + # config: + # fraction: 0.8 + method: count + config: + count: 2 + #method: userid + #config: + # userids: + # - 2a02e53a-951e-4fd0-b47f-195a87096bd0 + ## TODO: For future + data_sampling: + ## Possible methods: time, count, fraction + ## starttime and endtime format is dd-mm-yyyy hh:mm:ss in UTC timezone + #method: time + #config: + # starttime: 2018-11-25 00:00:00 + # endtime: 2018-11-29 00:00:00 + # time_column: value.time + method: count + config: + count: 100 + #method: fraction + #config: + # fraction: 0.3 + +features: + - location: 'https://github.com/RADAR-base-Analytics/mockfeatures' + branch: main + feature_groups: + - MockFeatureGroup + feature_names: + - all + +output: + output_location: local # can be local, postgres, sftp + config: + target_path: output/mockdata + data_format: csv + compress: false \ No newline at end of file diff --git a/tests/resources/test_yamls/test_config_date_sampling.yaml b/tests/resources/test_yamls/test_config_date_sampling.yaml new file mode 100644 index 0000000..fa540d9 --- /dev/null +++ b/tests/resources/test_yamls/test_config_date_sampling.yaml @@ -0,0 +1,69 @@ +project: + project_name: mock_project + description: mock_description + version: mock_version + +input: + data_type: mock # couldbe mock, local, sftp, s3 + config: + # In case of sftp, use the following format + # sftp_host: + # sftp_source_path: + # sftp_username: + # sftp_private_key: + # sftp_target_path: + # In case of s3, use the following format + # aws_access_key_id: + # aws_secret_access_key: + # region_name: + # s3_access_url: + # bucket: + # prefix: + # In case of local or Mock, use the following format + source_path: mockdata/mockdata + data_format: csv + +configurations: + df_type: 'pandas' + #user_sampling: + ## Possible methods: fraction, count, userid + # method: fraction + # config: + # fraction: 0.8 + #method: count + #config: + # count: 2 + #method: userid + #config: + # userids: + # - 2a02e53a-951e-4fd0-b47f-195a87096bd0 + ## TODO: For future + data_sampling: + ## Possible methods: time, count, fraction + ## starttime and endtime format is dd-mm-yyyy hh:mm:ss in UTC timezone + method: time + config: + starttime: 2018-11-25 00:00:00 + endtime: 2018-11-29 00:00:00 + time_column: value.time + #method: count + #config: + # count: 100 + #method: fraction + #config: + # fraction: 0.3 + +features: + - location: 'https://github.com/RADAR-base-Analytics/mockfeatures' + branch: main + feature_groups: + - MockFeatureGroup + feature_names: + - all + +output: + output_location: 
local # can be local, postgres, sftp + config: + target_path: output/mockdata + data_format: csv + compress: false \ No newline at end of file diff --git a/tests/resources/test_yamls/test_config_user_sampling.yaml b/tests/resources/test_yamls/test_config_user_sampling.yaml new file mode 100644 index 0000000..350e85d --- /dev/null +++ b/tests/resources/test_yamls/test_config_user_sampling.yaml @@ -0,0 +1,69 @@ +project: + project_name: mock_project + description: mock_description + version: mock_version + +input: + data_type: mock # couldbe mock, local, sftp, s3 + config: + # In case of sftp, use the following format + # sftp_host: + # sftp_source_path: + # sftp_username: + # sftp_private_key: + # sftp_target_path: + # In case of s3, use the following format + # aws_access_key_id: + # aws_secret_access_key: + # region_name: + # s3_access_url: + # bucket: + # prefix: + # In case of local or Mock, use the following format + source_path: mockdata/mockdata + data_format: csv + +configurations: + df_type: 'pandas' + user_sampling: + ## Possible methods: fraction, count, userid + # method: fraction + # config: + # fraction: 0.8 + #method: count + #config: + # count: 2 + method: userid + config: + userids: + - 2a02e53a-951e-4fd0-b47f-195a87096bd0 + ## TODO: For future + #data_sampling: + ## Possible methods: time, count, fraction + ## starttime and endtime format is dd-mm-yyyy hh:mm:ss in UTC timezone + #method: time + #config: + # starttime: 2018-11-25 00:00:00 + # endtime: 2018-11-29 00:00:00 + # time_column: value.time + #method: count + #config: + # count: 100 + #method: fraction + #config: + # fraction: 0.3 + +features: + - location: 'https://github.com/RADAR-base-Analytics/mockfeatures' + branch: main + feature_groups: + - MockFeatureGroup + feature_names: + - all + +output: + output_location: local # can be local, postgres, sftp + config: + target_path: output/mockdata + data_format: csv + compress: false \ No newline at end of file diff --git a/tests/tests_common/test_utils.py b/tests/tests_common/test_utils.py index d79bd99..1812b16 100644 --- a/tests/tests_common/test_utils.py +++ b/tests/tests_common/test_utils.py @@ -80,7 +80,9 @@ def setUp(self): "config_with_incorrect_spark.yaml" self.TESTDATA_FILENAME_WRONG = "tests/resources/config.yaml" self.TESTDATA_FILENAME_EMPTY = "tests/resources/test_config.yaml" - + self.TESTDATA_FILENAME_USER_SAMPLING = "tests/resources/test_yamls/test_config_user_sampling.yaml" + self.TESTDATA_FILENAME_DATA_SAMPLING = "tests/resources/test_yamls/test_config_date_sampling.yaml" + self.TESTDATA_FILENAME_ALL_SAMPLING = "tests/resources/test_yamls/test_config_all_sampling.yaml" def test_read_correct_yaml(self): config = read_yaml(self.TESTDATA_FILENAME) expected_config = { @@ -148,3 +150,105 @@ def test_read_yaml_with_spark_config(self): def test_read_yaml_with_incorrect_spark_config(self): with self.assertRaises(YAMLValidationError): read_yaml(self.TESTDATA_FILENAME_INCORRECT_SPARK) + + def test_read_yaml_with_user_sampling(self): + config = read_yaml(self.TESTDATA_FILENAME_USER_SAMPLING) + expected_config = { + 'project': { + 'project_name': 'mock_project', + 'description': 'mock_description', + 'version': 'mock_version'}, + 'input': { + 'data_type': 'mock', + 'config': {'source_path': 'mockdata/mockdata'}, + 'data_format': 'csv' + }, + 'configurations': { + 'df_type': 'pandas', + 'user_sampling': { + 'method': 'userid', + 'config': {'userids': ["2a02e53a-951e-4fd0-b47f-195a87096bd0"]} + } + }, + 'features': [{ + 'location': 
'https://github.com/RADAR-base-Analytics/mockfeatures', + 'branch': 'main', + 'feature_groups': ['MockFeatureGroup'], + 'feature_names': [['all']]}], + 'output': { + 'output_location': 'local', + 'config': {'target_path': 'output/mockdata'}, + 'data_format': 'csv', + 'compress': False}} + self.assertDictEqual(config, expected_config) + + def test_read_yaml_with_data_sampling(self): + config = read_yaml(self.TESTDATA_FILENAME_DATA_SAMPLING) + expected_config = { + 'project': { + 'project_name': 'mock_project', + 'description': 'mock_description', + 'version': 'mock_version'}, + 'input': { + 'data_type': 'mock', + 'config': {'source_path': 'mockdata/mockdata'}, + 'data_format': 'csv' + }, + 'configurations': { + 'df_type': 'pandas', + 'data_sampling': { + 'method': 'time', + 'config': { + 'starttime': '2018-11-25 00:00:00', + 'endtime': '2018-11-29 00:00:00', + 'time_column': 'value.time' + } + } + }, + 'features': [{ + 'location': 'https://github.com/RADAR-base-Analytics/mockfeatures', + 'branch': 'main', + 'feature_groups': ['MockFeatureGroup'], + 'feature_names': [['all']]}], + 'output': { + 'output_location': 'local', + 'config': {'target_path': 'output/mockdata'}, + 'data_format': 'csv', + 'compress': False} + } + self.assertDictEqual(config, expected_config) + + def test_read_yaml_with_all_sampling(self): + config = read_yaml(self.TESTDATA_FILENAME_ALL_SAMPLING) + expected_config = { + 'project': { + 'project_name': 'mock_project', + 'description': 'mock_description', + 'version': 'mock_version'}, + 'input': { + 'data_type': 'mock', + 'config': {'source_path': 'mockdata/mockdata'}, + 'data_format': 'csv' + }, + 'configurations': { + 'df_type': 'pandas', + 'user_sampling': { + 'method': 'count', + 'config': {'count': "2"} + }, + 'data_sampling': { + 'method': 'count', + 'config': {'count': "100"} + } + }, + 'features': [{ + 'location': 'https://github.com/RADAR-base-Analytics/mockfeatures', + 'branch': 'main', + 'feature_groups': ['MockFeatureGroup'], + 'feature_names': [['all']]}], + 'output': { + 'output_location': 'local', + 'config': {'target_path': 'output/mockdata'}, + 'data_format': 'csv', + 'compress': False}} + self.assertDictEqual(config, expected_config) From 6fc67c4710e24d3bc5954b6d3ffb9a53829d033a Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Wed, 1 May 2024 11:35:15 +0100 Subject: [PATCH 29/32] added pipeline tests for samplings --- tests/test_integration/test_sampling.py | 133 ++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 tests/test_integration/test_sampling.py diff --git a/tests/test_integration/test_sampling.py b/tests/test_integration/test_sampling.py new file mode 100644 index 0000000..a9680b6 --- /dev/null +++ b/tests/test_integration/test_sampling.py @@ -0,0 +1,133 @@ +import unittest +from radarpipeline import radarpipeline, Project +from strictyaml.exceptions import YAMLValidationError +import pathlib as pl +import os +import pandas as pd +from pandas.testing import assert_frame_equal + + +class TestSampling(unittest.TestCase): + + def setUp(self): + self.default_config = { + 'project': { + 'project_name': 'mock_project', + 'description': 'mock_description', + 'version': 'mock_version'}, + 'input': { + 'data_type': 'mock', + 'config': {'source_path': 'mockdata/mockdata'}, + 'data_format': 'csv' + }, + 'configurations': {'df_type': 'pandas'}, + 'features': [{ + 'location': 'https://github.com/RADAR-base-Analytics/mockfeatures', + 'branch': 'main', + 'feature_groups': ['MockFeatureGroup'], + 'feature_names': [['all']]}], + 
'output': { + 'output_location': 'local', + 'config': {'target_path': 'output/mockdata'}, + 'data_format': 'csv', + 'compress': False}} + + def get_config_output(self, config): + project = Project(input_data=config) + project.fetch_data() + project.compute_features() + output_data = project.features + project.close_spark_session() + return output_data + + def test_user_sampling_userid(self): + user_sampling_config = self.default_config + user_sampling_config['configurations']['user_sampling'] = {} + user_sampling_config['configurations']['user_sampling']['method'] = 'userid' + user_sampling_config['configurations']['user_sampling']['config'] = { + 'userids': ["2a02e53a-951e-4fd0-b47f-195a87096bd0"]} + output_data = self.get_config_output(user_sampling_config) + self.assertEqual(output_data['PhoneBatteryChargingDuration'][ + 'key.userId'].unique(), ['2a02e53a-951e-4fd0-b47f-195a87096bd0']) + self.assertEqual(output_data['StepCountPerDay']['key.userId'].unique(), + ['2a02e53a-951e-4fd0-b47f-195a87096bd0']) + + def test_user_sampling_count(self): + user_sampling_config = self.default_config + user_sampling_config['configurations']['user_sampling'] = {} + user_sampling_config['configurations']['user_sampling']['method'] = 'count' + user_sampling_config['configurations']['user_sampling']['config'] = {'count': 2} + output_data = self.get_config_output(user_sampling_config) + self.assertEqual(output_data['PhoneBatteryChargingDuration'][ + 'key.userId'].unique().shape[0], 2) + self.assertEqual(output_data['StepCountPerDay'][ + 'key.userId'].unique().shape[0], 2) + + def test_user_sampling_fraction(self): + user_sampling_config = self.default_config + user_sampling_config['configurations']['user_sampling'] = {} + user_sampling_config['configurations']['user_sampling']['method'] = 'fraction' + user_sampling_config['configurations']['user_sampling'][ + 'config'] = {'fraction': 0.75} + output_data = self.get_config_output(user_sampling_config) + self.assertEqual(output_data['PhoneBatteryChargingDuration'][ + 'key.userId'].unique().shape[0], 3) + self.assertEqual(output_data['StepCountPerDay'][ + 'key.userId'].unique().shape[0], 3) + + def test_data_sampling_time(self): + starttime = "2018-11-25 00:00:00" + endtime = "2018-11-29 00:00:00" + data_sampling_config = self.default_config + data_sampling_config['configurations']['data_sampling'] = {} + data_sampling_config['configurations']['data_sampling']['method'] = 'time' + data_sampling_config['configurations'][ + 'data_sampling']['config'] = { + 'starttime': starttime, + 'endtime': endtime, + 'time_column': 'value.time'} + output_data = self.get_config_output(data_sampling_config) + self.assertGreaterEqual(output_data['PhoneBatteryChargingDuration'][ + 'date'].min(), pd.Timestamp(starttime).date()) + self.assertLessEqual(output_data['PhoneBatteryChargingDuration'][ + 'date'].max(), pd.Timestamp(endtime).date()) + self.assertGreaterEqual(output_data['StepCountPerDay'][ + 'date'].min(), pd.Timestamp(starttime).date()) + self.assertLessEqual(output_data['StepCountPerDay'][ + 'date'].max(), pd.Timestamp(endtime).date()) + + def test_data_sampling_count(self): + data_sampling_config = self.default_config + data_sampling_config['configurations']['data_sampling'] = {} + data_sampling_config['configurations']['data_sampling']['method'] = 'count' + data_sampling_config['configurations']['data_sampling']['config'] = { + 'count': 100 + } + output_data = self.get_config_output(data_sampling_config) + # check if count of per key.userId is less than or equal to 100 
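For context on the assertions that follow: the count method added to DataSampler in PATCH 24 turns the requested row count into a sampling fraction (count / df.count()), and Spark's sample() treats that fraction as a per-row probability, so the sampled size is only approximately equal to the configured count. A minimal standalone sketch of the same idea; the helper name sample_by_count is chosen here for illustration and is not part of the pipeline:

from pyspark.sql import DataFrame, SparkSession


def sample_by_count(df: DataFrame, count: int) -> DataFrame:
    # Approximate a fixed-size random sample by converting the count to a fraction.
    total = df.count()
    if count >= total:
        return df
    # sample() is probabilistic, so expect roughly `count` rows, not exactly `count`;
    # df.limit(count) gives an exact but non-random cut-off instead.
    return df.sample(fraction=count / total, withReplacement=False)


if __name__ == "__main__":
    spark = SparkSession.builder.master("local[*]").appName("sampling-sketch").getOrCreate()
    print(sample_by_count(spark.range(1000), 100).count())  # roughly 100 rows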
+ self.assertTrue(all(output_data['PhoneBatteryChargingDuration'].groupby( + 'key.userId').size() <= 100)) + self.assertTrue(all(output_data['StepCountPerDay'].groupby( + 'key.userId').size() <= 100)) + + def test_user_data_sampling(self): + user_data_sampling_config = self.default_config + user_data_sampling_config['configurations']['user_sampling'] = {} + user_data_sampling_config['configurations']['user_sampling'][ + 'method'] = 'userid' + user_data_sampling_config['configurations']['user_sampling'][ + 'config'] = {'userids': ["2a02e53a-951e-4fd0-b47f-195a87096bd0"]} + user_data_sampling_config['configurations']['data_sampling'] = {} + user_data_sampling_config['configurations']['data_sampling'][ + 'method'] = 'count' + user_data_sampling_config['configurations']['data_sampling'][ + 'config'] = {'count': 100} + output_data = self.get_config_output(user_data_sampling_config) + self.assertEqual(output_data['PhoneBatteryChargingDuration'][ + 'key.userId'].unique(), ['2a02e53a-951e-4fd0-b47f-195a87096bd0']) + self.assertEqual(output_data['StepCountPerDay'][ + 'key.userId'].unique(), ['2a02e53a-951e-4fd0-b47f-195a87096bd0']) + self.assertTrue(all(output_data[ + 'PhoneBatteryChargingDuration'].groupby('key.userId').size() <= 100)) + self.assertTrue(all(output_data[ + 'StepCountPerDay'].groupby('key.userId').size() <= 100)) From 86360327ac2b3d5ce4f78c0e350feb33a2db02ee Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Mon, 3 Jun 2024 13:20:07 +0100 Subject: [PATCH 30/32] changed data_type to source_type in config.yaml --- config.yaml | 2 +- config.yaml.template | 2 +- radarpipeline/common/utils.py | 2 +- radarpipeline/io/ingestion.py | 6 +- radarpipeline/io/reader.py | 58 +++++++++---------- radarpipeline/project/project.py | 6 +- radarpipeline/project/validations.py | 6 +- .../config_with_incorrect_spark.yaml | 2 +- .../test_yamls/config_with_spark.yaml | 2 +- tests/resources/test_yamls/test_config.yaml | 2 +- .../test_yamls/test_config_all_sampling.yaml | 2 +- .../test_yamls/test_config_date_sampling.yaml | 2 +- .../test_yamls/test_config_incomplete.yaml | 2 +- .../test_yamls/test_config_input_invalid.yaml | 2 +- .../test_yamls/test_config_project.yaml | 2 +- .../test_yamls/test_config_user_sampling.yaml | 2 +- tests/test_integration/test_sampling.py | 2 +- tests/tests_common/test_utils.py | 10 ++-- tests/tests_project/test_validations.py | 10 ++-- 19 files changed, 61 insertions(+), 61 deletions(-) diff --git a/config.yaml b/config.yaml index 8c06604..8248287 100644 --- a/config.yaml +++ b/config.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/config.yaml.template b/config.yaml.template index e25b4e7..e39b700 100644 --- a/config.yaml.template +++ b/config.yaml.template @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: ## In case of sftp, use the following format # sftp_host: diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index 878dd1c..34135cd 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -150,7 +150,7 @@ def get_yaml_schema() -> Map: Optional("version"): Str() }), "input": Map({ - "data_type": Str(), + "source_type": Str(), "config": MapPattern(Str(), Str()), "data_format": Str() }), diff --git a/radarpipeline/io/ingestion.py 
b/radarpipeline/io/ingestion.py index 555d961..e56b31c 100644 --- a/radarpipeline/io/ingestion.py +++ b/radarpipeline/io/ingestion.py @@ -9,11 +9,11 @@ class CustomDataReader(): - def __init__(self, input_config, variables, data_type="local", data_format="csv", + def __init__(self, input_config, variables, source_type="local", data_format="csv", df_type="pandas") -> None: self.variables = variables self.data_format = data_format - self.data_type = data_type + self.source_type = source_type self.config = self.modify_config(input_config, data_format) self.sparkengine = SparkEngine() self.spark = self.sparkengine.initialize_spark_session() @@ -26,7 +26,7 @@ def modify_config(self, input_config, data_format) -> Dict: config = {'input': {}, "configurations": {}} config['input'] = input_config config['input']['data_format'] = data_format - config['input']['data_type'] = self.data_type + config['input']['source_type'] = self.source_type config['configurations']['df_type'] = "pandas" config['configurations']['user_sampling'] = None config['configurations']['data_sampling'] = None diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index 5dd5dff..488e6d3 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -63,7 +63,7 @@ def add_schema(self, schema_keys, schema): class Reader(): ''' Class for reading data from a file - Reader(data_type : str, data_path: str, variables: Union[str, List]) + Reader(source_type : str, data_path: str, variables: Union[str, List]) reader = Reader(...) reader.get_data(variables=Union[List, str]) reader.get_user_data(user_id=..) @@ -79,7 +79,7 @@ def __init__(self, spark_session: ps.SparkSession, df_type (str, optional): Type of dataframe format. Defaults to "pandas". """ self.config = config - self.data_type = self.config["input"]["data_format"] + self.source_type = self.config["input"]["data_format"] self.required_data = required_data self.df_type = df_type if self.config["configurations"]['user_sampling'] is None: @@ -93,7 +93,7 @@ def __init__(self, spark_session: ps.SparkSession, self.data_sampler = DataSampler(self.config["configurations"] ['data_sampling']) - if self.data_type in ['csv', 'csv.gz']: + if self.source_type in ['csv', 'csv.gz']: self.reader_class = SparkCSVDataReader(spark_session, config, required_data, df_type, self.user_sampler, self.data_sampler) @@ -492,13 +492,13 @@ def _recursive_schema_loader(self, record_schema, precursor="", schema_dict={}): else: return {} - def _get_field(self, data_type: Union[str, Dict, List]) -> Any: + def _get_field(self, source_type: Union[str, Dict, List]) -> Any: """ Returns a Spark data type for a given data type Parameters ---------- - data_type : Union[str, Dict] + source_type : Union[str, Dict] Data type to convert to a Spark data type Returns @@ -507,12 +507,12 @@ def _get_field(self, data_type: Union[str, Dict, List]) -> Any: A Spark data type """ - if type(data_type) is dict: - spark_data_type = self._get_data_type_from_dict(data_type) - elif type(data_type) is list: - spark_data_type = self._get_superior_type_from_list(data_type) + if type(source_type) is dict: + spark_data_type = self._get_data_type_from_dict(source_type) + elif type(source_type) is list: + spark_data_type = self._get_superior_type_from_list(source_type) else: - spark_data_type = self._get_data_type_from_mapping(data_type) + spark_data_type = self._get_data_type_from_mapping(source_type) return spark_data_type @@ -522,13 +522,13 @@ def _resolve_union_schema(self, union_schemas: List[Schema]): 
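The schema helpers renamed in this hunk are the ones that map Avro field types onto Spark SQL types; a union such as ["null", "double"] has its "null" member dropped and the remaining members collapsed into a single Spark type. A rough standalone sketch of one way to do that collapse, picking the widest type via a hand-rolled precedence table (an assumption made here for illustration; the pipeline keeps its actual mapping in radarpipeline.common.constants):

from pyspark.sql.types import BooleanType, DataType, DoubleType, LongType, StringType

# Illustrative mapping only, not the pipeline's own table.
AVRO_TO_SPARK = {
    "boolean": BooleanType(),
    "int": LongType(),
    "long": LongType(),
    "float": DoubleType(),
    "double": DoubleType(),
    "string": StringType(),
}
# Widest type last, so it wins when a union mixes several primitive types.
PRECEDENCE = [BooleanType(), LongType(), DoubleType(), StringType()]


def resolve_union(avro_types: list) -> DataType:
    # Drop "null" members and fall back to StringType for anything unknown.
    candidates = [AVRO_TO_SPARK.get(t, StringType()) for t in avro_types if t != "null"]
    if not candidates:
        return StringType()
    return max(candidates, key=PRECEDENCE.index)


print(resolve_union(["null", "double"]))  # DoubleType()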
list_type.append(schema.type) return self._get_superior_type_from_list(list_type) - def _handle_unknown_data_type(self, data_type: Union[str, Dict, List]) -> Any: + def _handle_unknown_data_type(self, source_type: Union[str, Dict, List]) -> Any: """ Handles unknown data types Parameters ---------- - data_type : Union[str, Dict] + source_type : Union[str, Dict] Data type to handle Returns @@ -537,16 +537,16 @@ def _handle_unknown_data_type(self, data_type: Union[str, Dict, List]) -> Any: A Spark data type """ - logger.warning(f"Unknown data type: {data_type}. Returning String type.") + logger.warning(f"Unknown data type: {source_type}. Returning String type.") return constants.STRING_TYPE - def _get_data_type_from_mapping(self, data_type: Union[str, Dict, List]) -> Any: + def _get_data_type_from_mapping(self, source_type: Union[str, Dict, List]) -> Any: """ Returns a Spark data type for a given data type Parameters ---------- - data_type : str + source_type : str Data type to convert to a Spark data type Returns @@ -555,20 +555,20 @@ def _get_data_type_from_mapping(self, data_type: Union[str, Dict, List]) -> Any: A Spark data type """ - if data_type in constants.DATA_TYPE_MAPPING: - spark_data_type = constants.DATA_TYPE_MAPPING[data_type] + if source_type in constants.DATA_TYPE_MAPPING: + spark_data_type = constants.DATA_TYPE_MAPPING[source_type] else: - spark_data_type = self._handle_unknown_data_type(data_type) + spark_data_type = self._handle_unknown_data_type(source_type) return spark_data_type - def _get_data_type_from_dict(self, data_type: Dict) -> Any: + def _get_data_type_from_dict(self, source_type: Dict) -> Any: """ Returns a Spark data type for a given data type Parameters ---------- - data_type : Dict + source_type : Dict Data type to convert to a Spark data type Returns @@ -577,10 +577,10 @@ def _get_data_type_from_dict(self, data_type: Dict) -> Any: A Spark data type """ - if "type" in data_type: - return self._get_field(data_type["type"]) + if "type" in source_type: + return self._get_field(source_type["type"]) else: - return self._handle_unknown_data_type(data_type) + return self._handle_unknown_data_type(source_type) def _get_superior_type_from_list(self, data_type_list: List[Any]) -> Any: """ @@ -602,13 +602,13 @@ def _get_superior_type_from_list(self, data_type_list: List[Any]) -> Any: if "null" in spark_data_type_list: spark_data_type_list.remove("null") - for index, data_type in enumerate(spark_data_type_list): - if type(data_type) is dict: - spark_data_type_list[index] = self._get_data_type_from_dict(data_type) - elif data_type in constants.DATA_TYPE_MAPPING: - spark_data_type_list[index] = constants.DATA_TYPE_MAPPING[data_type] + for index, source_type in enumerate(spark_data_type_list): + if type(source_type) is dict: + spark_data_type_list[index] = self._get_data_type_from_dict(source_type) + elif source_type in constants.DATA_TYPE_MAPPING: + spark_data_type_list[index] = constants.DATA_TYPE_MAPPING[source_type] else: - spark_data_type_list[index] = self._handle_unknown_data_type(data_type) + spark_data_type_list[index] = self._handle_unknown_data_type(source_type) if len(data_type_list) == 0: return constants.STRING_TYPE diff --git a/radarpipeline/project/project.py b/radarpipeline/project/project.py index 4dd7f26..86bfc3b 100644 --- a/radarpipeline/project/project.py +++ b/radarpipeline/project/project.py @@ -227,7 +227,7 @@ def fetch_data(self) -> None: Fetches the data from the data source """ - if self.config["input"]["data_type"] == "local": + if 
self.config["input"]["source_type"] == "local": datareader = Reader( self.spark_session, self.config, @@ -236,7 +236,7 @@ def fetch_data(self) -> None: ) self.data = datareader.read_data() - elif self.config["input"]["data_type"] == "mock": + elif self.config["input"]["source_type"] == "mock": MOCK_URL = "https://github.com/RADAR-base-Analytics/mockdata" cache_dir = os.path.join( os.path.expanduser("~"), ".cache", "radarpipeline", "mockdata") @@ -257,7 +257,7 @@ def fetch_data(self) -> None: ) self.data = datareader.read_data() - elif self.config["input"]["data_type"] == "sftp": + elif self.config["input"]["source_type"] == "sftp": sftp_data_reader = SftpDataReader(self.config["input"]["config"], self.total_required_data) root_dir = sftp_data_reader.get_root_dir() diff --git a/radarpipeline/project/validations.py b/radarpipeline/project/validations.py index 5e5a39d..da9fea6 100644 --- a/radarpipeline/project/validations.py +++ b/radarpipeline/project/validations.py @@ -59,7 +59,7 @@ def _validate_input(self) -> None: Validates the input data config """ - if self.config["input"]["data_type"] == "sftp": + if self.config["input"]["source_type"] == "sftp": sftp_config_keys = [ "sftp_host", "sftp_username", @@ -70,7 +70,7 @@ def _validate_input(self) -> None: if key not in self.config["input"]["config"]: raise ValueError(f"Key not present in the config: {key}") - elif self.config["input"]["data_type"] == "local": + elif self.config["input"]["source_type"] == "local": if "source_path" not in self.config["input"]["config"]: raise ValueError("Key not present in the config: source_path") else: @@ -93,7 +93,7 @@ def _validate_input(self) -> None: if self.config["input"]["data_format"] not in self.valid_input_formats: raise ValueError("Invalid value for key in input: data_format") - elif self.config["input"]["data_type"] == "mock": + elif self.config["input"]["source_type"] == "mock": self._update_mock_data() else: diff --git a/tests/resources/test_yamls/config_with_incorrect_spark.yaml b/tests/resources/test_yamls/config_with_incorrect_spark.yaml index 8c784de..e44ca4b 100644 --- a/tests/resources/test_yamls/config_with_incorrect_spark.yaml +++ b/tests/resources/test_yamls/config_with_incorrect_spark.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/resources/test_yamls/config_with_spark.yaml b/tests/resources/test_yamls/config_with_spark.yaml index 728f68f..4965022 100644 --- a/tests/resources/test_yamls/config_with_spark.yaml +++ b/tests/resources/test_yamls/config_with_spark.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/resources/test_yamls/test_config.yaml b/tests/resources/test_yamls/test_config.yaml index 6368c18..f2e8885 100644 --- a/tests/resources/test_yamls/test_config.yaml +++ b/tests/resources/test_yamls/test_config.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/resources/test_yamls/test_config_all_sampling.yaml b/tests/resources/test_yamls/test_config_all_sampling.yaml index 901e654..d48363c 100644 --- 
a/tests/resources/test_yamls/test_config_all_sampling.yaml +++ b/tests/resources/test_yamls/test_config_all_sampling.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/resources/test_yamls/test_config_date_sampling.yaml b/tests/resources/test_yamls/test_config_date_sampling.yaml index fa540d9..7b2ae83 100644 --- a/tests/resources/test_yamls/test_config_date_sampling.yaml +++ b/tests/resources/test_yamls/test_config_date_sampling.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/resources/test_yamls/test_config_incomplete.yaml b/tests/resources/test_yamls/test_config_incomplete.yaml index 22fa2b4..2e5e7b7 100644 --- a/tests/resources/test_yamls/test_config_incomplete.yaml +++ b/tests/resources/test_yamls/test_config_incomplete.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/resources/test_yamls/test_config_input_invalid.yaml b/tests/resources/test_yamls/test_config_input_invalid.yaml index ad2e3da..689e7d7 100644 --- a/tests/resources/test_yamls/test_config_input_invalid.yaml +++ b/tests/resources/test_yamls/test_config_input_invalid.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/resources/test_yamls/test_config_project.yaml b/tests/resources/test_yamls/test_config_project.yaml index e520122..929a61b 100644 --- a/tests/resources/test_yamls/test_config_project.yaml +++ b/tests/resources/test_yamls/test_config_project.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/resources/test_yamls/test_config_user_sampling.yaml b/tests/resources/test_yamls/test_config_user_sampling.yaml index 350e85d..2689954 100644 --- a/tests/resources/test_yamls/test_config_user_sampling.yaml +++ b/tests/resources/test_yamls/test_config_user_sampling.yaml @@ -4,7 +4,7 @@ project: version: mock_version input: - data_type: mock # couldbe mock, local, sftp, s3 + source_type: mock # couldbe mock, local, sftp, s3 config: # In case of sftp, use the following format # sftp_host: diff --git a/tests/test_integration/test_sampling.py b/tests/test_integration/test_sampling.py index a9680b6..e486d9c 100644 --- a/tests/test_integration/test_sampling.py +++ b/tests/test_integration/test_sampling.py @@ -16,7 +16,7 @@ def setUp(self): 'description': 'mock_description', 'version': 'mock_version'}, 'input': { - 'data_type': 'mock', + 'source_type': 'mock', 'config': {'source_path': 'mockdata/mockdata'}, 'data_format': 'csv' }, diff --git a/tests/tests_common/test_utils.py b/tests/tests_common/test_utils.py index 1812b16..0e3fe4b 100644 --- a/tests/tests_common/test_utils.py +++ b/tests/tests_common/test_utils.py @@ -91,7 +91,7 @@ def test_read_correct_yaml(self): 
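Since this patch renames the input key from data_type to source_type in the strictyaml schema and in every fixture, a config written against the old key will now fail validation when it is loaded. A small hypothetical migration helper, not part of the pipeline and shown only as a sketch (note that yaml.safe_load drops any comments from the rewritten file):

import yaml


def migrate_config(path: str) -> None:
    # Rename input.data_type to input.source_type in an old config file.
    with open(path) as fh:
        config = yaml.safe_load(fh)
    input_section = config.get("input", {})
    if "data_type" in input_section and "source_type" not in input_section:
        input_section["source_type"] = input_section.pop("data_type")
        with open(path, "w") as fh:
            yaml.safe_dump(config, fh, sort_keys=False)


# Usage: migrate_config("config.yaml")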
'description': 'mock_description', 'version': 'mock_version'}, 'input': { - 'data_type': 'mock', + 'source_type': 'mock', 'config': {'source_path': 'mockdata/mockdata'}, 'data_format': 'csv' }, @@ -121,7 +121,7 @@ def test_read_yaml_with_spark_config(self): 'description': 'mock_description', 'version': 'mock_version'}, 'input': { - 'data_type': 'mock', + 'source_type': 'mock', 'config': {'source_path': 'mockdata/mockdata'}, 'data_format': 'csv' }, @@ -159,7 +159,7 @@ def test_read_yaml_with_user_sampling(self): 'description': 'mock_description', 'version': 'mock_version'}, 'input': { - 'data_type': 'mock', + 'source_type': 'mock', 'config': {'source_path': 'mockdata/mockdata'}, 'data_format': 'csv' }, @@ -190,7 +190,7 @@ def test_read_yaml_with_data_sampling(self): 'description': 'mock_description', 'version': 'mock_version'}, 'input': { - 'data_type': 'mock', + 'source_type': 'mock', 'config': {'source_path': 'mockdata/mockdata'}, 'data_format': 'csv' }, @@ -226,7 +226,7 @@ def test_read_yaml_with_all_sampling(self): 'description': 'mock_description', 'version': 'mock_version'}, 'input': { - 'data_type': 'mock', + 'source_type': 'mock', 'config': {'source_path': 'mockdata/mockdata'}, 'data_format': 'csv' }, diff --git a/tests/tests_project/test_validations.py b/tests/tests_project/test_validations.py index 33b7a94..9d292e9 100644 --- a/tests/tests_project/test_validations.py +++ b/tests/tests_project/test_validations.py @@ -9,7 +9,7 @@ 'description': 'mock_description', 'version': 'mock_version'}, 'input': { - 'data_type': 'mock', + 'source_type': 'mock', 'config': { 'source_path': 'mockdata/mockdata'}, 'data_format': 'csv'}, @@ -40,7 +40,7 @@ def test_validate_mock(self): def test_validate_sftp(self): config = self.mock_config - config['input']['data_type'] = 'sftp' + config['input']['source_type'] = 'sftp' config['input']['config'] = { "sftp_host": "mock_host", "sftp_source_path": "mock_source_path", @@ -54,7 +54,7 @@ def test_validate_sftp(self): def test_validate_invalid_sftp(self): config = self.mock_config - config['input']['data_type'] = 'sftp' + config['input']['source_type'] = 'sftp' config['input']['config'] = { "sftp_host": "mock_host", "sftp_source_path": "mock_source_path", @@ -68,7 +68,7 @@ def test_validate_invalid_sftp(self): def test_validate_local(self): config = self.mock_config - config['input']['data_type'] = 'local' + config['input']['source_type'] = 'local' config['input']['config'] = { "source_path": "mockdata/mockdata" } @@ -78,7 +78,7 @@ def test_validate_local(self): def test_validate_local_wrong_source_path(self): config = self.mock_config - config['input']['data_type'] = 'local' + config['input']['source_type'] = 'local' config['input']['config'] = { "source_path": "xyz" } From 4ce5b82b51b65de6ffb4540eef4a1eb4638e4372 Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Tue, 4 Jun 2024 14:44:44 +0100 Subject: [PATCH 31/32] Added multiple time ranges compatibility with data sampling --- config.yaml | 15 ++++++----- config.yaml.template | 32 +++++++++++++++++++++- radarpipeline/common/utils.py | 2 +- radarpipeline/io/sampler.py | 17 ++++++++++-- radarpipeline/project/validations.py | 40 +++++++++++++++++++--------- 5 files changed, 83 insertions(+), 23 deletions(-) diff --git a/config.yaml b/config.yaml index 8248287..e59dc70 100644 --- a/config.yaml +++ b/config.yaml @@ -38,14 +38,17 @@ configurations: # userids: # - 2a02e53a-951e-4fd0-b47f-195a87096bd0 ## TODO: For future - #data_sampling: + data_sampling: ## Possible methods: time, count, fraction ## starttime 
and endtime format is dd-mm-yyyy hh:mm:ss in UTC timezone - #method: time - #config: - # starttime: 2018-11-25 00:00:00 - # endtime: 2018-11-29 00:00:00 - # time_column: value.time + ## It is possible to have multiple time ranges. See below Example + method: time + config: + - starttime: 2018-11-22 00:00:00 + endtime: 2018-11-26 00:00:00 + time_column: value.time + #- starttime: 2018-12-27 00:00:00 + # time_column: value.time #method: count #config: # count: 100 diff --git a/config.yaml.template b/config.yaml.template index e39b700..152faa7 100644 --- a/config.yaml.template +++ b/config.yaml.template @@ -24,7 +24,37 @@ input: data_format: csv configurations: - df_type: 'pandas' # can be pandas or spark + df_type: 'pandas' + #user_sampling: + ## Possible methods: fraction, count, userid + # method: fraction + # config: + # fraction: 0.8 + #method: count + #config: + # count: 2 + #method: userid + #config: + # userids: + # - 2a02e53a-951e-4fd0-b47f-195a87096bd0 + ## TODO: For future + data_sampling: + ## Possible methods: time, count, fraction + ## starttime and endtime format is dd-mm-yyyy hh:mm:ss in UTC timezone + ## It is possible to have multiple time ranges. See below Example + #method: time + #config: + #- starttime: 2018-11-22 00:00:00 + # endtime: 2018-11-26 00:00:00 + # time_column: value.time + #- starttime: 2018-12-27 00:00:00 + # time_column: value.time + #method: count + #config: + # count: 100 + #method: fraction + #config: + # fraction: 0.3 features: - location: 'https://github.com/RADAR-base-Analytics/mockfeatures' diff --git a/radarpipeline/common/utils.py b/radarpipeline/common/utils.py index 34135cd..2b935d9 100644 --- a/radarpipeline/common/utils.py +++ b/radarpipeline/common/utils.py @@ -162,7 +162,7 @@ def get_yaml_schema() -> Map: }), Optional("data_sampling"): Map({ "method": Str(), - "config": MapPattern(Str(), Str()), + "config": MapPattern(Str(), Str()) | Seq(MapPattern(Str(), Str())), }), }), "features": Seq(Map({ diff --git a/radarpipeline/io/sampler.py b/radarpipeline/io/sampler.py index bcf3b1e..f513f98 100644 --- a/radarpipeline/io/sampler.py +++ b/radarpipeline/io/sampler.py @@ -49,7 +49,9 @@ def sample_data(self, df: ps.DataFrame) -> Optional[ps.DataFrame]: count = self.config['config']['count'] # sample count of the data return self._sample_data(df, count / df.count()) - elif self.config['method'] == "time": + elif self.config['method'] == "time" and not (type(self.config['config']) is list and len(self.config['config']) != 1): + if len(self.config['config']) == 1: + self.config['config'] = self.config['config'][0] starttime = self.config['config'].get('starttime', None) endtime = self.config['config'].get('endtime', None) time_col = self.config['config'].get('time_col', "value.time") @@ -57,7 +59,18 @@ def sample_data(self, df: ps.DataFrame) -> Optional[ps.DataFrame]: if time_col not in df.columns: raise ValueError(f"Column {time_col} not found in the dataframe") return self._sample_data_by_time(df, starttime, endtime, time_col) - + elif self.config['method'] == "time" and type(self.config['config']) is list: + sampled_dfs = [] + for time_config in self.config['config']: + starttime = time_config.get('starttime', None) + endtime = time_config.get('endtime', None) + time_col = time_config.get('time_col', "value.time") + # check if time_col is present in df + if time_col not in df.columns: + raise ValueError(f"Column {time_col} not found in the dataframe") + sampled_dfs.append(self._sample_data_by_time(df, starttime, endtime, time_col)) + # combined 
sampled_dfs into one dataframe and deduplicate it + return sampled_dfs[0].union(*sampled_dfs[1:]).distinct() else: raise ValueError("Invalid method") diff --git a/radarpipeline/project/validations.py b/radarpipeline/project/validations.py index da9fea6..e740d9a 100644 --- a/radarpipeline/project/validations.py +++ b/radarpipeline/project/validations.py @@ -344,23 +344,37 @@ def _validate_sampling_userids(self, sampling_config): ] return sampling_config - def _validate_sampling_time(self, sampling_config): - if "config" not in sampling_config: - raise ValueError("Key not present in the user_sampling config: config") - if ("starttime" not in sampling_config["config"] + def _validate_sampling_time_instance(self, time_dict: dict): + if ("starttime" not in time_dict ) and ( - "endtime" not in sampling_config["config"]): + "endtime" not in time_dict): raise ValueError("Neither startime nor endtime present in the config") # check if starttime and endtime are can be converted into time format # if so, convert them - if "starttime" in sampling_config["config"]: - sampling_config["config"]['starttime'] = utils.convert_str_to_time( - sampling_config["config"]['starttime']) - if "endtime" in sampling_config["config"]: - sampling_config["config"]['endtime'] = utils.convert_str_to_time( - sampling_config["config"]['endtime']) - if "time_column" not in sampling_config["config"]: + if "starttime" in time_dict: + time_dict['starttime'] = utils.convert_str_to_time( + time_dict['starttime']) + if "endtime" in time_dict: + time_dict['endtime'] = utils.convert_str_to_time( + time_dict['endtime']) + if "time_column" not in time_dict: logger.warning("time_column not present in the config. \ Using default time column: value.time") - sampling_config["config"]["time_column"] = "value.time" + time_dict["time_column"] = "value.time" + return time_dict + + def _validate_sampling_time(self, sampling_config): + if "config" not in sampling_config: + raise ValueError("Key not present in the user_sampling config: config") + if type(sampling_config["config"]) is list: + if len(sampling_config["config"]) == 0: + raise ValueError( + "No starttime and endtime present in the config" + ) + for i, time_dict in enumerate(sampling_config["config"]): + sampling_config["config"][i] = self._validate_sampling_time_instance( + time_dict) + else: + sampling_config["config"] = self._validate_sampling_time_instance( + sampling_config["config"]) return sampling_config From 0351724a31ae6c8b371a38aa728dae47af7adeeb Mon Sep 17 00:00:00 2001 From: Heet Sankesara Date: Tue, 4 Jun 2024 15:29:59 +0100 Subject: [PATCH 32/32] Added tests for multiple time ranges --- config.yaml | 12 +-- radarpipeline/io/reader.py | 3 +- radarpipeline/io/sampler.py | 7 +- radarpipeline/project/sparkengine.py | 45 ++++----- tests/test_integration/test_sampling.py | 123 ++++++++++++++++++++++++ 5 files changed, 159 insertions(+), 31 deletions(-) diff --git a/config.yaml b/config.yaml index e59dc70..da635f3 100644 --- a/config.yaml +++ b/config.yaml @@ -38,15 +38,15 @@ configurations: # userids: # - 2a02e53a-951e-4fd0-b47f-195a87096bd0 ## TODO: For future - data_sampling: + #data_sampling: ## Possible methods: time, count, fraction ## starttime and endtime format is dd-mm-yyyy hh:mm:ss in UTC timezone ## It is possible to have multiple time ranges. 
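A minimal, self-contained sketch of how a list of time ranges like the one configured here could be applied to a Spark DataFrame: filter once per range, union the pieces, and de-duplicate rows caught by overlapping ranges. This is an illustration, not the code in radarpipeline/io/sampler.py; the helper name sample_by_time_ranges and the toy data are assumptions, and it compares the zero-padded timestamp strings directly, whereas the pipeline converts them with utils.convert_str_to_time first.

    from functools import reduce

    from pyspark.sql import DataFrame, SparkSession
    import pyspark.sql.functions as F


    def sample_by_time_ranges(df: DataFrame, ranges: list,
                              time_col: str = "value.time") -> DataFrame:
        # Backticks are required because the column name literally contains a dot.
        col = F.col(f"`{time_col}`")
        pieces = []
        for r in ranges:
            piece = df
            if r.get("starttime") is not None:
                piece = piece.filter(col >= r["starttime"])
            if r.get("endtime") is not None:
                piece = piece.filter(col <= r["endtime"])
            pieces.append(piece)
        # Pairwise union via reduce handles any number of ranges; distinct()
        # drops rows that fall into more than one (overlapping) range.
        return reduce(DataFrame.union, pieces).distinct()


    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame(
        [("2018-11-23 10:00:00",), ("2018-10-01 08:00:00",), ("2018-12-28 09:30:00",)],
        ["value.time"],
    )
    ranges = [
        {"starttime": "2018-11-22 00:00:00", "endtime": "2018-11-26 00:00:00"},
        {"starttime": "2018-12-27 00:00:00"},  # open-ended: no endtime
    ]
    sample_by_time_ranges(df, ranges).show()  # keeps the 2018-11-23 and 2018-12-28 rows
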
See below Example - method: time - config: - - starttime: 2018-11-22 00:00:00 - endtime: 2018-11-26 00:00:00 - time_column: value.time + #method: time + #config: + #- starttime: 2018-11-22 00:00:00 + # endtime: 2018-11-26 00:00:00 + # time_column: value.time #- starttime: 2018-12-27 00:00:00 # time_column: value.time #method: count diff --git a/radarpipeline/io/reader.py b/radarpipeline/io/reader.py index 488e6d3..4cffa40 100644 --- a/radarpipeline/io/reader.py +++ b/radarpipeline/io/reader.py @@ -608,7 +608,8 @@ def _get_superior_type_from_list(self, data_type_list: List[Any]) -> Any: elif source_type in constants.DATA_TYPE_MAPPING: spark_data_type_list[index] = constants.DATA_TYPE_MAPPING[source_type] else: - spark_data_type_list[index] = self._handle_unknown_data_type(source_type) + spark_data_type_list[index] = self._handle_unknown_data_type( + source_type) if len(data_type_list) == 0: return constants.STRING_TYPE diff --git a/radarpipeline/io/sampler.py b/radarpipeline/io/sampler.py index f513f98..92a5838 100644 --- a/radarpipeline/io/sampler.py +++ b/radarpipeline/io/sampler.py @@ -49,7 +49,9 @@ def sample_data(self, df: ps.DataFrame) -> Optional[ps.DataFrame]: count = self.config['config']['count'] # sample count of the data return self._sample_data(df, count / df.count()) - elif self.config['method'] == "time" and not (type(self.config['config']) is list and len(self.config['config']) != 1): + elif (self.config['method'] == "time" + and not (type(self.config['config']) is list + and len(self.config['config']) != 1)): if len(self.config['config']) == 1: self.config['config'] = self.config['config'][0] starttime = self.config['config'].get('starttime', None) @@ -68,7 +70,8 @@ def sample_data(self, df: ps.DataFrame) -> Optional[ps.DataFrame]: # check if time_col is present in df if time_col not in df.columns: raise ValueError(f"Column {time_col} not found in the dataframe") - sampled_dfs.append(self._sample_data_by_time(df, starttime, endtime, time_col)) + sampled_dfs.append( + self._sample_data_by_time(df, starttime, endtime, time_col)) # combined sampled_dfs into one dataframe and deduplicate it return sampled_dfs[0].union(*sampled_dfs[1:]).distinct() else: diff --git a/radarpipeline/project/sparkengine.py b/radarpipeline/project/sparkengine.py index 4807a28..616a5fc 100644 --- a/radarpipeline/project/sparkengine.py +++ b/radarpipeline/project/sparkengine.py @@ -63,31 +63,32 @@ def initialize_spark_session(self) -> ps.SparkSession: Should be at least 1M, or 0 for unlimited. 
""" if self.spark_config['spark_master'] == "local": - self.spark = ( - SparkSession.builder.master("local[*]").appName("radarpipeline") - .config('spark.executor.instances', - self.spark_config['spark.executor.instances']) - .config('spark.executor.cores', - self.spark_config['spark.executor.cores']) - .config('spark.executor.memory', - self.spark_config['spark.executor.memory']) - .config('spark.driver.memory', - self.spark_config['spark.driver.memory']) - .config('spark.memory.offHeap.enabled', - self.spark_config['spark.memory.offHeap.enabled']) - .config('spark.memory.offHeap.size', - self.spark_config['spark.memory.offHeap.size']) - .config('spark.driver.maxResultSize', - self.spark_config['spark.driver.maxResultSize']) - .config('spark.log.level', - self.spark_config['spark.log.level']) - .getOrCreate() - ) + self.spark = ( + SparkSession.builder.master("local[*]").appName("radarpipeline") + .config('spark.executor.instances', + self.spark_config['spark.executor.instances']) + .config('spark.executor.cores', + self.spark_config['spark.executor.cores']) + .config('spark.executor.memory', + self.spark_config['spark.executor.memory']) + .config('spark.driver.memory', + self.spark_config['spark.driver.memory']) + .config('spark.memory.offHeap.enabled', + self.spark_config['spark.memory.offHeap.enabled']) + .config('spark.memory.offHeap.size', + self.spark_config['spark.memory.offHeap.size']) + .config('spark.driver.maxResultSize', + self.spark_config['spark.driver.maxResultSize']) + .config('spark.log.level', + self.spark_config['spark.log.level']) + .getOrCreate() + ) else: self.spark = ( - SparkSession.builder.master(self.spark_config['spark_master']).appName("radarpipeline") + SparkSession.builder.master( + self.spark_config['spark_master']).appName("radarpipeline") .getOrCreate() - ) + ) self.spark._jsc.setLogLevel(self.spark_config['spark.log.level']) self.spark.sparkContext.setLogLevel("OFF") # Enable Apache Arrow for optimizations in Spark to Pandas conversion diff --git a/tests/test_integration/test_sampling.py b/tests/test_integration/test_sampling.py index e486d9c..c0286f5 100644 --- a/tests/test_integration/test_sampling.py +++ b/tests/test_integration/test_sampling.py @@ -96,6 +96,129 @@ def test_data_sampling_time(self): self.assertLessEqual(output_data['StepCountPerDay'][ 'date'].max(), pd.Timestamp(endtime).date()) + def test_data_sampling_time_list(self): + starttime = "2018-11-25 00:00:00" + endtime = "2018-11-29 00:00:00" + time_list = [{"starttime": starttime, "endtime": endtime, + "time_column": 'value.time'}] + data_sampling_config = self.default_config + data_sampling_config['configurations']['data_sampling'] = {} + data_sampling_config['configurations']['data_sampling']['method'] = 'time' + data_sampling_config['configurations'][ + 'data_sampling']['config'] = time_list + output_data = self.get_config_output(data_sampling_config) + self.assertGreaterEqual(output_data['PhoneBatteryChargingDuration'][ + 'date'].min(), pd.Timestamp(starttime).date()) + self.assertLessEqual(output_data['PhoneBatteryChargingDuration'][ + 'date'].max(), pd.Timestamp(endtime).date()) + self.assertGreaterEqual(output_data['StepCountPerDay'][ + 'date'].min(), pd.Timestamp(starttime).date()) + self.assertLessEqual(output_data['StepCountPerDay'][ + 'date'].max(), pd.Timestamp(endtime).date()) + + def test_data_sampling_multiple_time(self): + starttime_1 = "2018-11-25 00:00:00" + endtime_1 = "2018-11-29 00:00:00" + starttime_2 = "2019-01-01 00:00:00" + endtime_2 = "2019-04-30 00:00:00" + 
time_list = [{"starttime": starttime_1, "endtime": endtime_1, + "time_column": 'value.time'}, + {"starttime": starttime_2, "endtime": endtime_2}] + data_sampling_config = self.default_config + data_sampling_config['configurations']['data_sampling'] = {} + data_sampling_config['configurations']['data_sampling']['method'] = 'time' + data_sampling_config['configurations'][ + 'data_sampling']['config'] = time_list + output_data = self.get_config_output(data_sampling_config) + self.assertGreaterEqual(output_data['PhoneBatteryChargingDuration'][ + 'date'].min(), pd.Timestamp(starttime_1).date()) + self.assertLessEqual(output_data['PhoneBatteryChargingDuration'][ + 'date'].max(), pd.Timestamp(endtime_2).date()) + self.assertGreaterEqual(output_data['StepCountPerDay'][ + 'date'].min(), pd.Timestamp(starttime_1).date()) + self.assertLessEqual(output_data['StepCountPerDay'][ + 'date'].max(), pd.Timestamp(endtime_2).date()) + # check output_data['PhoneBatteryChargingDuration']['date'] is + # between the time range + # starttime_1 and endtime_1 and starttime_2 and endtime_2 + self.assertTrue(all((output_data['PhoneBatteryChargingDuration']['date'] + >= pd.Timestamp(starttime_1).date()) + & (output_data['PhoneBatteryChargingDuration']['date'] + <= pd.Timestamp(endtime_1).date()) + | (output_data['PhoneBatteryChargingDuration']['date'] + >= pd.Timestamp(starttime_2).date()) + & (output_data['PhoneBatteryChargingDuration']['date'] + <= pd.Timestamp(endtime_2).date()))) + + self.assertTrue(all((output_data['StepCountPerDay']['date'] + >= pd.Timestamp(starttime_1).date()) + & (output_data['StepCountPerDay']['date'] + <= pd.Timestamp(endtime_1).date()) + | (output_data['StepCountPerDay']['date'] + >= pd.Timestamp(starttime_2).date()) + & (output_data['StepCountPerDay']['date'] + <= pd.Timestamp(endtime_2).date()))) + + def test_data_sampling_multiple_time_single_starttime(self): + starttime_1 = "2018-11-25 00:00:00" + endtime_1 = "2018-11-29 00:00:00" + starttime_2 = "2019-01-01 00:00:00" + time_list = [{"starttime": starttime_1, "endtime": endtime_1, + "time_column": 'value.time'}, + {"starttime": starttime_2}] + data_sampling_config = self.default_config + data_sampling_config['configurations']['data_sampling'] = {} + data_sampling_config['configurations']['data_sampling']['method'] = 'time' + data_sampling_config['configurations'][ + 'data_sampling']['config'] = time_list + output_data = self.get_config_output(data_sampling_config) + # check output_data['PhoneBatteryChargingDuration']['date'] is between + # the time range + # starttime_1 and endtime_1 and starttime_2 + self.assertTrue(all((output_data['PhoneBatteryChargingDuration']['date'] + >= pd.Timestamp(starttime_1).date()) + & (output_data['PhoneBatteryChargingDuration']['date'] + <= pd.Timestamp(endtime_1).date()) + | (output_data['PhoneBatteryChargingDuration']['date'] + >= pd.Timestamp(starttime_2).date()))) + + self.assertTrue(all((output_data['StepCountPerDay']['date'] + >= pd.Timestamp(starttime_1).date()) + & (output_data['StepCountPerDay']['date'] + <= pd.Timestamp(endtime_1).date()) + | (output_data['StepCountPerDay']['date'] + >= pd.Timestamp(starttime_2).date()))) + + def test_data_sampling_multiple_time_single_endtime(self): + endtime_1 = "2018-11-29 00:00:00" + starttime_2 = "2019-01-01 00:00:00" + endtime_2 = "2019-04-30 00:00:00" + time_list = [{"endtime": endtime_1, + "time_column": 'value.time'}, + {"starttime": starttime_2, "endtime": endtime_2}] + data_sampling_config = self.default_config + 
data_sampling_config['configurations']['data_sampling'] = {} + data_sampling_config['configurations']['data_sampling']['method'] = 'time' + data_sampling_config['configurations'][ + 'data_sampling']['config'] = time_list + output_data = self.get_config_output(data_sampling_config) + # check output_data['PhoneBatteryChargingDuration']['date'] is between + # the time range + # endtime_1 and starttime_2 and endtime_2 + self.assertTrue(all((output_data['PhoneBatteryChargingDuration']['date'] + <= pd.Timestamp(endtime_1).date()) + | (output_data['PhoneBatteryChargingDuration']['date'] + >= pd.Timestamp(starttime_2).date()) + & (output_data['PhoneBatteryChargingDuration']['date'] + <= pd.Timestamp(endtime_2).date()))) + + self.assertTrue(all((output_data['StepCountPerDay']['date'] + <= pd.Timestamp(endtime_1).date()) + | (output_data['StepCountPerDay']['date'] + >= pd.Timestamp(starttime_2).date()) + & (output_data['StepCountPerDay']['date'] + <= pd.Timestamp(endtime_2).date()))) + def test_data_sampling_count(self): data_sampling_config = self.default_config data_sampling_config['configurations']['data_sampling'] = {}
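
The range checks above chain & and | masks by hand, relying on & binding more tightly than | in Python. A small illustrative helper along these lines (not part of the test suite, assuming pandas as imported in these tests) expresses the same "date falls inside at least one configured range" condition and also copes with open-ended ranges:

    import pandas as pd


    def in_any_range(dates: pd.Series, ranges) -> pd.Series:
        """Boolean mask: True where a date lies inside at least one (start, end)
        range; either bound may be None for an open-ended range."""
        mask = pd.Series(False, index=dates.index)
        for start, end in ranges:
            piece = pd.Series(True, index=dates.index)
            if start is not None:
                piece &= dates >= pd.Timestamp(start).date()
            if end is not None:
                piece &= dates <= pd.Timestamp(end).date()
            mask |= piece
        return mask


    # e.g. self.assertTrue(in_any_range(output_data['StepCountPerDay']['date'],
    #                                   [(starttime_1, endtime_1),
    #                                    (starttime_2, endtime_2)]).all())
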