class ResampleNProcessor(InstProcessor):
    """Instrument processor that resamples one instrument's frame to ``target_frq``.

    The frame is resampled with ``DataFrame.resample(target_frq).last()`` and the
    result is reindexed onto the index returned by ``resam_calendar``.
    """

    def __init__(self, target_frq: str, **kwargs):
        """Remember the target frequency.

        Parameters
        ----------
        target_frq : str
            Frequency string passed to both ``resam_calendar`` and
            ``DataFrame.resample`` (e.g. ``"1d"``).
        **kwargs
            Accepted for configuration compatibility and ignored.
        """
        self.target_frq = target_frq

    def __call__(self, df: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
        """Return ``df`` resampled to ``self.target_frq``.

        Parameters
        ----------
        df : pd.DataFrame
            Frame whose index can be converted with ``pd.to_datetime``.
            NOTE(review): the raw frequency is hard-coded to ``"1min"`` below,
            so the input is presumably 1-minute data -- confirm against callers.

        Returns
        -------
        pd.DataFrame
            The last observation of every ``target_frq`` bucket, reindexed onto
            the result of ``resam_calendar``.
        """
        # Work on a shallow copy so that replacing the index does not mutate the
        # caller's frame; the underlying data is shared, not duplicated.
        df = df.copy(deep=False)
        df.index = pd.to_datetime(df.index)
        # presumably the calendar of `df.index` expressed at `target_frq`;
        # verify against qlib.utils.resam.resam_calendar
        res_index = resam_calendar(df.index, "1min", self.target_frq)
        return df.resample(self.target_frq).last().reindex(res_index)
class Avg15minLoader(QlibDataLoader):
    """Data loader that exposes every ``feature*`` column group as ``feature``."""

    def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame:
        """Load the data and merge the per-frequency feature groups.

        The arguments are forwarded unchanged to ``QlibDataLoader.load``.

        Returns
        -------
        pd.DataFrame
            The loaded frame; when ``self.is_group`` is true, every column whose
            first level starts with ``"feature"`` is renamed so that its first
            level is exactly ``"feature"``.
        """
        df = super().load(instruments, start_time, end_time)
        if self.is_group:
            # feature_day (day freq) and feature_15min (1min freq, averaged every
            # 15 minutes) are both exposed under the single group "feature".
            df.columns = df.columns.map(
                lambda col: ("feature", col[1]) if col[0].startswith("feature") else col
            )
        return df


class Avg15minHandler(DataHandlerLP):
    """Handler combining daily fields with 15-minute averages of 1-minute fields."""

    def __init__(
        self,
        instruments="csi500",
        start_time=None,
        end_time=None,
        freq="day",
        infer_processors=None,
        learn_processors=_DEFAULT_LEARN_PROCESSORS,
        fit_start_time=None,
        fit_end_time=None,
        process_type=DataHandlerLP.PTYPE_A,
        filter_pipe=None,
        inst_processor=None,
        **kwargs,
    ):
        """Build the ``Avg15minLoader`` and initialise the underlying handler.

        ``infer_processors`` defaults to ``None``, which is treated as an empty
        list; a literal ``[]`` default would be one list object shared by every
        call. ``fit_start_time`` / ``fit_end_time`` are passed to
        ``check_transform_proc`` together with each processor list. ``freq``,
        ``filter_pipe`` and ``inst_processor`` are handed to the loader.

        NOTE(review): ``**kwargs`` is accepted but not forwarded to
        ``DataHandlerLP.__init__`` -- confirm that this is intended.
        """
        if infer_processors is None:
            infer_processors = []
        infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
        learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
        data_loader = Avg15minLoader(
            config=self.loader_config(), filter_pipe=filter_pipe, freq=freq, inst_processor=inst_processor
        )
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            data_loader=data_loader,
            infer_processors=infer_processors,
            learn_processors=learn_processors,
            process_type=process_type,
        )

    def loader_config(self):
        """Return the loader config: ``feature_day``, ``feature_15min`` and ``label``.

        Returns
        -------
        dict
            ``{"feature_day": (fields, names), "feature_15min": (fields, names),
            "label": (fields, names)}``.
        """
        # Results for dataset: df: pd.DataFrame
        #   len(df.columns) == 6 + 6 * 16 feature columns (plus the label),
        #   len(df.index.get_level_values(level="datetime").unique()) == T
        #   df.columns: close0, close1, ..., close16, open0, ..., open16, ..., vwap16
        #       freq == day:
        #           close0, open0, low0, high0, volume0, vwap0
        #       freq == 1min:
        #           close1, ..., close16, ..., vwap1, ..., vwap16
        #   df.index.names == ["datetime", "instrument"]: pd.MultiIndex
        # Example:
        #                           feature                                      ...              label
        #                            close0      open0       low0  ...    vwap1   vwap16     LABEL0
        #   datetime   instrument                                  ...
        #   2020-10-09 SH600000   11.794546  11.819587  11.769505  ...      NaN      NaN  -0.005214
        #   2020-10-15 SH600000   12.044961  11.944795  11.932274  ...      NaN      NaN  -0.007202
        #   ...                         ...        ...        ...  ...      ...      ...        ...
        #   2021-05-28 SZ300676    6.369684   6.495406   6.306568  ...      NaN      NaN  -0.001321
        #   2021-05-31 SZ300676    6.601626   6.465643   6.465130  ...      NaN      NaN  -0.023428

        # features day: len(columns) == 6, freq = day
        # $close is the closing price of the current trading day; to get the close
        # of the trading day N days earlier use Ref($close, N), for example:
        #                           $close  Ref($close, 1)  Ref($close, 2)  Ref($close, 3)  Ref($close, 4)
        #   instrument datetime
        #   SH600519   2021-06-01  244.271530
        #              2021-06-02  242.205917  244.271530
        #              2021-06-03  242.229889  242.205917  244.271530
        #              2021-06-04  245.421524  242.229889  242.205917  244.271530
        #              2021-06-07  247.547089  245.421524  242.229889  242.205917  244.271530
        # WARNING: Ref($close, N), if N == 0, Ref($close, N) ==> $close
        day_fields = ["$close", "$open", "$low", "$high", "$volume", "$vwap"]
        # Only the leading "$" marker is removed; names: close0, open0, ..., vwap0
        base_names = [field.lstrip("$") for field in day_fields]
        day_names = [f"{base}0" for base in base_names]

        config = {"feature_day": (day_fields, day_names)}

        # features 15min: len(columns) == 6 * 16, freq = 1min
        # Expression description, taking close as an example:
        #   Mean($close, 15)          ==> df["$close"].rolling(15, min_periods=1).mean()
        #   Ref(Mean($close, 15), 15) ==> df["$close"].rolling(15, min_periods=1).mean().shift(15)
        # WARNING: Ref(Mean($close, 15), N), if N == 0 ==> Mean($close, 15)
        #
        # Seen from the last minute of a trading day (240 one-minute bars per day),
        # the average of the i-th 15-minute period (1 <= i <= 240 // 15) of the day
        # T days back (T == 1 is the current day) is
        #   Ref(Mean($close, 15), T * 240 - i * 15)
        # for example, with i == 1 and T == 1, 2, 3:
        #                         Ref(Mean($close, 15), 225)  Ref(Mean($close, 15), 465)  Ref(Mean($close, 15), 705)
        #   instrument datetime
        #   SH600519   2021-05-31                  241.769897                  243.077942                  244.712997
        #              2021-06-01                  244.271530                  241.769897                  243.077942
        #              2021-06-02                  242.205917                  244.271530                  241.769897
        #
        # Results of the current script (T == 1):
        #   first 15 minutes, i == 1:  Ref(Mean($close, 15), 240 - 1 * 15) ==> Ref(Mean($close, 15), 225) -> close1
        #   last 15 minutes,  i == 16: Ref(Mean($close, 15), 240 - 16 * 15) ==> Mean($close, 15)          -> close16
        #   fields: Ref(Mean($close, 15), 225), ..., Mean($close, 15)
        #   names:  close1, ..., close16
        # NOTE(review): the original comment gave the periods as 09:00 --> 09:14, ...,
        # 14:45 --> 14:59; confirm the session start time against the 1min data.
        #
        # NOTE: only the last row of each trading day is meaningful, so the 1min
        # result is resampled to day with df.resample("1d").last().
        minutes_per_day = 240
        window = 15
        bars_per_day = minutes_per_day // window  # 16

        intraday_fields = []
        intraday_names = []
        for field, base in zip(day_fields, base_names):
            # A lag of j windows back from the close is the (bars_per_day - j)-th period.
            for j in range(1, bars_per_day):
                intraday_fields.append(f"Ref(Mean({field}, {window}), {j * window})")
                intraday_names.append(f"{base}{bars_per_day - j}")
            # Lag 0: the last period of the day.
            intraday_fields.append(f"Mean({field}, {window})")
            intraday_names.append(f"{base}{bars_per_day}")
        config["feature_15min"] = (intraday_fields, intraday_names)

        # label
        config["label"] = (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"])
        return config
&benchmark SH000300 data_handler_config: &data_handler_config @@ -16,15 +16,15 @@ data_handler_config: &data_handler_config instruments: *market freq: label: day - feature: 1min + feature_15min: 1min + feature_day: day # with label as reference inst_processor: - feature: - - class: Resample1minProcessor - module_path: features_sample.py + feature_15min: + - class: ResampleNProcessor + module_path: features_resample_N.py kwargs: - hour: 14 - minute: 56 + target_frq: 1d port_analysis_config: &port_analysis_config strategy: @@ -62,25 +62,25 @@ task: module_path: qlib.data.dataset kwargs: handler: - class: Alpha158 - module_path: qlib.contrib.data.handler + class: Avg15minHandler + module_path: multi_freq_handler.py kwargs: *data_handler_config segments: train: [2008-01-01, 2014-12-31] valid: [2015-01-01, 2016-12-31] test: [2017-01-01, 2020-08-01] - record: + record: - class: SignalRecord - module_path: qlib.workflow.record_temp - kwargs: + module_path: qlib.workflow.record_temp + kwargs: model: dataset: - class: SigAnaRecord - module_path: qlib.workflow.record_temp - kwargs: + module_path: qlib.workflow.record_temp + kwargs: ana_long_short: False ann_scaler: 252 - class: PortAnaRecord - module_path: qlib.workflow.record_temp - kwargs: - config: *port_analysis_config \ No newline at end of file + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/qlib/data/data.py b/qlib/data/data.py index 8bb9cb89df..7fbc48f715 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -549,7 +549,7 @@ def dataset_processor(instruments_d, column_names, start_time, end_time, freq, i inst_l.append(inst) task_l.append( delayed(DatasetProvider.expression_calculator)( - inst, start_time, end_time, freq, normalize_column_names, spans, C + inst, start_time, end_time, freq, normalize_column_names, spans, C, inst_processors ) ) diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index 16e02b9756..605b70cf9c 100644 --- 
a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -88,7 +88,7 @@ def load(self, name): obj = self.recorder.load_object(name) return obj - def list(): + def list(self): """ List the supported artifacts.