Modify multi-freq example (#626)

* modify the example of multi-freq * add Copyright * add a comment to average_ops.py * modify the example of multi-freq * add comment to multi_freq_handler.py * add the Ref expression description to multi_freq_handler.py * add expression description to multi_freq_handler.py
microsoft · Sep 30, 2021 · 79dfffc · 79dfffc
1 parent dd917fe
commit 79dfffc
Show file tree

Hide file tree

Showing 6 changed files with 172 additions and 35 deletions.
diff --git a/examples/benchmarks/LightGBM/features_resample_N.py b/examples/benchmarks/LightGBM/features_resample_N.py
@@ -0,0 +1,18 @@
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT License.
+
+import pandas as pd
+
+from qlib.data.inst_processor import InstProcessor
+from qlib.utils.resam import resam_calendar
+
+
+class ResampleNProcessor(InstProcessor):
+    """Instrument processor that resamples a frame to ``target_frq``.
+
+    The input is treated as 1min data: the last row of every ``target_frq``
+    bucket is kept and the result is re-indexed onto the calendar returned by
+    ``resam_calendar``.
+    """
+
+    def __init__(self, target_frq: str, **kwargs):
+        # Extra keyword arguments are accepted but not used.
+        self.target_frq = target_frq
+
+    def __call__(self, df: pd.DataFrame, *args, **kwargs):
+        """Return ``df`` resampled to ``self.target_frq`` (last row per bucket).
+
+        Note that the index of the passed-in frame is converted to datetimes
+        in place before resampling.
+        """
+        df.index = pd.to_datetime(df.index)
+        target_calendar = resam_calendar(df.index, "1min", self.target_frq)
+        last_per_bucket = df.resample(self.target_frq).last()
+        return last_per_bucket.reindex(target_calendar)
diff --git a/examples/benchmarks/LightGBM/features_sample.py b/examples/benchmarks/LightGBM/features_sample.py
diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py
@@ -0,0 +1,135 @@
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT License.
+
+import pandas as pd
+
+from qlib.data.dataset.loader import QlibDataLoader
+from qlib.contrib.data.handler import DataHandlerLP, _DEFAULT_LEARN_PROCESSORS, check_transform_proc
+
+
+class Avg15minLoader(QlibDataLoader):
+    """Data loader that merges every ``feature*`` column group into one ``feature`` group."""
+
+    def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame:
+        """Load data through the parent loader and unify the feature group names.
+
+        When ``self.is_group`` is true, every column whose first level starts
+        with ``"feature"`` (e.g. ``feature_day`` and ``feature_15min``) gets
+        that level renamed to ``"feature"``; other columns are left untouched.
+        """
+        df = super().load(instruments, start_time, end_time)
+        if not self.is_group:
+            return df
+
+        def _unify_group(col):
+            # col is a (group, name) column key
+            if col[0].startswith("feature"):
+                return ("feature", col[1])
+            return col
+
+        df.columns = df.columns.map(_unify_group)
+        return df
+
+
+class Avg15minHandler(DataHandlerLP):
+    """Data handler combining daily features with 15-minute averages built from 1min data.
+
+    ``loader_config`` defines three column groups, ``feature_day``,
+    ``feature_15min`` and ``label``, which are loaded through ``Avg15minLoader``.
+    """
+
+    def __init__(
+        self,
+        instruments="csi500",
+        start_time=None,
+        end_time=None,
+        freq="day",
+        infer_processors=None,
+        learn_processors=_DEFAULT_LEARN_PROCESSORS,
+        fit_start_time=None,
+        fit_end_time=None,
+        process_type=DataHandlerLP.PTYPE_A,
+        filter_pipe=None,
+        inst_processor=None,
+        **kwargs,
+    ):
+        """Build the loader and initialise the underlying ``DataHandlerLP``.
+
+        Parameters
+        ----------
+        instruments, start_time, end_time, process_type
+            Forwarded unchanged to ``DataHandlerLP.__init__``.
+        freq, filter_pipe, inst_processor
+            Forwarded unchanged to ``Avg15minLoader``.
+        infer_processors, learn_processors
+            Processor configurations; both are passed through
+            ``check_transform_proc`` together with ``fit_start_time`` and
+            ``fit_end_time``.  ``infer_processors=None`` means "no processors".
+        **kwargs
+            Accepted but not used.
+        """
+        # Avoid a mutable default argument shared between calls.
+        if infer_processors is None:
+            infer_processors = []
+        infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
+        learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
+        data_loader = Avg15minLoader(
+            config=self.loader_config(), filter_pipe=filter_pipe, freq=freq, inst_processor=inst_processor
+        )
+        super().__init__(
+            instruments=instruments,
+            start_time=start_time,
+            end_time=end_time,
+            data_loader=data_loader,
+            infer_processors=infer_processors,
+            learn_processors=learn_processors,
+            process_type=process_type,
+        )
+
+    def loader_config(self):
+        """Return the loader config: ``{group: (expressions, column_names)}``.
+
+        Groups are ``feature_day`` (6 columns), ``feature_15min`` (6 * 16 columns)
+        and ``label`` (1 column).
+        """
+
+        # Results for dataset: df: pd.DataFrame
+        #   len(df.columns) == 6 + 6 * 16, len(df.index.get_level_values(level="datetime").unique()) == T
+        #   df.columns: close0, close1, ..., close16, open0, ..., open16, ..., vwap16
+        #       freq == day:
+        #           close0, open0, low0, high0, volume0, vwap0
+        #       freq == 1min:
+        #           close1, ..., close16, ..., vwap1, ..., vwap16
+        #   df.index.name == ["datetime", "instrument"]: pd.MultiIndex
+        # Example:
+        #                          feature                        ...                  label
+        #                           close0      open0       low0  ... vwap1 vwap16    LABEL0
+        # datetime   instrument                                   ...
+        # 2020-10-09 SH600000    11.794546  11.819587  11.769505  ...   NaN    NaN -0.005214
+        # 2020-10-15 SH600000    12.044961  11.944795  11.932274  ...   NaN    NaN -0.007202
+        # ...                          ...        ...        ...  ...   ...    ...       ...
+        # 2021-05-28 SZ300676     6.369684   6.495406   6.306568  ...   NaN    NaN -0.001321
+        # 2021-05-31 SZ300676     6.601626   6.465643   6.465130  ...   NaN    NaN -0.023428
+
+        # features day: len(columns) == 6, freq = day
+        # $close is the closing price of the current trading day:
+        #   to get the `close` of the T-th most recent trading day (T == 1 is the current day),
+        #   use Ref($close, T-1), for example:
+        #                                    $close  Ref($close, 1)  Ref($close, 2)  Ref($close, 3)  Ref($close, 4)
+        #         instrument datetime
+        #         SH600519   2021-06-01  244.271530
+        #                    2021-06-02  242.205917      244.271530
+        #                    2021-06-03  242.229889      242.205917      244.271530
+        #                    2021-06-04  245.421524      242.229889      242.205917      244.271530
+        #                    2021-06-07  247.547089      245.421524      242.229889      242.205917      244.271530
+
+        # WARNING: Ref($close, N), if N == 0, Ref($close, N) ==> $close
+
+        fields = ["$close", "$open", "$low", "$high", "$volume", "$vwap"]
+        # names: close0, open0, ..., vwap0
+        names = [field.strip("$") + "0" for field in fields]
+
+        config = {"feature_day": (fields, names)}
+
+        # features 15min: len(columns) == 6 * 16, freq = 1min
+        #   The expressions are evaluated on 1min data (240 rows per trading day):
+        #       to get the average `close` of the i-th 15min of the T-th most recent trading day
+        #       (T == 1 is the current day), use `Ref(Mean($close, 15), T * 240 - i * 15)`, for example (i == 1):
+        #                                    Ref(Mean($close, 15), 225)  Ref(Mean($close, 15), 465)  Ref(Mean($close, 15), 705)
+        #             instrument datetime
+        #             SH600519   2021-05-31                  241.769897                  243.077942                  244.712997
+        #                        2021-06-01                  244.271530                  241.769897                  243.077942
+        #                        2021-06-02                  242.205917                  244.271530                  241.769897
+
+        # WARNING: Ref(Mean($close, 15), N), if N == 0, Ref(Mean($close, 15), N) ==> Mean($close, 15)
+
+        # Results of the current script:
+        #   time:   09:00 --> 09:14,            ..., 14:45 --> 14:59
+        #   fields: Ref(Mean($close, 15), 225), ..., Mean($close, 15)
+        #   name:   close1,                     ..., close16
+        #
+
+        # Expression description: take close as an example
+        #   Mean($close, 15) ==> df["$close"].rolling(15, min_periods=1).mean()
+        #   Ref(Mean($close, 15), 15) ==> df["$close"].rolling(15, min_periods=1).mean().shift(15)
+
+        #   NOTE: the expressions are meant to be read at the last 1min row of each trading day
+        #         (the row kept by `df.resample("1d").last()`); at that row
+        #         Ref(Mean($close, 15), 240 - i * 15) is the average of the i-th 15 minutes of the day
+
+        # Average:
+        #   Average of the i-th 15-minute period of each trading day: 1 <= i <= 240 // 15
+        #       Avg(15minutes): Ref(Mean($close, 15), 240 - i * 15)
+        #
+        #   Average of the first 15 minutes of each trading day; i = 1
+        #       Avg(09:00 --> 09:14), df.index.loc["09:14"]: Ref(Mean($close, 15), 240- 1 * 15) ==> Ref(Mean($close, 15), 225)
+        #   Average of the last 15 minutes of each trading day; i = 16
+        #       Avg(14:45 --> 14:59), df.index.loc["14:59"]: Ref(Mean($close, 15), 240 - 16 * 15) ==> Ref(Mean($close, 15), 0) ==> Mean($close, 15)
+
+        # 15min resample to day
+        #   df.resample("1d").last()
+        minutes_per_day = 240
+        window = 15
+        n_windows = minutes_per_day // window  # 16 windows of 15 minutes per day
+
+        tmp_fields = []
+        tmp_names = []
+        for field in fields:
+            base = field.strip("$")
+            # A shift of j windows from the end of the day addresses window (n_windows - j).
+            for j in range(1, n_windows):
+                tmp_fields.append(f"Ref(Mean({field}, {window}), {j * window})")
+                tmp_names.append(f"{base}{n_windows - j}")
+            # The last window of the day needs no shift.
+            tmp_fields.append(f"Mean({field}, {window})")
+            tmp_names.append(f"{base}{n_windows}")
+        config["feature_15min"] = (tmp_fields, tmp_names)
+        # label
+        config["label"] = (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"])
+        return config
diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml
@@ -4,7 +4,7 @@ qlib_init:
         1min: "~/.qlib/qlib_data/cn_data_1min"
     region: cn
     dataset_cache: null
-    maxtasksperchild: 1
+    maxtasksperchild: null
 market: &market csi300
 benchmark: &benchmark SH000300
 data_handler_config: &data_handler_config
@@ -16,15 +16,15 @@ data_handler_config: &data_handler_config
     instruments: *market
     freq:
         label: day
-        feature: 1min
+        feature_15min: 1min
+        feature_day: day
     # with label as reference
     inst_processor:
-        feature:
-            - class: Resample1minProcessor
-              module_path: features_sample.py
+        feature_15min:
+            - class: ResampleNProcessor
+              module_path: features_resample_N.py
               kwargs:
-                  hour: 14
-                  minute: 56
+                  target_frq: 1d
 
 port_analysis_config: &port_analysis_config
     strategy:
@@ -62,25 +62,25 @@ task:
         module_path: qlib.data.dataset
         kwargs:
             handler:
-                class: Alpha158
-                module_path: qlib.contrib.data.handler
+                class: Avg15minHandler
+                module_path: multi_freq_handler.py
                 kwargs: *data_handler_config
             segments:
                 train: [2008-01-01, 2014-12-31]
                 valid: [2015-01-01, 2016-12-31]
                 test: [2017-01-01, 2020-08-01]
-    record: 
+    record:
         - class: SignalRecord
-        module_path: qlib.workflow.record_temp
-        kwargs: 
+          module_path: qlib.workflow.record_temp
+          kwargs:
             model: <MODEL>
             dataset: <DATASET>
         - class: SigAnaRecord
-        module_path: qlib.workflow.record_temp
-        kwargs: 
+          module_path: qlib.workflow.record_temp
+          kwargs:
             ana_long_short: False
             ann_scaler: 252
         - class: PortAnaRecord
-        module_path: qlib.workflow.record_temp
-        kwargs: 
-            config: *port_analysis_config
+          module_path: qlib.workflow.record_temp
+          kwargs:
+            config: *port_analysis_config
diff --git a/qlib/data/data.py b/qlib/data/data.py
@@ -549,7 +549,7 @@ def dataset_processor(instruments_d, column_names, start_time, end_time, freq, i
             inst_l.append(inst)
             task_l.append(
                 delayed(DatasetProvider.expression_calculator)(
-                    inst, start_time, end_time, freq, normalize_column_names, spans, C
+                    inst, start_time, end_time, freq, normalize_column_names, spans, C, inst_processors
                 )
             )
 

diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py
@@ -88,7 +88,7 @@ def load(self, name):
         obj = self.recorder.load_object(name)
         return obj
 
-    def list():
+    def list(self):
         """
         List the supported artifacts.