Skip to content
View adamnovotnycom's full-sized avatar

Highlights

  • Pro

Block or report adamnovotnycom

Block user

Prevent this user from interacting with your repositories and sending you notifications. Learn more about blocking users.

You must be logged in to block users.

Please don't include any personal information such as legal names or email addresses. Maximum 100 characters, markdown supported. This note will be visible to only you.
Report abuse

Contact GitHub support about this user’s behavior. Learn more about reporting abuse.

Report abuse

Pinned Loading

  1. machine-learning-docker-template machine-learning-docker-template Public

    Jupyter Notebook 2 1

  2. sklearn_pipe.ipynb sklearn_pipe.ipynb
    1
    {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"sklearn_pipe.ipynb","provenance":[],"collapsed_sections":["Px8yvYANQkV_"],"toc_visible":true,"mount_file_id":"1dzgBGllszE7-0j9cu-WI_i8HPvlDhAhr","authorship_tag":"ABX9TyMnY1Al295O8NuFyE9rdMde"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"jJShBSUIWBHY"},"source":["# Sklearn pipeline"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0L2bqfaHOMBn","executionInfo":{"status":"ok","timestamp":1631992634080,"user_tz":240,"elapsed":1089,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"1ed1ced4-9240-4572-e99a-5c8cbc8236d3"},"source":["import datetime\n","import multiprocessing\n","import pandas as pd\n","from sklearn.base import BaseEstimator, TransformerMixin\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.impute import SimpleImputer\n","from sklearn.linear_model import LogisticRegression\n","from sklearn import metrics\n","from sklearn.pipeline import FeatureUnion, Pipeline \n","from sklearn.preprocessing import StandardScaler, OneHotEncoder\n","from sklearn.model_selection import GridSearchCV, TimeSeriesSplit\n","import sys\n","print(sys.version)\n","print(pd.__version__)\n","print(multiprocessing.cpu_count())"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["3.7.12 (default, Sep 10 2021, 00:21:48) \n","[GCC 7.5.0]\n","1.1.5\n","2\n"]}]},{"cell_type":"markdown","metadata":{"id":"Px8yvYANQkV_"},"source":["## Load raw stock price data"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":203},"id":"tX-hbcJ1Ou_5","executionInfo":{"status":"ok","timestamp":1631992634679,"user_tz":240,"elapsed":460,"user":{"displayName":"Adam 
Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"8dcb1e31-ec86-4003-80c6-b7969b773d26"},"source":["df = pd.read_csv(\n","    \"/content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/SPY_yahoo_finance.csv\",\n","    header=0\n",")\n","df.columns = [x.lower().replace(\" \", \"_\") for x in df.columns]\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>date</th>\n","      <th>open</th>\n","      <th>high</th>\n","      <th>low</th>\n","      <th>close</th>\n","      <th>adj_close</th>\n","      <th>volume</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1993-01-29</td>\n","      <td>43.96875</td>\n","      <td>43.96875</td>\n","      <td>43.75000</td>\n","      <td>43.93750</td>\n","      <td>25.799770</td>\n","      <td>1003200</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>1993-02-01</td>\n","      <td>43.96875</td>\n","      <td>44.25000</td>\n","      <td>43.96875</td>\n","      <td>44.25000</td>\n","      <td>25.983273</td>\n","      <td>480500</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1993-02-02</td>\n","      <td>44.21875</td>\n","      <td>44.37500</td>\n","      <td>44.12500</td>\n","      <td>44.34375</td>\n","      <td>26.038315</td>\n","      <td>201300</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>1993-02-03</td>\n","      <td>44.40625</td>\n","      
<td>44.84375</td>\n","      <td>44.37500</td>\n","      <td>44.81250</td>\n","      <td>26.313566</td>\n","      <td>529400</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>1993-02-04</td>\n","      <td>44.96875</td>\n","      <td>45.09375</td>\n","      <td>44.46875</td>\n","      <td>45.00000</td>\n","      <td>26.423655</td>\n","      <td>531500</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["         date      open      high       low     close  adj_close   volume\n","0  1993-01-29  43.96875  43.96875  43.75000  43.93750  25.799770  1003200\n","1  1993-02-01  43.96875  44.25000  43.96875  44.25000  25.983273   480500\n","2  1993-02-02  44.21875  44.37500  44.12500  44.34375  26.038315   201300\n","3  1993-02-03  44.40625  44.84375  44.37500  44.81250  26.313566   529400\n","4  1993-02-04  44.96875  45.09375  44.46875  45.00000  26.423655   531500"]},"metadata":{},"execution_count":2}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":388},"id":"u7xYwOCcWZFE","executionInfo":{"status":"ok","timestamp":1631992634682,"user_tz":240,"elapsed":6,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"ade1ef11-952e-44db-ecca-f156361a9ac3"},"source":["df.describe(include=\"all\")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>date</th>\n","      <th>open</th>\n","      <th>high</th>\n","      
<th>low</th>\n","      <th>close</th>\n","      <th>adj_close</th>\n","      <th>volume</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>count</th>\n","      <td>7193</td>\n","      <td>7193.000000</td>\n","      <td>7193.000000</td>\n","      <td>7193.000000</td>\n","      <td>7193.000000</td>\n","      <td>7193.000000</td>\n","      <td>7.193000e+03</td>\n","    </tr>\n","    <tr>\n","      <th>unique</th>\n","      <td>7193</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>top</th>\n","      <td>2000-10-09</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>freq</th>\n","      <td>1</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>mean</th>\n","      <td>NaN</td>\n","      <td>149.569786</td>\n","      <td>150.446365</td>\n","      <td>148.596184</td>\n","      <td>149.573008</td>\n","      <td>124.130425</td>\n","      <td>8.432958e+07</td>\n","    </tr>\n","    <tr>\n","      <th>std</th>\n","      <td>NaN</td>\n","      <td>80.710651</td>\n","      <td>81.049916</td>\n","      <td>80.339373</td>\n","      <td>80.732359</td>\n","      <td>86.543832</td>\n","      <td>9.571367e+07</td>\n","    </tr>\n","    <tr>\n","      <th>min</th>\n","      <td>NaN</td>\n","      <td>43.343750</td>\n","      <td>43.531250</td>\n","      <td>42.812500</td>\n","      <td>43.406250</td>\n","      <td>25.487831</td>\n","      <td>5.200000e+03</td>\n","    </tr>\n","    <tr>\n","      <th>25%</th>\n","      <td>NaN</td>\n","      <td>100.739998</td>\n","      <td>101.593750</td>\n","      <td>99.790001</td>\n","      <td>100.699997</td>\n","      
<td>71.142609</td>\n","      <td>8.162800e+06</td>\n","    </tr>\n","    <tr>\n","      <th>50%</th>\n","      <td>NaN</td>\n","      <td>128.125000</td>\n","      <td>128.860001</td>\n","      <td>127.269997</td>\n","      <td>128.187500</td>\n","      <td>93.903046</td>\n","      <td>5.864900e+07</td>\n","    </tr>\n","    <tr>\n","      <th>75%</th>\n","      <td>NaN</td>\n","      <td>190.369995</td>\n","      <td>191.820007</td>\n","      <td>188.789993</td>\n","      <td>190.300003</td>\n","      <td>168.407654</td>\n","      <td>1.195754e+08</td>\n","    </tr>\n","    <tr>\n","      <th>max</th>\n","      <td>NaN</td>\n","      <td>445.589996</td>\n","      <td>447.109985</td>\n","      <td>445.070007</td>\n","      <td>446.970001</td>\n","      <td>446.970001</td>\n","      <td>8.710263e+08</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["              date         open  ...    adj_close        volume\n","count         7193  7193.000000  ...  7193.000000  7.193000e+03\n","unique        7193          NaN  ...          NaN           NaN\n","top     2000-10-09          NaN  ...          NaN           NaN\n","freq             1          NaN  ...          NaN           NaN\n","mean           NaN   149.569786  ...   124.130425  8.432958e+07\n","std            NaN    80.710651  ...    86.543832  9.571367e+07\n","min            NaN    43.343750  ...    25.487831  5.200000e+03\n","25%            NaN   100.739998  ...    71.142609  8.162800e+06\n","50%            NaN   128.125000  ...    93.903046  5.864900e+07\n","75%            NaN   190.369995  ...   168.407654  1.195754e+08\n","max            NaN   445.589996  ...   
446.970001  8.710263e+08\n","\n","[11 rows x 7 columns]"]},"metadata":{},"execution_count":3}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"x4Bjkda4Xzj2","executionInfo":{"status":"ok","timestamp":1631992634683,"user_tz":240,"elapsed":6,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"f9965006-be46-4772-c355-8674a21ce72e"},"source":["df.dtypes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["date          object\n","open         float64\n","high         float64\n","low          float64\n","close        float64\n","adj_close    float64\n","volume         int64\n","dtype: object"]},"metadata":{},"execution_count":4}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":554},"id":"dd6TIeOsX5hm","executionInfo":{"status":"ok","timestamp":1631992634822,"user_tz":240,"elapsed":144,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"e61c86cd-9581-424a-8094-0d0ab3fec22f"},"source":["df[\"date\"] = pd.to_datetime(df[\"date\"])\n","df = df.sort_values(by=\"date\", ascending=True)\n","df.describe(include=\"all\")"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. 
Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n","  This is separate from the ipykernel package so we can avoid doing imports until\n"]},{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>date</th>\n","      <th>open</th>\n","      <th>high</th>\n","      <th>low</th>\n","      <th>close</th>\n","      <th>adj_close</th>\n","      <th>volume</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>count</th>\n","      <td>7193</td>\n","      <td>7193.000000</td>\n","      <td>7193.000000</td>\n","      <td>7193.000000</td>\n","      <td>7193.000000</td>\n","      <td>7193.000000</td>\n","      <td>7.193000e+03</td>\n","    </tr>\n","    <tr>\n","      <th>unique</th>\n","      <td>7193</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>top</th>\n","      <td>2007-09-10 00:00:00</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>freq</th>\n","      <td>1</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>first</th>\n","      <td>1993-01-29 00:00:00</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n"," 
   </tr>\n","    <tr>\n","      <th>last</th>\n","      <td>2021-08-20 00:00:00</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>mean</th>\n","      <td>NaN</td>\n","      <td>149.569786</td>\n","      <td>150.446365</td>\n","      <td>148.596184</td>\n","      <td>149.573008</td>\n","      <td>124.130425</td>\n","      <td>8.432958e+07</td>\n","    </tr>\n","    <tr>\n","      <th>std</th>\n","      <td>NaN</td>\n","      <td>80.710651</td>\n","      <td>81.049916</td>\n","      <td>80.339373</td>\n","      <td>80.732359</td>\n","      <td>86.543832</td>\n","      <td>9.571367e+07</td>\n","    </tr>\n","    <tr>\n","      <th>min</th>\n","      <td>NaN</td>\n","      <td>43.343750</td>\n","      <td>43.531250</td>\n","      <td>42.812500</td>\n","      <td>43.406250</td>\n","      <td>25.487831</td>\n","      <td>5.200000e+03</td>\n","    </tr>\n","    <tr>\n","      <th>25%</th>\n","      <td>NaN</td>\n","      <td>100.739998</td>\n","      <td>101.593750</td>\n","      <td>99.790001</td>\n","      <td>100.699997</td>\n","      <td>71.142609</td>\n","      <td>8.162800e+06</td>\n","    </tr>\n","    <tr>\n","      <th>50%</th>\n","      <td>NaN</td>\n","      <td>128.125000</td>\n","      <td>128.860001</td>\n","      <td>127.269997</td>\n","      <td>128.187500</td>\n","      <td>93.903046</td>\n","      <td>5.864900e+07</td>\n","    </tr>\n","    <tr>\n","      <th>75%</th>\n","      <td>NaN</td>\n","      <td>190.369995</td>\n","      <td>191.820007</td>\n","      <td>188.789993</td>\n","      <td>190.300003</td>\n","      <td>168.407654</td>\n","      <td>1.195754e+08</td>\n","    </tr>\n","    <tr>\n","      <th>max</th>\n","      <td>NaN</td>\n","      <td>445.589996</td>\n","      <td>447.109985</td>\n","      <td>445.070007</td>\n","      <td>446.970001</td>\n","      <td>446.970001</td>\n","      
<td>8.710263e+08</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                       date         open  ...    adj_close        volume\n","count                  7193  7193.000000  ...  7193.000000  7.193000e+03\n","unique                 7193          NaN  ...          NaN           NaN\n","top     2007-09-10 00:00:00          NaN  ...          NaN           NaN\n","freq                      1          NaN  ...          NaN           NaN\n","first   1993-01-29 00:00:00          NaN  ...          NaN           NaN\n","last    2021-08-20 00:00:00          NaN  ...          NaN           NaN\n","mean                    NaN   149.569786  ...   124.130425  8.432958e+07\n","std                     NaN    80.710651  ...    86.543832  9.571367e+07\n","min                     NaN    43.343750  ...    25.487831  5.200000e+03\n","25%                     NaN   100.739998  ...    71.142609  8.162800e+06\n","50%                     NaN   128.125000  ...    93.903046  5.864900e+07\n","75%                     NaN   190.369995  ...   168.407654  1.195754e+08\n","max                     NaN   445.589996  ...   
446.970001  8.710263e+08\n","\n","[13 rows x 7 columns]"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vcaCtBgfx7ml","executionInfo":{"status":"ok","timestamp":1631992634823,"user_tz":240,"elapsed":10,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c00840d0-5c8d-4e30-d165-60e4642c654d"},"source":["df.dtypes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["date         datetime64[ns]\n","open                float64\n","high                float64\n","low                 float64\n","close               float64\n","adj_close           float64\n","volume                int64\n","dtype: object"]},"metadata":{},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"64qZxzKqQqMT"},"source":["## Illustrative feature engineering"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":203},"id":"TLou4u3dQpJp","executionInfo":{"status":"ok","timestamp":1631992634823,"user_tz":240,"elapsed":10,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"3ceada9d-a4e9-480e-e1e7-94641c27147b"},"source":["df[\"open_close_delta\"] = df[\"close\"] / df[\"open\"]\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","  
    <th>date</th>\n","      <th>open</th>\n","      <th>high</th>\n","      <th>low</th>\n","      <th>close</th>\n","      <th>adj_close</th>\n","      <th>volume</th>\n","      <th>open_close_delta</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1993-01-29</td>\n","      <td>43.96875</td>\n","      <td>43.96875</td>\n","      <td>43.75000</td>\n","      <td>43.93750</td>\n","      <td>25.799770</td>\n","      <td>1003200</td>\n","      <td>0.999289</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>1993-02-01</td>\n","      <td>43.96875</td>\n","      <td>44.25000</td>\n","      <td>43.96875</td>\n","      <td>44.25000</td>\n","      <td>25.983273</td>\n","      <td>480500</td>\n","      <td>1.006397</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1993-02-02</td>\n","      <td>44.21875</td>\n","      <td>44.37500</td>\n","      <td>44.12500</td>\n","      <td>44.34375</td>\n","      <td>26.038315</td>\n","      <td>201300</td>\n","      <td>1.002827</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>1993-02-03</td>\n","      <td>44.40625</td>\n","      <td>44.84375</td>\n","      <td>44.37500</td>\n","      <td>44.81250</td>\n","      <td>26.313566</td>\n","      <td>529400</td>\n","      <td>1.009148</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>1993-02-04</td>\n","      <td>44.96875</td>\n","      <td>45.09375</td>\n","      <td>44.46875</td>\n","      <td>45.00000</td>\n","      <td>26.423655</td>\n","      <td>531500</td>\n","      <td>1.000695</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["        date      open      high  ...  adj_close   volume  open_close_delta\n","0 1993-01-29  43.96875  43.96875  ...  25.799770  1003200          0.999289\n","1 1993-02-01  43.96875  44.25000  ...  25.983273   480500          1.006397\n","2 1993-02-02  44.21875  44.37500  ...  
26.038315   201300          1.002827\n","3 1993-02-03  44.40625  44.84375  ...  26.313566   529400          1.009148\n","4 1993-02-04  44.96875  45.09375  ...  26.423655   531500          1.000695\n","\n","[5 rows x 8 columns]"]},"metadata":{},"execution_count":7}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":307},"id":"xmP32uclbuLx","executionInfo":{"status":"ok","timestamp":1631992634824,"user_tz":240,"elapsed":9,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"5807a038-d61b-4717-b62d-ae455b6a10e4"},"source":["df[\"day_of_week\"] = df[\"date\"].dt.dayofweek\n","df[\"day_of_week\"] = df[\"day_of_week\"].apply(lambda x: \"monday\" if x == 0 else x)\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>date</th>\n","      <th>open</th>\n","      <th>high</th>\n","      <th>low</th>\n","      <th>close</th>\n","      <th>adj_close</th>\n","      <th>volume</th>\n","      <th>open_close_delta</th>\n","      <th>day_of_week</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1993-01-29</td>\n","      <td>43.96875</td>\n","      <td>43.96875</td>\n","      <td>43.75000</td>\n","      <td>43.93750</td>\n","      <td>25.799770</td>\n","      <td>1003200</td>\n","      <td>0.999289</td>\n","      <td>4</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>1993-02-01</td>\n","      
<td>43.96875</td>\n","      <td>44.25000</td>\n","      <td>43.96875</td>\n","      <td>44.25000</td>\n","      <td>25.983273</td>\n","      <td>480500</td>\n","      <td>1.006397</td>\n","      <td>monday</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>1993-02-02</td>\n","      <td>44.21875</td>\n","      <td>44.37500</td>\n","      <td>44.12500</td>\n","      <td>44.34375</td>\n","      <td>26.038315</td>\n","      <td>201300</td>\n","      <td>1.002827</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>1993-02-03</td>\n","      <td>44.40625</td>\n","      <td>44.84375</td>\n","      <td>44.37500</td>\n","      <td>44.81250</td>\n","      <td>26.313566</td>\n","      <td>529400</td>\n","      <td>1.009148</td>\n","      <td>2</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>1993-02-04</td>\n","      <td>44.96875</td>\n","      <td>45.09375</td>\n","      <td>44.46875</td>\n","      <td>45.00000</td>\n","      <td>26.423655</td>\n","      <td>531500</td>\n","      <td>1.000695</td>\n","      <td>3</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["        date      open      high  ...   volume  open_close_delta  day_of_week\n","0 1993-01-29  43.96875  43.96875  ...  1003200          0.999289            4\n","1 1993-02-01  43.96875  44.25000  ...   480500          1.006397       monday\n","2 1993-02-02  44.21875  44.37500  ...   201300          1.002827            1\n","3 1993-02-03  44.40625  44.84375  ...   529400          1.009148            2\n","4 1993-02-04  44.96875  45.09375  ...   
531500          1.000695            3\n","\n","[5 rows x 9 columns]"]},"metadata":{},"execution_count":8}]},{"cell_type":"markdown","metadata":{"id":"44Pr1QsabryC"},"source":["## Define Label"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"J6CiP2GpReQ2","executionInfo":{"status":"ok","timestamp":1631992634824,"user_tz":240,"elapsed":8,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"79c44bda-0d97-4e01-9358-d1c599f31706"},"source":["df[\"return\"] = df[\"adj_close\"] / df[\"adj_close\"].shift(1)\n","df[\"label\"] = df[\"return\"].shift(-1) # today's features are used to forecast tomorrow's return\n","# setup label as a classification problem {0, 1}\n","df[\"label\"] = df[\"label\"].apply(lambda x: 1.0 if x > 1.005 else 0.0)\n","print(df.loc[:, [\"date\", \"adj_close\", \"return\", \"label\"]].head(5))\n","print(df[\"label\"].value_counts(ascending=False))"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["        date  adj_close    return  label\n","0 1993-01-29  25.799770       NaN    1.0\n","1 1993-02-01  25.983273  1.007113    0.0\n","2 1993-02-02  26.038315  1.002118    1.0\n","3 1993-02-03  26.313566  1.010571    0.0\n","4 1993-02-04  26.423655  1.004184    0.0\n","0.0    5144\n","1.0    2049\n","Name: label, dtype: int64\n"]}]},{"cell_type":"markdown","metadata":{"id":"xj5NGSs9Zp1i"},"source":["## Train/test split\n","Time series dataset: Train test split by date to avoid leakage"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_KiNBDEtY7VY","executionInfo":{"status":"ok","timestamp":1631992634960,"user_tz":240,"elapsed":7,"user":{"displayName":"Adam 
Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"f3e00f4e-f5fb-4afd-8b0f-72464aa376cc"},"source":["train_df = df.loc[pd.Timestamp(\"2016-12-31\") >= df[\"date\"], :]\n","print(len(train_df))\n","print(train_df[\"date\"].describe())\n","test_df = df.loc[pd.Timestamp(\"2016-12-31\") < df[\"date\"], :]\n","print(len(test_df))\n","print(test_df[\"date\"].describe())"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["6026\n","count                    6026\n","unique                   6026\n","top       1999-08-23 00:00:00\n","freq                        1\n","first     1993-01-29 00:00:00\n","last      2016-12-30 00:00:00\n","Name: date, dtype: object\n","1167\n","count                    1167\n","unique                   1167\n","top       2017-10-30 00:00:00\n","freq                        1\n","first     2017-01-03 00:00:00\n","last      2021-08-20 00:00:00\n","Name: date, dtype: object\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n","  This is separate from the ipykernel package so we can avoid doing imports until\n","/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. 
Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n","  \n"]}]},{"cell_type":"markdown","metadata":{"id":"AlfnH-gwwAUX"},"source":["## Feature transformation pipeline"]},{"cell_type":"code","metadata":{"id":"WGX99oBTXkqj"},"source":["numerical_features = [\"volume\", \"open_close_delta\"]\n","categorical_features = [\"day_of_week\"]"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"DQsWyNcFW5_s"},"source":["class FeatureSelector(BaseEstimator, TransformerMixin):\n","    \"\"\"Pipeline step that keeps only the named columns of a DataFrame.\n","\n","    Parameters\n","    ----------\n","    feature_names : list of str\n","        Column labels handed to ``X.loc[:, feature_names]`` in ``transform``.\n","    \"\"\"\n","    def __init__(self, feature_names):\n","        self.feature_names = feature_names\n","    def fit(self, X, y=None):\n","        \"\"\"Nothing is learned from the data; returns self.\"\"\"\n","        return self\n","    def transform(self, X, y=None):\n","        \"\"\"Return a deep copy of the selected columns of X.\"\"\"\n","        return X.loc[:, self.feature_names].copy(deep=True)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"cojolSPBXXmu"},"source":["numerical_pipeline = Pipeline(steps = [ \n","    (\"num_selector\", FeatureSelector(numerical_features)),\n","    (\"imputer\", SimpleImputer(strategy=\"median\")),\n","    (\"std_scaler\", StandardScaler()) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"MwPqBp3EYeGQ"},"source":["categorical_pipeline = Pipeline(steps = [ \n","    (\"num_selector\", FeatureSelector(categorical_features)),\n","    (\"ohe\", OneHotEncoder(\n","        handle_unknown=\"ignore\", \n","        sparse=False,\n","        categories=[\n","            df[\"day_of_week\"].unique()\n","        ])\n","    ) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"ssLPKpHRwMg9"},"source":["### Example feature engineering inside pipeline"]},{"cell_type":"code","metadata":{"id":"1saG5wjPyhcg"},"source":["class DailyTrendFeature(BaseEstimator, TransformerMixin):\n","    \"\"\"Label each row 'up', 'down' or 'flat' from its close/open ratio.\n","\n","    Parameters\n","    ----------\n","    down_threshold : float, default=0.99\n","        Rows whose close/open ratio is strictly below this are labelled 'down'.\n","    up_threshold : float, default=1.01\n","        Rows whose close/open ratio is strictly above this are labelled 'up'.\n","    \"\"\"\n","    def __init__(self, down_threshold=0.99, up_threshold=1.01):\n","        self.down_threshold = down_threshold\n","        self.up_threshold = up_threshold\n","    def fit(self, X, y=None):\n","        \"\"\"Nothing is learned from the data; returns self.\"\"\"\n","        return self\n","    def transform(self, X, y=None):\n","        \"\"\"Return a copy of X with 'open_close_delta' and 'daily_trend' columns added.\n","\n","        X must provide 'open' and 'close' columns.\n","        \"\"\"\n","        # work on a copy so the caller's frame is never modified in place\n","        X = X.copy()\n","        X[\"open_close_delta\"] = X[\"close\"] / X[\"open\"]\n","        # vectorized labelling: NaN ratios fail both comparisons and stay 'flat'\n","        X[\"daily_trend\"] = \"flat\"\n","        X.loc[X[\"open_close_delta\"] < self.down_threshold, 
\"daily_trend\"] = \"down\"\n","        X.loc[X[\"open_close_delta\"] > self.up_threshold, \"daily_trend\"] = \"up\"\n","        return X\n","\n","daily_trend_feature_pipeline = Pipeline(steps = [ \n","    (\"selector\", FeatureSelector([\"open\", \"close\"])),\n","    (\"feature_engineering\", DailyTrendFeature()),\n","    (\"selector_new\", FeatureSelector([\"daily_trend\"])),\n","    (\"ohe\", OneHotEncoder(\n","        handle_unknown=\"ignore\", \n","        sparse=False,\n","        categories=[\n","            [\"up\", \"down\", \"flat\"],\n","        ])\n","    ) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"USuhkKMumkCv","executionInfo":{"status":"ok","timestamp":1631992635108,"user_tz":240,"elapsed":151,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"abb02a0e-a6fe-4897-9a0d-f94f1bab63be"},"source":["def test_new_feature_pipeline():\n","    test_df = train_df.sample(5).copy(deep=True).reset_index()\n","    print(test_df.loc[:, [\"return\"]])\n","    sample_transforms = daily_trend_feature_pipeline.fit_transform(\n","        test_df, \n","        test_df[\"label\"]\n","    )\n","    print(pd.DataFrame(\n","        sample_transforms, \n","        columns=daily_trend_feature_pipeline.named_steps[\"ohe\"].get_feature_names()\n","    
))\n","test_new_feature_pipeline()"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["     return\n","0  0.991621\n","1  0.997277\n","2  0.981978\n","3  0.996962\n","4  1.012702\n","   x0_up  x0_down  x0_flat\n","0    0.0      0.0      1.0\n","1    0.0      0.0      1.0\n","2    0.0      0.0      1.0\n","3    0.0      1.0      0.0\n","4    0.0      0.0      1.0\n"]}]},{"cell_type":"code","metadata":{"id":"GFN-s6e0Z-_I"},"source":["feature_pipeline = FeatureUnion(\n","    n_jobs=-1, \n","    transformer_list=[ \n","        (\"numerical_pipeline\", numerical_pipeline),\n","        (\"categorical_pipeline\", categorical_pipeline),\n","        (\"daily_trend_feature_pipeline\", daily_trend_feature_pipeline),\n","    ]\n",")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":493},"id":"WA0POsiZoGEC","executionInfo":{"status":"ok","timestamp":1631992636373,"user_tz":240,"elapsed":1268,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"37d83718-6f8d-498a-f871-0205ca24895b"},"source":["def test_feature_pipeline():\n","    test_df = train_df.sample(5).copy(deep=True).reset_index()\n","    display(test_df)\n","    feature_pipeline.fit(test_df, test_df[\"label\"])\n","    display(pd.DataFrame(feature_pipeline.transform(test_df),\n","            columns = (\n","                numerical_features \n","                + list(feature_pipeline.transformer_list[1][1][\"ohe\"].get_feature_names())\n","                + list(feature_pipeline.transformer_list[2][1][\"ohe\"].get_feature_names())\n","            )\n","        )\n","    )\n","test_feature_pipeline()"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: 
middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>index</th>\n","      <th>date</th>\n","      <th>open</th>\n","      <th>high</th>\n","      <th>low</th>\n","      <th>close</th>\n","      <th>adj_close</th>\n","      <th>volume</th>\n","      <th>open_close_delta</th>\n","      <th>day_of_week</th>\n","      <th>return</th>\n","      <th>label</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>5215</td>\n","      <td>2013-10-14</td>\n","      <td>169.210007</td>\n","      <td>171.080002</td>\n","      <td>169.080002</td>\n","      <td>170.940002</td>\n","      <td>147.378723</td>\n","      <td>112106000</td>\n","      <td>1.010224</td>\n","      <td>monday</td>\n","      <td>1.003994</td>\n","      <td>0.0</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>5924</td>\n","      <td>2016-08-08</td>\n","      <td>218.399994</td>\n","      <td>218.520004</td>\n","      <td>217.740005</td>\n","      <td>218.050003</td>\n","      <td>198.728592</td>\n","      <td>39906500</td>\n","      <td>0.998397</td>\n","      <td>monday</td>\n","      <td>0.999404</td>\n","      <td>0.0</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>821</td>\n","      <td>1996-04-30</td>\n","      <td>65.437500</td>\n","      <td>65.562500</td>\n","      <td>65.125000</td>\n","      <td>65.390625</td>\n","      <td>41.525272</td>\n","      <td>184400</td>\n","      <td>0.999284</td>\n","      <td>1</td>\n","      <td>0.999284</td>\n","      <td>0.0</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>973</td>\n","      <td>1996-12-04</td>\n","      <td>74.875000</td>\n","      <td>75.062500</td>\n","      <td>74.093750</td>\n","      
<td>74.953125</td>\n","      <td>48.096256</td>\n","      <td>2365100</td>\n","      <td>1.001043</td>\n","      <td>2</td>\n","      <td>1.002717</td>\n","      <td>0.0</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>2043</td>\n","      <td>2001-03-05</td>\n","      <td>124.150002</td>\n","      <td>124.779999</td>\n","      <td>123.809998</td>\n","      <td>124.739998</td>\n","      <td>84.509819</td>\n","      <td>5293200</td>\n","      <td>1.004752</td>\n","      <td>monday</td>\n","      <td>1.009142</td>\n","      <td>1.0</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["   index       date        open  ...  day_of_week    return  label\n","0   5215 2013-10-14  169.210007  ...       monday  1.003994    0.0\n","1   5924 2016-08-08  218.399994  ...       monday  0.999404    0.0\n","2    821 1996-04-30   65.437500  ...            1  0.999284    0.0\n","3    973 1996-12-04   74.875000  ...            2  1.002717    0.0\n","4   2043 2001-03-05  124.150002  ...       
monday  1.009142    1.0\n","\n","[5 rows x 12 columns]"]},"metadata":{}},{"output_type":"display_data","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>volume</th>\n","      <th>open_close_delta</th>\n","      <th>x0_4</th>\n","      <th>x0_monday</th>\n","      <th>x0_1</th>\n","      <th>x0_2</th>\n","      <th>x0_3</th>\n","      <th>x0_up</th>\n","      <th>x0_down</th>\n","      <th>x0_flat</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1.880130</td>\n","      <td>1.728391</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>0.186182</td>\n","      <td>-1.002947</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>-0.745779</td>\n","      <td>-0.798280</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>-0.694616</td>\n","      <td>-0.391867</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","    
</tr>\n","    <tr>\n","      <th>4</th>\n","      <td>-0.625917</td>\n","      <td>0.464703</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>0.0</td>\n","      <td>1.0</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["     volume  open_close_delta  x0_4  x0_monday  ...  x0_3  x0_up  x0_down  x0_flat\n","0  1.880130          1.728391   0.0        1.0  ...   0.0    1.0      0.0      0.0\n","1  0.186182         -1.002947   0.0        1.0  ...   0.0    0.0      0.0      1.0\n","2 -0.745779         -0.798280   0.0        0.0  ...   0.0    0.0      0.0      1.0\n","3 -0.694616         -0.391867   0.0        0.0  ...   0.0    0.0      0.0      1.0\n","4 -0.625917          0.464703   0.0        1.0  ...   0.0    0.0      0.0      1.0\n","\n","[5 rows x 10 columns]"]},"metadata":{}}]},{"cell_type":"markdown","metadata":{"id":"ANUdza9f2cY6"},"source":["## Model"]},{"cell_type":"code","metadata":{"id":"FARmX_UJ2eyk"},"source":["model_pipeline = Pipeline(steps=[\n","    (\"feature_pipeline\", feature_pipeline),\n","    (\"model\", LogisticRegression())\n","])\n","param_grid = [\n","    {\n","        \"feature_pipeline__numerical_pipeline__imputer__strategy\": [\"mean\", \"median\"],\n","        \"model\": [LogisticRegression()],\n","        \"model__C\": [0.1, 1.0, 10],\n","    },\n","    {\n","        \"feature_pipeline__numerical_pipeline__imputer__strategy\": [\"mean\", \"median\"],\n","        \"model\": [RandomForestClassifier()],\n","        # max_depth must be an int (or None); recent scikit-learn versions reject floats\n","        \"model__max_depth\": [3, 5, 7],\n","    }\n","]\n","grid_search = GridSearchCV(\n","    model_pipeline, \n","    param_grid, \n","    cv=TimeSeriesSplit(n_splits=5),\n","    scoring=\"roc_auc\",\n","    refit=True,\n","    n_jobs=-1\n",")\n","# 
grid_search"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"J6CeX92j3g5Y","executionInfo":{"status":"ok","timestamp":1631992807216,"user_tz":240,"elapsed":170846,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c913d8ab-f4eb-4c39-963e-3c8bb11b4207"},"source":["now = datetime.datetime.now()\n","grid_search.fit(train_df, train_df[\"label\"])\n","print(datetime.datetime.now() - now)"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["0:02:50.686868\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9nWIva1Rpfsk","executionInfo":{"status":"ok","timestamp":1631992807217,"user_tz":240,"elapsed":14,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c5b65bac-43d7-45ff-d6d2-8c4a15a207c1"},"source":["print(f\"Best params: {grid_search.best_params_}\")\n","print(f\"Best score: {grid_search.best_score_}\")"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Best params: {'feature_pipeline__numerical_pipeline__imputer__strategy': 'mean', 'model': LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n","                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n","                   multi_class='auto', n_jobs=None, penalty='l2',\n","                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n","                   warm_start=False), 'model__C': 0.1}\n","Best score: 0.5765670811118563\n"]}]},{"cell_type":"markdown","metadata":{"id":"Aj3up6IG5sey"},"source":["## 
Metrics"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"19LMO2kBrx6I","executionInfo":{"status":"ok","timestamp":1631992811877,"user_tz":240,"elapsed":4666,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"659139eb-f9e6-4b91-bb86-375204a53228"},"source":["# ROC AUC ranks by score, so pass the positive-class probability rather than hard 0/1 predictions\n","metrics.roc_auc_score(\n","    y_true=train_df[\"label\"],\n","    y_score=grid_search.predict_proba(train_df)[:, 1],\n","    average=\"weighted\"\n",")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.5041638147080223"]},"metadata":{},"execution_count":22}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"FG6oYfoFpoYF","executionInfo":{"status":"ok","timestamp":1631992812704,"user_tz":240,"elapsed":847,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"2c43416d-28c2-43d1-b4d2-516508eb2ddf"},"source":["# Same scoring on the held-out set: probabilities of the positive class, not predicted labels\n","metrics.roc_auc_score(\n","    y_true=test_df[\"label\"],\n","    y_score=grid_search.predict_proba(test_df)[:, 1],\n","    average=\"weighted\"\n",")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.5009555705544877"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","metadata":{"id":"F9ChyNSLjb8T"},"source":["## Export notebook as HTML"]},{"cell_type":"code","metadata":{"id":"_7nuQJ2GaxyM","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1631998376733,"user_tz":240,"elapsed":1680,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"49819423-c1b4-45b4-a87e-fc619b86eed9"},"source":["%%shell\n","jupyter nbconvert --to html '/content/drive/My 
Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb'"],"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["[NbConvertApp] Converting notebook /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb to html\n","[NbConvertApp] Writing 338767 bytes to /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.html\n"]},{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{},"execution_count":1}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jKfbk43ueWB_","executionInfo":{"status":"ok","timestamp":1631998488759,"user_tz":240,"elapsed":1388,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"2e937b6f-9d60-463b-d268-edf63192ab0e"},"source":["%%shell\n","# ### html with outputs\n","jupyter nbconvert --to html  --no-input --no-prompt '/content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb' --output sklearn_pipe_no_code.html"],"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["[NbConvertApp] Converting notebook /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb to html\n","[NbConvertApp] Writing 299986 bytes to /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe_no_code.html\n"]},{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{},"execution_count":2}]},{"cell_type":"code","metadata":{"id":"3th4ahEqe3Wd"},"source":[""],"execution_count":null,"outputs":[]}]}
  3. lstm_synthetic_data.ipynb lstm_synthetic_data.ipynb
    1
    {
    2
      "nbformat": 4,
    3
      "nbformat_minor": 0,
    4
      "metadata": {
    5
        "colab": {
  4. google_100_carbon_free.ipynb google_100_carbon_free.ipynb
    1
    {
    2
      "nbformat": 4,
    3
      "nbformat_minor": 0,
    4
      "metadata": {
    5
        "colab": {
  5. notebook notebook Public

    Jupyter Notebook