diff --git a/.github/workflows/cicd.yaml b/.github/workflows/docs.yaml similarity index 82% rename from .github/workflows/cicd.yaml rename to .github/workflows/docs.yaml index 46a2000..44fc5da 100644 --- a/.github/workflows/cicd.yaml +++ b/.github/workflows/docs.yaml @@ -1,4 +1,4 @@ -name: cicd +name: docs permissions: contents: write @@ -11,8 +11,9 @@ on: paths: - 'justfile' - 'website/**' + - 'pyproject.toml' - 'dev-requirements.txt' - - '.github/workflows/cicd.yaml' + - '.github/workflows/docs.yaml' jobs: deploy: @@ -31,8 +32,11 @@ jobs: - uses: quarto-dev/quarto-actions/setup@v2 + - name: setup uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + - name: install requirements - run: pip install uv && just setup + run: just setup - name: build site run: | diff --git a/eda.ipynb b/eda.ipynb index 674a137..3f00882 100644 --- a/eda.ipynb +++ b/eda.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -24,6 +24,7 @@ "from icarus.config import (\n", " DATA_DIR,\n", " RAW_DATA_DIR,\n", + " PENGUINS_TABLE,\n", " BUY_SELL_TABLE,\n", " SOCIAL_MEDIA_TABLE,\n", ")\n", @@ -36,9 +37,130 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['buy_sell.delta', 'social_media.delta', 'penguins.delta']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog = Catalog()\n", + "catalog.list_tables()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n", + "┃ extracted_at ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ sex ┃ year ┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n", + "│ string │ string │ string │ float64 │ float64 │ int64 │ int64 │ string │ int64 │\n", + "├────────────────────────────┼─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n", + "│ 2024-08-25T19:17:49.803712 │ Adelie │ Torgersen │ 39.3 │ 20.6 │ 190 │ 3650 │ male │ 2007 │\n", + "│ 2024-08-25T19:17:49.803712 │ Gentoo │ Biscoe │ 43.3 │ 14.0 │ 208 │ 4575 │ female │ 2009 │\n", + "│ 2024-08-25T19:17:49.803712 │ Adelie │ Torgersen │ 38.7 │ 19.0 │ 195 │ 3450 │ female │ 2007 │\n", + "│ 2024-08-25T19:17:49.803712 │ Adelie │ Dream │ 39.5 │ 16.7 │ 178 │ 3250 │ female │ 2007 │\n", + "│ 2024-08-25T19:17:49.803712 │ Adelie │ Dream │ 36.4 │ 17.0 │ 195 │ 3325 │ female │ 2007 │\n", + "│ 2024-08-25T19:17:49.803712 │ Adelie │ Torgersen │ 34.4 │ 18.4 │ 184 │ 3325 │ female │ 2007 │\n", + "│ 2024-08-25T19:17:49.803712 │ Gentoo │ Biscoe │ 52.1 │ 17.0 │ 230 │ 5550 │ male │ 2009 │\n", + "│ 2024-08-25T19:17:49.803712 │ Adelie │ Biscoe │ 38.2 │ 18.1 │ 185 │ 3950 │ male │ 2007 │\n", + "│ 2024-08-25T19:17:49.803712 │ Adelie │ Biscoe │ 35.3 │ 18.9 │ 187 │ 3800 │ female │ 2007 │\n", + "│ 2024-08-25T19:17:49.803712 │ Adelie │ Biscoe │ 40.5 │ 17.9 │ 187 │ 3200 │ female │ 2007 │\n", + "│ … │ … │ … │ … │ … │ … │ … │ … │ … │\n", + "└────────────────────────────┴─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘\n", + "\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mextracted_at\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mspecies\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1misland\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mbill_length_mm\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mbill_depth_mm\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mflipper_length_mm\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mbody_mass_g\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msex\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1myear\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n", + "│ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │\n", + "├────────────────────────────┼─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mAdelie \u001b[0m │ \u001b[32mTorgersen\u001b[0m │ \u001b[1;36m39.3\u001b[0m │ \u001b[1;36m20.6\u001b[0m │ \u001b[1;36m190\u001b[0m │ \u001b[1;36m3650\u001b[0m │ \u001b[32mmale \u001b[0m │ \u001b[1;36m2007\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ 
\u001b[32mGentoo \u001b[0m │ \u001b[32mBiscoe \u001b[0m │ \u001b[1;36m43.3\u001b[0m │ \u001b[1;36m14.0\u001b[0m │ \u001b[1;36m208\u001b[0m │ \u001b[1;36m4575\u001b[0m │ \u001b[32mfemale\u001b[0m │ \u001b[1;36m2009\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mAdelie \u001b[0m │ \u001b[32mTorgersen\u001b[0m │ \u001b[1;36m38.7\u001b[0m │ \u001b[1;36m19.0\u001b[0m │ \u001b[1;36m195\u001b[0m │ \u001b[1;36m3450\u001b[0m │ \u001b[32mfemale\u001b[0m │ \u001b[1;36m2007\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mAdelie \u001b[0m │ \u001b[32mDream \u001b[0m │ \u001b[1;36m39.5\u001b[0m │ \u001b[1;36m16.7\u001b[0m │ \u001b[1;36m178\u001b[0m │ \u001b[1;36m3250\u001b[0m │ \u001b[32mfemale\u001b[0m │ \u001b[1;36m2007\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mAdelie \u001b[0m │ \u001b[32mDream \u001b[0m │ \u001b[1;36m36.4\u001b[0m │ \u001b[1;36m17.0\u001b[0m │ \u001b[1;36m195\u001b[0m │ \u001b[1;36m3325\u001b[0m │ \u001b[32mfemale\u001b[0m │ \u001b[1;36m2007\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mAdelie \u001b[0m │ \u001b[32mTorgersen\u001b[0m │ \u001b[1;36m34.4\u001b[0m │ \u001b[1;36m18.4\u001b[0m │ \u001b[1;36m184\u001b[0m │ \u001b[1;36m3325\u001b[0m │ \u001b[32mfemale\u001b[0m │ \u001b[1;36m2007\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mGentoo \u001b[0m │ \u001b[32mBiscoe \u001b[0m │ \u001b[1;36m52.1\u001b[0m │ \u001b[1;36m17.0\u001b[0m │ \u001b[1;36m230\u001b[0m │ \u001b[1;36m5550\u001b[0m │ \u001b[32mmale \u001b[0m │ \u001b[1;36m2009\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mAdelie \u001b[0m │ \u001b[32mBiscoe \u001b[0m │ \u001b[1;36m38.2\u001b[0m │ \u001b[1;36m18.1\u001b[0m │ \u001b[1;36m185\u001b[0m │ \u001b[1;36m3950\u001b[0m │ \u001b[32mmale \u001b[0m │ \u001b[1;36m2007\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mAdelie \u001b[0m │ \u001b[32mBiscoe \u001b[0m │ \u001b[1;36m35.3\u001b[0m │ \u001b[1;36m18.9\u001b[0m │ \u001b[1;36m187\u001b[0m │ \u001b[1;36m3800\u001b[0m │ \u001b[32mfemale\u001b[0m │ \u001b[1;36m2007\u001b[0m │\n", + "│ \u001b[32m2024-08-25T19:17:49.803712\u001b[0m │ \u001b[32mAdelie \u001b[0m │ \u001b[32mBiscoe \u001b[0m │ \u001b[1;36m40.5\u001b[0m │ \u001b[1;36m17.9\u001b[0m │ \u001b[1;36m187\u001b[0m │ \u001b[1;36m3200\u001b[0m │ \u001b[32mfemale\u001b[0m │ \u001b[1;36m2007\u001b[0m │\n", + "│ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │ \u001b[2m…\u001b[0m │\n", + "└────────────────────────────┴─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t = catalog.table(PENGUINS_TABLE)\n", + "t" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "┌───────────────┐\n", + "│ \u001b[1;35mnp.int64\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m344\u001b[0m\u001b[1m)\u001b[0m │\n", + "└───────────────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t.count()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'115,740.74'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "f\"{10_000_000_000 / (60 * 60 * 24):,.2f}\"" ] @@ -283,7 +405,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.4" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 4930f99..154de73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ packages = ["src/icarus"] [project] name = "icarus-cds" -version = "0.2.0" +version = "0.3.0" authors = [{ name = "Cody", email = "cody@dkdc.dev" }] description = "Python composable data stack solution accelerator" readme = "readme.md" @@ -28,7 +28,7 @@ dependencies = [ # cloud 'gcsfs', # data - 'ibis-framework[duckdb,polars,deltalake]', + 'ibis-framework[duckdb,polars,deltalake,examples]', 'Faker', # visualization 'plotly', diff --git a/src/icarus/catalog.py b/src/icarus/catalog.py index 4365287..09bdf20 100644 --- a/src/icarus/catalog.py +++ b/src/icarus/catalog.py @@ -17,6 +17,9 @@ def delta_table_path(table_name: str) -> str: def read_table(table_name: str) -> ibis.Table: if CLOUD: import gcsfs + import warnings + + warnings.filterwarnings("ignore") fs = gcsfs.GCSFileSystem() ibis.get_backend().register_filesystem(fs) @@ -31,6 +34,9 @@ def read_table(table_name: str) -> ibis.Table: def write_table(t: ibis.Table, table_name: str) -> None: if CLOUD: import gcsfs + import warnings + + warnings.filterwarnings("ignore") fs = gcsfs.GCSFileSystem() ibis.get_backend().register_filesystem(fs) diff --git a/src/icarus/cli.py b/src/icarus/cli.py index 001f8c5..d72eb2c 100644 --- a/src/icarus/cli.py +++ b/src/icarus/cli.py @@ -6,11 +6,13 @@ from icarus.config import ( DATA_DIR, RAW_DATA_DIR, + PENGUINS_TABLE, BUY_SELL_TABLE, SOCIAL_MEDIA_TABLE, ) from icarus.catalog import delta_table_filename -from icarus.investments.run import main as run_main +from icarus.penguins.run import main as penguins_run_main +from icarus.investments.run import main as investments_run_main from icarus.synthetic_data.investments import ( gen_buy_sell_batch, gen_social_media_batch, @@ -25,13 +27,16 @@ # typer apps app = typer.Typer(help="Icarus: soaring beyond limits.", **TYPER_KWARGS) +run_app = typer.Typer(help="Run the ETL job.", **TYPER_KWARGS) clean_app = typer.Typer(help="Clean the data lake.", **TYPER_KWARGS) # add subcommands app.add_typer(clean_app, name="clean") +app.add_typer(run_app, name="run") # add subcommand aliases app.add_typer(clean_app, name="c", hidden=True) +app.add_typer(run_app, name="r", hidden=True) # helper functions @@ -71,9 +76,25 @@ def gen(): typer.echo(f"error: {e}") -@app.command() -@app.command("etl", hidden=True) -def run( +@run_app.command() +def penguins( + override: bool = typer.Option( + False, "--override", "-o", help="Override checks", show_default=True + ), +): + """Run ETL.""" + + # ensure raw data exists + if not override and not check_raw_data_exists(): + return + + # run the ETL job + typer.echo("running ETL job...") + penguins_run_main() + + +@run_app.command() +def investments( override: bool = typer.Option( False, "--override", "-o", help="Override checks", show_default=True ), @@ -86,7 +107,7 @@ def run( # run the ETL job typer.echo("running ETL job...") - run_main() + investments_run_main() @app.command("app") @@ -108,7 +129,7 @@ def clean_lake( if not override and not check_data_lake_exists(): return - tables = 
[BUY_SELL_TABLE, SOCIAL_MEDIA_TABLE] + tables = [PENGUINS_TABLE, BUY_SELL_TABLE, SOCIAL_MEDIA_TABLE] tables = [delta_table_filename(table) for table in tables] for table in tables: diff --git a/src/icarus/config.py b/src/icarus/config.py index 647a882..8bd77ef 100644 --- a/src/icarus/config.py +++ b/src/icarus/config.py @@ -3,5 +3,6 @@ DATA_DIR = "datalake" RAW_DATA_DIR = "_raw" +PENGUINS_TABLE = "penguins" BUY_SELL_TABLE = "buy_sell" SOCIAL_MEDIA_TABLE = "social_media" diff --git a/src/icarus/investments/run.py b/src/icarus/investments/run.py index cf275ac..87d8b97 100644 --- a/src/icarus/investments/run.py +++ b/src/icarus/investments/run.py @@ -20,26 +20,10 @@ def main(): extract_buy_sell_t = extract_buy_sell() extract_social_media_t = extract_social_media() - # data validation - assert ( - extract_buy_sell_t.count().to_pyarrow().as_py() > 0 - ), "No extracted buy/sell data" - assert ( - extract_social_media_t.count().to_pyarrow().as_py() > 0 - ), "No extracted social media data" - # transform transform_buy_sell_t = transform_buy_sell(extract_buy_sell_t) transform_social_media_t = transform_social_media(extract_social_media_t) - # data validation - assert ( - transform_buy_sell_t.count().to_pyarrow().as_py() > 0 - ), "No transformed buy/sell data" - assert ( - transform_social_media_t.count().to_pyarrow().as_py() > 0 - ), "No transformed social media data" - # load catalog.write_table(transform_buy_sell_t, BUY_SELL_TABLE) catalog.write_table(transform_social_media_t, SOCIAL_MEDIA_TABLE) diff --git a/src/icarus/investments/transform.py b/src/icarus/investments/transform.py index 2837113..2574a14 100644 --- a/src/icarus/investments/transform.py +++ b/src/icarus/investments/transform.py @@ -23,25 +23,23 @@ def postprocess(t: ibis.Table) -> ibis.Table: # data assets -def buy_sell(bronze_buy_sell): +def buy_sell(t: ibis.Table) -> ibis.Table: """Transform ticker buy/sell data.""" def transform(t): t = t.mutate(t["buy_sell"].unnest()).unpack("buy_sell") return t - buy_sell = bronze_buy_sell.pipe(preprocess).pipe(transform).pipe(postprocess) + buy_sell = t.pipe(preprocess).pipe(transform).pipe(postprocess) return buy_sell -def social_media(bronze_social_media): +def social_media(t: ibis.Table) -> ibis.Table: """Transform ticker social media data.""" def transform(t): t = t.unpack("social_media_post") return t - social_media = ( - bronze_social_media.pipe(preprocess).pipe(transform).pipe(postprocess) - ) + social_media = t.pipe(preprocess).pipe(transform).pipe(postprocess) return social_media diff --git a/src/icarus/penguins/__init__.py b/src/icarus/penguins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/icarus/penguins/extract.py b/src/icarus/penguins/extract.py new file mode 100644 index 0000000..ddc48be --- /dev/null +++ b/src/icarus/penguins/extract.py @@ -0,0 +1,32 @@ +# imports +import ibis + +from datetime import datetime + +# set extracted_at timestamp +# note we don't use ibis.now() to ensure it's the same... 
+# ...for all tables/rows on a given run +extracted_at = datetime.utcnow().isoformat() + + +# functions +def add_extracted_at(t: ibis.Table) -> ibis.Table: + """Add extracted_at column to table""" + + # add extracted_at column and relocate it to the first position + t = t.mutate(extracted_at=ibis.literal(extracted_at)).relocate("extracted_at") + + return t + + +# data assets +def penguins() -> ibis.Table: + """Extract penguins data""" + + # read in raw data + penguins = ibis.examples.penguins.fetch() + + # add extracted_at column + penguins = penguins.pipe(add_extracted_at) + + return penguins diff --git a/src/icarus/penguins/run.py b/src/icarus/penguins/run.py new file mode 100644 index 0000000..67804ec --- /dev/null +++ b/src/icarus/penguins/run.py @@ -0,0 +1,22 @@ +# imports +from icarus.config import PENGUINS_TABLE +from icarus.catalog import Catalog +from icarus.penguins.extract import ( + penguins as extract_penguins, +) +from icarus.penguins.transform import penguins as transform_penguins + + +# functions +def main(): + # instantiate catalog + catalog = Catalog() + + # extract + extract_penguins_t = extract_penguins() + + # transform + transform_penguins_t = transform_penguins(extract_penguins_t) + + # load + catalog.write_table(transform_penguins_t, PENGUINS_TABLE) diff --git a/src/icarus/penguins/transform.py b/src/icarus/penguins/transform.py new file mode 100644 index 0000000..5492ed7 --- /dev/null +++ b/src/icarus/penguins/transform.py @@ -0,0 +1,33 @@ +# imports +import ibis +import ibis.selectors as s + + +# functions +def preprocess(t: ibis.Table) -> ibis.Table: + """Common preprocessing steps""" + + # ensure unique records + t = t.distinct(on=~s.c("extracted_at"), keep="first").order_by("extracted_at") + + return t + + +def postprocess(t: ibis.Table) -> ibis.Table: + """Common postprocessing steps""" + + # ensure consistent column casing + t = t.rename("snake_case") + + return t + + +# data assets +def penguins(t: ibis.Table) -> ibis.Table: + """Transform penguins data.""" + + def transform(t): + return t + + penguins = t.pipe(preprocess).pipe(transform).pipe(postprocess) + return penguins diff --git a/website/eda.qmd b/website/eda.qmd new file mode 100644 index 0000000..e87f4d3 --- /dev/null +++ b/website/eda.qmd @@ -0,0 +1,159 @@ +--- +error: True +--- + +# Exploratory data analysis (EDA) + +Use this notebook to explore and visualize the data. 
+ +```{python} +import os +import ibis +import ibis.selectors as s + +import plotly.express as px + +from icarus.config import ( + DATA_DIR, + RAW_DATA_DIR, + BUY_SELL_TABLE, + SOCIAL_MEDIA_TABLE, +) +from icarus.catalog import Catalog +from icarus.synthetic_data.investments import fake, seed_table + +px.defaults.template = "plotly_dark" +ibis.options.interactive = True +``` + +```{python} +f"{10_000_000_000 / (60 * 60 * 24):,.2f}" +``` + +```{python} +lookback = ibis.interval(days=1) +step = ibis.interval(seconds=1) +``` + +```{python} +t = ( + ibis.range(ibis.now() - lookback, ibis.now(), step=step) + .unnest() + .name("timestamp") + .as_table() + .mutate( + index=(ibis.row_number().over(order_by="timestamp")), + symbol=ibis.random(), + bid=ibis.random(), + bid_size=ibis.random(), + ask=ibis.random(), + ask_size=ibis.random(), + **{c: 2 * (ibis.random() - 0.5) for c in ["a", "b", "c"]}, + **{ + c: ibis.null(type=str) + for c in list(map(chr, range(ord("d"), ord("z") + 1))) + }, + ) + .relocate("index", "timestamp") +) +t +``` + +```{python} +import random + +symbols = [chr(i) for i in range(ord("A"), ord("Z") + 1)] +symbols = [ + f"{a}{b}{c}{d}" for a in symbols for b in symbols for c in symbols for d in symbols +] +symbols = random.sample(symbols, 500) +symbols[:5] +``` + +```{python} +symbols_case = ibis.case() +for i, symbol in enumerate(symbols): + symbols_case = symbols_case.when( + (i / len(symbols) < ibis._["symbol"]) + & (ibis._["symbol"] < ((i + 1) / len(symbols))), + symbol, + ) +symbols_case = symbols_case.else_(ibis.null()).end() +``` + +```{python} +t = t.mutate( + symbol=symbols_case, + bid_size=(ibis._["bid_size"] * 1000).cast("int64"), + ask_size=(ibis._["ask_size"] * 1000).cast("int64"), +) +t +``` + +```{python} +t +``` + +```{python} +seed_table +``` + +```{python} +catalog = Catalog() +catalog.list_tables() +``` + +```{python} +seed_table +``` + +```{python} +data_glob = os.path.join(DATA_DIR, RAW_DATA_DIR, BUY_SELL_TABLE, "*.parquet") +bs = ibis.read_parquet(data_glob) +bs +``` + +```{python} +bs = catalog.table("buy_sell") +bs +``` + +```{python} +t = bs.group_by("ticker").agg(count=ibis._.count()).order_by(ibis.desc("count")) +t +``` + +```{python} +px.bar( + t, + x="ticker", + y="count", + title="Number of trades per symbol", + color="ticker", +) +``` + +```{python} +bs.count() +``` + +```{python} +bs["location"].value_counts() +``` + +```{python} +f"{bs.count().to_pyarrow().as_py():,}" +``` + +```{python} +data_glob = os.path.join(DATA_DIR, RAW_DATA_DIR, SOCIAL_MEDIA_TABLE, "*.parquet") +sm = ibis.read_parquet(data_glob) +sm +``` + +```{python} +sm = catalog.table("social_media") +sm +``` + + diff --git a/website/onboarding.qmd b/website/onboarding.qmd index 19395da..51589ea 100644 --- a/website/onboarding.qmd +++ b/website/onboarding.qmd @@ -4,33 +4,26 @@ Welcome to Icarus! ## Setup -Clone the repo: +::: {.panel-tabset} -```bash -gh repo clone lostmygithubaccount/Icarus -``` - -Change into it: - -```bash -cd Icarus -``` - -## Installation +## Development -[Install `just`](https://github.com/casey/just). Then: +Install [`gh`](https://github.com/cli/cli) and [`just`](https://github.com/casey/just) and [`uv`](https://github.com/astral-sh/uv), then: ```bash +gh repo clone ibis-project/Icarus +cd Icarus just setup +. .venv/bin/activate ``` -And activate the virtual environment: +## Pip ```bash -. .venv/bin/activate +pip install icarus-cds ``` -The `icarus` CLI is now available. 
+::: ## Generating synthetic data @@ -40,12 +33,12 @@ Run: icarus gen ``` -## Running ETL +## Running a project Run: ```bash -icarus etl +icarus run ``` ## Open application
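
---

For reference, a minimal sketch of how the reworked CLI is expected to be invoked after this change. This is inferred from the `typer` sub-app wiring in `src/icarus/cli.py` above (command names follow the decorated function names, `r` is registered as a hidden alias for the `run` sub-app, and `--override`/`-o` bypasses the raw-data check); it is not an exhaustive reference.

```bash
# run the new penguins ETL pipeline
icarus run penguins

# run the investments ETL pipeline (previously `icarus etl`)
icarus run investments

# hidden alias for the run sub-app
icarus r investments

# skip the raw-data existence check
icarus run penguins --override
```

Note that a bare `icarus run`, as written in `website/onboarding.qmd`, would likely print the sub-app's usage rather than execute a pipeline, since `run` is now a `typer` sub-app that requires a command (exact behavior depends on the `TYPER_KWARGS` defaults).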