diff --git a/.github/workflows/cicd.yaml b/.github/workflows/docs.yaml
similarity index 82%
rename from .github/workflows/cicd.yaml
rename to .github/workflows/docs.yaml
index 46a2000..44fc5da 100644
--- a/.github/workflows/cicd.yaml
+++ b/.github/workflows/docs.yaml
@@ -1,4 +1,4 @@
-name: cicd
+name: docs
 
 permissions:
   contents: write
@@ -11,8 +11,9 @@ on:
     paths:
       - 'justfile'
       - 'website/**'
+      - 'pyproject.toml'
       - 'dev-requirements.txt'
-      - '.github/workflows/cicd.yaml'
+      - '.github/workflows/docs.yaml'
 
 jobs:
   deploy:
@@ -31,8 +32,11 @@ jobs:
 
       - uses: quarto-dev/quarto-actions/setup@v2
 
+      - name: setup uv
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+
       - name: install requirements
-        run: pip install uv && just setup
+        run: just setup
 
       - name: build site
         run: |
diff --git a/eda.ipynb b/eda.ipynb
index 674a137..3f00882 100644
--- a/eda.ipynb
+++ b/eda.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,6 +24,7 @@
     "from icarus.config import (\n",
     "    DATA_DIR,\n",
     "    RAW_DATA_DIR,\n",
+    "    PENGUINS_TABLE,\n",
     "    BUY_SELL_TABLE,\n",
     "    SOCIAL_MEDIA_TABLE,\n",
     ")\n",
@@ -36,9 +37,130 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['buy_sell.delta', 'social_media.delta', 'penguins.delta']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "catalog = Catalog()\n",
+    "catalog.list_tables()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓\n",
+       "┃ extracted_at               ┃ species ┃ island    ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ sex    ┃ year  ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩\n",
+       "│ string                     │ string  │ string    │ float64        │ float64       │ int64             │ int64       │ string │ int64 │\n",
+       "├────────────────────────────┼─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤\n",
+       "│ 2024-08-25T19:17:49.803712 │ Adelie  │ Torgersen │           39.3 │          20.6 │               190 │        3650 │ male   │  2007 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Gentoo  │ Biscoe    │           43.3 │          14.0 │               208 │        4575 │ female │  2009 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Adelie  │ Torgersen │           38.7 │          19.0 │               195 │        3450 │ female │  2007 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Adelie  │ Dream     │           39.5 │          16.7 │               178 │        3250 │ female │  2007 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Adelie  │ Dream     │           36.4 │          17.0 │               195 │        3325 │ female │  2007 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Adelie  │ Torgersen │           34.4 │          18.4 │               184 │        3325 │ female │  2007 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Gentoo  │ Biscoe    │           52.1 │          17.0 │               230 │        5550 │ male   │  2009 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Adelie  │ Biscoe    │           38.2 │          18.1 │               185 │        3950 │ male   │  2007 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Adelie  │ Biscoe    │           35.3 │          18.9 │               187 │        3800 │ female │  2007 │\n",
+       "│ 2024-08-25T19:17:49.803712 │ Adelie  │ Biscoe    │           40.5 │          17.9 │               187 │        3200 │ female │  2007 │\n",
+       "│ …                          │ …       │ …         │              … │             … │                 … │           … │ …      │     … │\n",
+       "└────────────────────────────┴─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t = catalog.table(PENGUINS_TABLE)\n",
+    "t"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "┌───────────────┐\n",
+       "│ \u001b[1;35mnp.int64\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m344\u001b[0m\u001b[1m)\u001b[0m │\n",
+       "└───────────────┘"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "t.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'115,740.74'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "f\"{10_000_000_000 / (60 * 60 * 24):,.2f}\""
    ]
@@ -283,7 +405,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.8"
+   "version": "3.12.4"
   }
  },
  "nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
index 4930f99..154de73 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ packages = ["src/icarus"]
 
 [project]
 name = "icarus-cds"
-version = "0.2.0"
+version = "0.3.0"
 authors = [{ name = "Cody", email = "cody@dkdc.dev" }]
 description = "Python composable data stack solution accelerator"
 readme = "readme.md"
@@ -28,7 +28,7 @@ dependencies = [
     # cloud
     'gcsfs',
     # data
-    'ibis-framework[duckdb,polars,deltalake]',
+    'ibis-framework[duckdb,polars,deltalake,examples]',
     'Faker',
     # visualization
     'plotly',
diff --git a/src/icarus/catalog.py b/src/icarus/catalog.py
index 4365287..09bdf20 100644
--- a/src/icarus/catalog.py
+++ b/src/icarus/catalog.py
@@ -17,6 +17,9 @@ def delta_table_path(table_name: str) -> str:
 def read_table(table_name: str) -> ibis.Table:
     if CLOUD:
         import gcsfs
+        import warnings
+
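+        # note: this silences every warning in-process, not just gcsfs noise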
+        warnings.filterwarnings("ignore")
 
         fs = gcsfs.GCSFileSystem()
         ibis.get_backend().register_filesystem(fs)
@@ -31,6 +34,9 @@ def read_table(table_name: str) -> ibis.Table:
 def write_table(t: ibis.Table, table_name: str) -> None:
     if CLOUD:
         import gcsfs
+        import warnings
+
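+        # note: this silences every warning in-process, not just gcsfs noise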
+        warnings.filterwarnings("ignore")
 
         fs = gcsfs.GCSFileSystem()
         ibis.get_backend().register_filesystem(fs)
diff --git a/src/icarus/cli.py b/src/icarus/cli.py
index 001f8c5..d72eb2c 100644
--- a/src/icarus/cli.py
+++ b/src/icarus/cli.py
@@ -6,11 +6,13 @@
 from icarus.config import (
     DATA_DIR,
     RAW_DATA_DIR,
+    PENGUINS_TABLE,
     BUY_SELL_TABLE,
     SOCIAL_MEDIA_TABLE,
 )
 from icarus.catalog import delta_table_filename
-from icarus.investments.run import main as run_main
+from icarus.penguins.run import main as penguins_run_main
+from icarus.investments.run import main as investments_run_main
 from icarus.synthetic_data.investments import (
     gen_buy_sell_batch,
     gen_social_media_batch,
@@ -25,13 +27,16 @@
 
 # typer apps
 app = typer.Typer(help="Icarus: soaring beyond limits.", **TYPER_KWARGS)
+run_app = typer.Typer(help="Run the ETL job.", **TYPER_KWARGS)
 clean_app = typer.Typer(help="Clean the data lake.", **TYPER_KWARGS)
 
 # add subcommands
 app.add_typer(clean_app, name="clean")
+app.add_typer(run_app, name="run")
 
 # add subcommand aliases
 app.add_typer(clean_app, name="c", hidden=True)
+app.add_typer(run_app, name="r", hidden=True)
 
 
 # helper functions
@@ -71,9 +76,25 @@ def gen():
         typer.echo(f"error: {e}")
 
 
-@app.command()
-@app.command("etl", hidden=True)
-def run(
+@run_app.command()
+def penguins(
+    override: bool = typer.Option(
+        False, "--override", "-o", help="Override checks", show_default=True
+    ),
+):
+    """Run ETL."""
+
+    # ensure raw data exists
+    if not override and not check_raw_data_exists():
+        return
+
+    # run the ETL job
+    typer.echo("running ETL job...")
+    penguins_run_main()
+
+
+@run_app.command()
+def investments(
     override: bool = typer.Option(
         False, "--override", "-o", help="Override checks", show_default=True
     ),
@@ -86,7 +107,7 @@ def run(
 
     # run the ETL job
     typer.echo("running ETL job...")
-    run_main()
+    investments_run_main()
 
 
 @app.command("app")
@@ -108,7 +129,7 @@ def clean_lake(
     if not override and not check_data_lake_exists():
         return
 
-    tables = [BUY_SELL_TABLE, SOCIAL_MEDIA_TABLE]
+    tables = [PENGUINS_TABLE, BUY_SELL_TABLE, SOCIAL_MEDIA_TABLE]
     tables = [delta_table_filename(table) for table in tables]
 
     for table in tables:
diff --git a/src/icarus/config.py b/src/icarus/config.py
index 647a882..8bd77ef 100644
--- a/src/icarus/config.py
+++ b/src/icarus/config.py
@@ -3,5 +3,6 @@
 
 DATA_DIR = "datalake"
 RAW_DATA_DIR = "_raw"
+PENGUINS_TABLE = "penguins"
 BUY_SELL_TABLE = "buy_sell"
 SOCIAL_MEDIA_TABLE = "social_media"
diff --git a/src/icarus/investments/run.py b/src/icarus/investments/run.py
index cf275ac..87d8b97 100644
--- a/src/icarus/investments/run.py
+++ b/src/icarus/investments/run.py
@@ -20,26 +20,10 @@ def main():
     extract_buy_sell_t = extract_buy_sell()
     extract_social_media_t = extract_social_media()
 
-    # data validation
-    assert (
-        extract_buy_sell_t.count().to_pyarrow().as_py() > 0
-    ), "No extracted buy/sell data"
-    assert (
-        extract_social_media_t.count().to_pyarrow().as_py() > 0
-    ), "No extracted social media data"
-
     # transform
     transform_buy_sell_t = transform_buy_sell(extract_buy_sell_t)
     transform_social_media_t = transform_social_media(extract_social_media_t)
 
-    # data validation
-    assert (
-        transform_buy_sell_t.count().to_pyarrow().as_py() > 0
-    ), "No transformed buy/sell data"
-    assert (
-        transform_social_media_t.count().to_pyarrow().as_py() > 0
-    ), "No transformed social media data"
-
     # load
     catalog.write_table(transform_buy_sell_t, BUY_SELL_TABLE)
     catalog.write_table(transform_social_media_t, SOCIAL_MEDIA_TABLE)
diff --git a/src/icarus/investments/transform.py b/src/icarus/investments/transform.py
index 2837113..2574a14 100644
--- a/src/icarus/investments/transform.py
+++ b/src/icarus/investments/transform.py
@@ -23,25 +23,23 @@ def postprocess(t: ibis.Table) -> ibis.Table:
 
 
 # data assets
-def buy_sell(bronze_buy_sell):
+def buy_sell(t: ibis.Table) -> ibis.Table:
     """Transform ticker buy/sell data."""
 
     def transform(t):
         t = t.mutate(t["buy_sell"].unnest()).unpack("buy_sell")
         return t
 
-    buy_sell = bronze_buy_sell.pipe(preprocess).pipe(transform).pipe(postprocess)
+    buy_sell = t.pipe(preprocess).pipe(transform).pipe(postprocess)
     return buy_sell
 
 
-def social_media(bronze_social_media):
+def social_media(t: ibis.Table) -> ibis.Table:
     """Transform ticker social media data."""
 
     def transform(t):
         t = t.unpack("social_media_post")
         return t
 
-    social_media = (
-        bronze_social_media.pipe(preprocess).pipe(transform).pipe(postprocess)
-    )
+    social_media = t.pipe(preprocess).pipe(transform).pipe(postprocess)
     return social_media
diff --git a/src/icarus/penguins/__init__.py b/src/icarus/penguins/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/icarus/penguins/extract.py b/src/icarus/penguins/extract.py
new file mode 100644
index 0000000..ddc48be
--- /dev/null
+++ b/src/icarus/penguins/extract.py
@@ -0,0 +1,32 @@
+# imports
+import ibis
+
+from datetime import datetime
+
+# set extracted_at timestamp
+# note we don't use ibis.now() to ensure it's the same...
+# ...for all tables/rows on a given run
+extracted_at = datetime.utcnow().isoformat()
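+# (datetime.utcnow() is deprecated as of Python 3.12; datetime.now(timezone.utc) is the forward-compatible spelling)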
+
+
+# functions
+def add_extracted_at(t: ibis.Table) -> ibis.Table:
+    """Add extracted_at column to table"""
+
+    # add extracted_at column and relocate it to the first position
+    t = t.mutate(extracted_at=ibis.literal(extracted_at)).relocate("extracted_at")
+
+    return t
+
+
+# data assets
+def penguins() -> ibis.Table:
+    """Extract penguins data"""
+
+    # read in raw data
+    penguins = ibis.examples.penguins.fetch()
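+    # (requires the ibis-framework 'examples' extra; the dataset is downloaded and cached on first use)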
+
+    # add extracted_at column
+    penguins = penguins.pipe(add_extracted_at)
+
+    return penguins
diff --git a/src/icarus/penguins/run.py b/src/icarus/penguins/run.py
new file mode 100644
index 0000000..67804ec
--- /dev/null
+++ b/src/icarus/penguins/run.py
@@ -0,0 +1,22 @@
+# imports
+from icarus.config import PENGUINS_TABLE
+from icarus.catalog import Catalog
+from icarus.penguins.extract import (
+    penguins as extract_penguins,
+)
+from icarus.penguins.transform import penguins as transform_penguins
+
+
+# functions
+def main():
+    # instantiate catalog
+    catalog = Catalog()
+
+    # extract
+    extract_penguins_t = extract_penguins()
+
+    # transform
+    transform_penguins_t = transform_penguins(extract_penguins_t)
+
+    # load
+    catalog.write_table(transform_penguins_t, PENGUINS_TABLE)
diff --git a/src/icarus/penguins/transform.py b/src/icarus/penguins/transform.py
new file mode 100644
index 0000000..5492ed7
--- /dev/null
+++ b/src/icarus/penguins/transform.py
@@ -0,0 +1,33 @@
+# imports
+import ibis
+import ibis.selectors as s
+
+
+# functions
+def preprocess(t: ibis.Table) -> ibis.Table:
+    """Common preprocessing steps"""
+
+    # ensure unique records
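+    # (on=~s.c("extracted_at") dedupes on every column except extracted_at, keeping the first occurrence)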
+    t = t.distinct(on=~s.c("extracted_at"), keep="first").order_by("extracted_at")
+
+    return t
+
+
+def postprocess(t: ibis.Table) -> ibis.Table:
+    """Common postprocessing steps"""
+
+    # ensure consistent column casing
+    t = t.rename("snake_case")
+
+    return t
+
+
+# data assets
+def penguins(t: ibis.Table) -> ibis.Table:
+    """Transform penguins data."""
+
+    def transform(t):
+        return t
+
+    penguins = t.pipe(preprocess).pipe(transform).pipe(postprocess)
+    return penguins
diff --git a/website/eda.qmd b/website/eda.qmd
new file mode 100644
index 0000000..e87f4d3
--- /dev/null
+++ b/website/eda.qmd
@@ -0,0 +1,159 @@
+---
+error: True
+---
+
+# Exploratory data analysis (EDA)
+
+Use this notebook to explore and visualize the data.
+
+```{python}
+import os
+import ibis
+import ibis.selectors as s
+
+import plotly.express as px
+
+from icarus.config import (
+    DATA_DIR,
+    RAW_DATA_DIR,
+    BUY_SELL_TABLE,
+    SOCIAL_MEDIA_TABLE,
+)
+from icarus.catalog import Catalog
+from icarus.synthetic_data.investments import fake, seed_table
+
+px.defaults.template = "plotly_dark"
+ibis.options.interactive = True
+```
+
+```{python}
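+# back-of-the-envelope: 10 billion rows per day works out to this many rows per second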
+f"{10_000_000_000 / (60 * 60 * 24):,.2f}"
+```
+
+```{python}
+lookback = ibis.interval(days=1)
+step = ibis.interval(seconds=1)
+```
+
+```{python}
+t = (
+    ibis.range(ibis.now() - lookback, ibis.now(), step=step)
+    .unnest()
+    .name("timestamp")
+    .as_table()
+    .mutate(
+        index=(ibis.row_number().over(order_by="timestamp")),
+        symbol=ibis.random(),
+        bid=ibis.random(),
+        bid_size=ibis.random(),
+        ask=ibis.random(),
+        ask_size=ibis.random(),
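+        # a-c: uniform noise in [-1, 1); d-z: null string placeholder columns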
+        **{c: 2 * (ibis.random() - 0.5) for c in ["a", "b", "c"]},
+        **{
+            c: ibis.null(type=str)
+            for c in list(map(chr, range(ord("d"), ord("z") + 1)))
+        },
+    )
+    .relocate("index", "timestamp")
+)
+t
+```
+
+```{python}
+import random
+
+symbols = [chr(i) for i in range(ord("A"), ord("Z") + 1)]
+symbols = [
+    f"{a}{b}{c}{d}" for a in symbols for b in symbols for c in symbols for d in symbols
+]
+symbols = random.sample(symbols, 500)
+symbols[:5]
+```
+
+```{python}
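+# bucket the uniform-random symbol column into len(symbols) equal-width ranges,
+# mapping each bucket to one ticker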
+symbols_case = ibis.case()
+for i, symbol in enumerate(symbols):
+    symbols_case = symbols_case.when(
+        (i / len(symbols) < ibis._["symbol"])
+        & (ibis._["symbol"] < ((i + 1) / len(symbols))),
+        symbol,
+    )
+symbols_case = symbols_case.else_(ibis.null()).end()
+```
+
+```{python}
+t = t.mutate(
+    symbol=symbols_case,
+    bid_size=(ibis._["bid_size"] * 1000).cast("int64"),
+    ask_size=(ibis._["ask_size"] * 1000).cast("int64"),
+)
+t
+```
+
+```{python}
+seed_table
+```
+
+```{python}
+catalog = Catalog()
+catalog.list_tables()
+```
+
+```{python}
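+# read the raw (pre-ETL) buy/sell files straight from the data lake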
+data_glob = os.path.join(DATA_DIR, RAW_DATA_DIR, BUY_SELL_TABLE, "*.parquet")
+bs = ibis.read_parquet(data_glob)
+bs
+```
+
+```{python}
+bs = catalog.table("buy_sell")
+bs
+```
+
+```{python}
+t = bs.group_by("ticker").agg(count=ibis._.count()).order_by(ibis.desc("count"))
+t
+```
+
+```{python}
+px.bar(
+    t,
+    x="ticker",
+    y="count",
+    title="Number of trades per symbol",
+    color="ticker",
+)
+```
+
+```{python}
+bs.count()
+```
+
+```{python}
+bs["location"].value_counts()
+```
+
+```{python}
+f"{bs.count().to_pyarrow().as_py():,}"
+```
+
+```{python}
+data_glob = os.path.join(DATA_DIR, RAW_DATA_DIR, SOCIAL_MEDIA_TABLE, "*.parquet")
+sm = ibis.read_parquet(data_glob)
+sm
+```
+
+```{python}
+sm = catalog.table("social_media")
+sm
+```
+
diff --git a/website/onboarding.qmd b/website/onboarding.qmd
index 19395da..51589ea 100644
--- a/website/onboarding.qmd
+++ b/website/onboarding.qmd
@@ -4,33 +4,26 @@ Welcome to Icarus!
 
 ## Setup
 
-Clone the repo:
+::: {.panel-tabset}
 
-```bash
-gh repo clone lostmygithubaccount/Icarus
-```
-
-Change into it:
-    
-```bash
-cd Icarus
-```
-
-## Installation
+## Development
 
-[Install `just`](https://github.com/casey/just). Then:
+Install [`gh`](https://github.com/cli/cli), [`just`](https://github.com/casey/just), and [`uv`](https://github.com/astral-sh/uv), then:
 
 ```bash
+gh repo clone ibis-project/Icarus
+cd Icarus
 just setup
+. .venv/bin/activate
 ```
 
-And activate the virtual environment:
+## Pip
 
 ```bash
-. .venv/bin/activate
+pip install icarus-cds
 ```
 
-The `icarus` CLI is now available.
+:::
 
 ## Generating synthetic data
 
@@ -40,12 +33,12 @@ Run:
 icarus gen
 ```
 
-## Running ETL
+## Running a project
 
 Run:
 
 ```bash
-icarus etl
+icarus run penguins
 ```
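+
+or, for the investments project:
+
+```bash
+icarus run investments
+```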
 
 ## Open application