feat: pixel driller API #221

Open · wants to merge 15 commits into base: main

Changes from 6 commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
node_modules/
tileserver/raster/data
tileserver/vector/data
tileserver/stacks
incoming_data/
intermediate_data/
*.zip
@@ -20,6 +21,8 @@ public/*.csv
public/**/*.csv
.snakemake

notebooks

# local config
envs
.env*
17 changes: 17 additions & 0 deletions docker-compose.dev.yml
@@ -87,6 +87,23 @@ services:
- "traefik.http.middlewares.raster-tileserver-stripprefix.stripprefix.prefixes=/raster"
- "traefik.http.services.raster-tileserver.loadbalancer.server.port=5000"

pixel-driller:
build: ./pixel_driller
volumes:
- ./tileserver/stacks:/data
- ./pixel_driller:/code
ports:
- 5080:80
environment:
- PIXEL_STACK_DATA_DIR=/data
labels:
- "traefik.enable=true"
- "traefik.http.routers.pixel-driller.rule=Host(`localhost`) && PathPrefix(`/pixel`)"
- "traefik.http.routers.pixel-driller.entrypoints=web"
- "traefik.http.routers.pixel-driller.middlewares=pixel-driller-stripprefix"
- "traefik.http.middlewares.pixel-driller-stripprefix.stripprefix.prefixes=/pixel"
- "traefik.http.services.pixel-driller.loadbalancer.server.port=80"

db:
image: kartoza/postgis:14-3.1
volumes:
2 changes: 2 additions & 0 deletions pixel_driller/.env.example
@@ -0,0 +1,2 @@
DATA_SOURCE_DIR=../tileserver/raster/data
ADMIN_CODE=something_very_secret_but_url_encodable
5 changes: 5 additions & 0 deletions pixel_driller/.gitignore
@@ -0,0 +1,5 @@
.venv/*
dummy_data/*
tests/fixtures/output/*
tests/fixtures/test.zarr
.idea/*
12 changes: 12 additions & 0 deletions pixel_driller/Dockerfile
@@ -0,0 +1,12 @@
FROM python:3.12

WORKDIR /code

COPY . /code

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# If running behind a proxy like Nginx or Traefik add --proxy-headers
CMD ["fastapi", "run", "main.py", "--port", "80", "--proxy-headers"]
44 changes: 44 additions & 0 deletions pixel_driller/README.md
@@ -0,0 +1,44 @@
# Prototyping a pixel query endpoint

Run tests:

```bash
pushd tests/fixtures
python make_fixtures.py
popd

python -m unittest tests/test_*.py
```

Run ingest:

```bash
mkdir -p ../tileserver/stacks
python ingest.py /path/to/jamaica-infrastructure/processed_data/ ../tileserver/stacks
```

Backup to tar:

```bash
tar cvf 2024-12-05_jamaica.infrastructureresilience.org_tileserver_stacks.tar tileserver/stacks
```

Run service:

```bash
docker compose -f docker-compose.dev.yml up pixel-driller
```

```bash
curl http://localhost:5080/-78.0/18.5
```

Run the whole app:

```bash
docker compose -f docker-compose.dev.yml up
```

```bash
curl http://localhost/pixel/-78.0/18.5
```
Empty file added pixel_driller/__init__.py
108 changes: 108 additions & 0 deletions pixel_driller/ingest.py
@@ -0,0 +1,108 @@
"""Stack rasters into queryable format
"""

import sys
from pathlib import Path

import pandas as pd
import snail.intersection
import xarray as xr
from tqdm.auto import tqdm


def read_grids(
    source_path: Path, layers: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read grid metadata for each raster layer, assigning a shared id to
    each unique grid definition."""
    # First read metadata from each raster file
    # keep track of unique grid definitions, give them ids
grid_lookup: dict[snail.intersection.GridDefinition, str] = {}
layer_grid_ids: list[str] = []

for layer in tqdm(
layers.itertuples(), total=len(layers), desc="Reading layer metadata"
):
grid_path = source_path / layer.path
grid = snail.intersection.GridDefinition.from_raster(grid_path)
if grid not in grid_lookup:
grid_id = f"grid_{len(grid_lookup)}"
grid_lookup[grid] = grid_id
else:
grid_id = grid_lookup[grid]
layer_grid_ids.append(grid_id)

# Transform unique grid definitions into data table for reference as metadata
grid_data = []
for grid, grid_id in grid_lookup.items():
grid_data.append(
{
"crs": str(grid.crs),
"width": grid.width,
"height": grid.height,
"transform": grid.transform,
"grid_id": grid_id,
}
)
grids = pd.DataFrame(grid_data)
layers["grid_id"] = layer_grid_ids

return layers, grids


def stack(
source_path: Path, target_path: Path, layers: pd.DataFrame, grids: pd.DataFrame
):
grid_fname_lookup = grids.set_index("grid_id")

for grid_id, grid_layers in layers.groupby("grid_id"):
var = xr.Variable("key", grid_layers.key.tolist())
layer_paths = grid_layers.path.tolist()
print("Processing", len(layer_paths), "layers for", grid_id)
ds = (
xr.concat(
[
xr.open_dataset(source_path / layer_path, engine="rasterio")
for layer_path in layer_paths
],
dim=var,
)
.squeeze("band", drop=True)
.drop_vars("spatial_ref")
)
        # Trade-off in chunk size vs number of files
        # (smaller chunks -> more files -> slower to write, unknown effect on reads)
        # chunks of   10,   10, 100: wrote in 1m10s, chunk files of 180 B to 22 kB
        # chunks of  100,  100, 100: wrote in 1.9s, chunk files of 11 kB to 1.1 MB
        # chunks of 1000, 1000, 100: wrote in 1.5s, chunk files of 1.1 MB to 9.4 MB
dsc = ds.chunk({"x": 100, "y": 100, "key": 1000})

grid_fname = grid_fname_lookup.loc[grid_id, "fname"]

dsc.to_zarr(target_path / grid_fname, mode="w-")


if __name__ == "__main__":
try:
source_path = Path(sys.argv[1])
target_path = Path(sys.argv[2])
    except IndexError:
        print("Usage: python ingest.py <source_path> <target_path>")
        sys.exit(1)

# CSV is structured like this:
# hazard,path,rp,rcp,epoch,confidence,key
# coastal,hazards/Coastal_flood_data/Flood_maps_future_climate/RCP26_2050/JamaicaJAM001RCP262050_epsg_32618_RP_1.tif,1,2.6,2050,,coastal__rp_1__rcp_2x6__epoch_2050__conf_None
# path is relative to "source_path"
# key is a unique compound string key that encodes (hazard,rp,rcp,epoch,confidence)
layers_without_grid_ids = pd.read_csv(
Path(__file__).parent / ".." / "etl" / "hazard_layers.csv"
)
layers, grids = read_grids(source_path, layers_without_grid_ids)

    # Conventional filename using grid_id - this could instead be driven by
    # the data, e.g. naming stacks after datasets/hazards
grids["fname"] = grids.grid_id.apply(lambda grid_id: f"{grid_id}.zarr")

stack(source_path, target_path, layers, grids)

layers.to_csv(target_path / "layers.csv")
grids.to_csv(target_path / "stacks.csv")
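As a sanity check after ingest, a minimal sketch for inspecting one of the written stacks is below; the paths are assumptions based on the README's ingest invocation, and `stacks.csv` is written by `ingest.py` above.

```python
# Minimal sketch: inspect an ingested stack. Paths are assumptions taken
# from the README's ingest invocation; stacks.csv is written by ingest.py.
from pathlib import Path

import pandas as pd
import xarray as xr

target_path = Path("../tileserver/stacks")
grids = pd.read_csv(target_path / "stacks.csv")
print(grids[["grid_id", "crs", "width", "height"]])

# Open the first stack and check its dimensions and layer keys
ds = xr.open_zarr(target_path / grids.fname.iloc[0])
print(ds.sizes)
print(ds.key.values[:5])
```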
26 changes: 26 additions & 0 deletions pixel_driller/main.py
@@ -0,0 +1,26 @@
import os
from pathlib import Path

import pandas as pd
from fastapi import FastAPI
from pyproj import CRS

from .query import point_query, RasterStackMetadata


def read_metadata(target_path: Path) -> list[RasterStackMetadata]:
datasets = pd.read_csv(target_path / "stacks.csv")
return [
RasterStackMetadata(ds.grid_id, target_path / ds.fname, CRS(ds.crs))
for ds in datasets.itertuples()
]


DATA_PATH = os.getenv("PIXEL_STACK_DATA_DIR", "/data")
DATASETS = read_metadata(Path(DATA_PATH))
app = FastAPI()


@app.get("/{lon}/{lat}")
async def get_values_at_point(lon: float, lat: float):
return point_query(DATASETS, lon, lat)
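To exercise the endpoint in-process without the compose stack, a minimal sketch using FastAPI's TestClient is below. Note that `main.py` reads stack metadata at import time, so `PIXEL_STACK_DATA_DIR` must be set first; the data path here is an assumption.

```python
# Minimal sketch: hit the endpoint in-process with FastAPI's TestClient.
# PIXEL_STACK_DATA_DIR must point at ingested stacks before the import,
# since main.py builds DATASETS at import time. The path is an assumption.
import os

os.environ["PIXEL_STACK_DATA_DIR"] = "../tileserver/stacks"

from fastapi.testclient import TestClient

from pixel_driller.main import app

client = TestClient(app)
response = client.get("/-78.0/18.5")
assert response.status_code == 200
print(response.json())
```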
45 changes: 45 additions & 0 deletions pixel_driller/query.py
@@ -0,0 +1,45 @@
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
import xarray as xr
from pyproj import CRS
from pyproj.transformer import Transformer


@dataclass
class RasterStackMetadata:
"""Metadata about each stack of rasters"""

name: str
path: Path
crs: CRS


def point_query(datasets: list[RasterStackMetadata], lon: float, lat: float):
    """
    Query each raster stack for the values at a specific (lon, lat) coordinate.

    Parameters:
        datasets (list[RasterStackMetadata]): Metadata about the raster stacks.
        lon (float): longitude coordinate
        lat (float): latitude coordinate

    Returns:
        list[dict]: one record per layer, each holding the layer key and the
        pixel value at the queried point.
    """
dfs = []
    for dataset in datasets:
        # always_xy=True so the transformer takes (lon, lat) and returns (x, y),
        # regardless of the axis order each CRS declares
        t = Transformer.from_crs("EPSG:4326", dataset.crs, always_xy=True)
        tx, ty = t.transform(lon, lat)
df = (
xr.open_zarr(dataset.path)
.sel(x=tx, y=ty, method="nearest")
.drop_vars(["x", "y"])
.to_dataframe()
.fillna(0)
.reset_index()
)
dfs.append(df)
out = pd.concat(dfs)
return out.to_dict(orient="records") # could go straight .to_json(orient="records")
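For reference, a minimal sketch of calling `point_query` directly against an ingested stack is below; the stack name, path and CRS are assumptions for illustration, and in practice come from `stacks.csv`.

```python
# Minimal sketch: call point_query directly. The stack name, path and CRS
# are illustrative assumptions; real values come from stacks.csv.
from pathlib import Path

from pyproj import CRS

from pixel_driller.query import RasterStackMetadata, point_query

stack = RasterStackMetadata(
    name="grid_0",
    path=Path("../tileserver/stacks/grid_0.zarr"),
    crs=CRS("EPSG:32618"),
)
records = point_query([stack], lon=-78.0, lat=18.5)
for record in records:
    # "band_data" is the variable name the rasterio engine assigns on ingest
    print(record["key"], record["band_data"])
```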
5 changes: 5 additions & 0 deletions pixel_driller/requirements.txt
@@ -0,0 +1,5 @@
nismod-snail==0.5.3
fastapi[standard]==0.115.6
xarray==2023.10.1
zarr==2.18.3
rasterio==1.3.9
22 changes: 22 additions & 0 deletions pixel_driller/test_main.http
@@ -0,0 +1,22 @@
# Test your FastAPI endpoints

GET http://localhost:5080/1/1
Accept: application/json

###

GET http://localhost:5080/docs
Accept: application/json

###

GET http://localhost/pixel/docs
Accept: application/json

###


GET http://localhost/pixel/1/1
Accept: application/json

###
Empty file added pixel_driller/tests/__init__.py
60 changes: 60 additions & 0 deletions pixel_driller/tests/fixtures/make_fixtures.py
@@ -0,0 +1,60 @@
from pathlib import Path

import numpy as np
import rasterio
from rasterio.transform import from_origin
import xarray as xr


def create_raster(
file_path,
width,
height,
value=0,
crs="EPSG:4326",
):
"""
Creates a single-band raster with constant or random values, occasionally introducing nodata values.

Parameters:
file_path (str): Path to save the raster.
width (int): Width of the raster.
height (int): Height of the raster.
value (int or None): Constant value for the raster. If None, random values are used.
crs (str): Coordinate Reference System for the raster.
nodata_value (float or None): Value to use as nodata. If None, no nodata values are set.
nodata_frequency (float): Probability of a cell being assigned the nodata value.
"""
transform = from_origin(0, 0, 1, 1) # Arbitrary transform
data = np.full((height, width), value, dtype=np.float32)

with rasterio.open(
file_path,
"w",
driver="GTiff",
height=height,
width=width,
count=1,
dtype=np.float32,
crs=crs,
transform=transform,
) as dst:
dst.write(data, 1)


def create_stack():
"""Creates a zarr dataset for testing"""
data = np.arange(16).reshape(2, 2, 4)
da = xr.DataArray(
data,
dims=("x", "y", "key"),
coords={"x": [0.0, 0.1], "y": [0.0, 0.1], "key": ["a", "b", "c", "d"]},
)
ds = xr.Dataset({"band_data": da})
ds.to_zarr("test.zarr", mode="w-")


if __name__ == "__main__":
    # Generate single-band rasters in a directory next to this script
    Path("single_band").mkdir(exist_ok=True)
    for i, letter in enumerate(["a", "b", "c", "d"]):
        create_raster(f"single_band/{letter}.tif", 10, 10, value=i)

create_stack()
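The PR's actual test modules are not shown in this hunk; as an illustration, below is a minimal sketch of a unittest against the `test.zarr` fixture, with expected values derived from `make_fixtures.py` above.

```python
# Minimal sketch of a test against the test.zarr fixture. Expected values
# are derived from make_fixtures.py; the PR's real tests are not shown here.
import unittest
from pathlib import Path

from pyproj import CRS

from pixel_driller.query import RasterStackMetadata, point_query


class TestPointQuery(unittest.TestCase):
    def test_values_at_origin(self):
        stack = RasterStackMetadata(
            name="test",
            path=Path("tests/fixtures/test.zarr"),
            crs=CRS("EPSG:4326"),
        )
        records = point_query([stack], lon=0.0, lat=0.0)
        # make_fixtures.py writes np.arange(16), so keys a-d at (0, 0)
        # hold values 0-3
        self.assertEqual([r["band_data"] for r in records], [0, 1, 2, 3])


if __name__ == "__main__":
    unittest.main()
```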