feat: pixel driller API #221

Open · wants to merge 15 commits into base: main

Changes from 6 commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
node_modules/
tileserver/raster/data
tileserver/vector/data
tileserver/stacks
incoming_data/
intermediate_data/
*.zip
@@ -20,6 +21,8 @@ public/*.csv
public/**/*.csv
.snakemake

notebooks

# local config
envs
.env*
17 changes: 17 additions & 0 deletions docker-compose.dev.yml
@@ -87,6 +87,23 @@ services:
- "traefik.http.middlewares.raster-tileserver-stripprefix.stripprefix.prefixes=/raster"
- "traefik.http.services.raster-tileserver.loadbalancer.server.port=5000"

pixel-driller:
build: ./pixel_driller
volumes:
- ./tileserver/stacks:/data
- ./pixel_driller:/code
ports:
- 5080:80
environment:
- PIXEL_STACK_DATA_DIR=/data
labels:
- "traefik.enable=true"
- "traefik.http.routers.pixel-driller.rule=Host(`localhost`) && PathPrefix(`/pixel`)"
- "traefik.http.routers.pixel-driller.entrypoints=web"
- "traefik.http.routers.pixel-driller.middlewares=pixel-driller-stripprefix"
- "traefik.http.middlewares.pixel-driller-stripprefix.stripprefix.prefixes=/pixel"
- "traefik.http.services.pixel-driller.loadbalancer.server.port=80"

db:
image: kartoza/postgis:14-3.1
volumes:
2 changes: 2 additions & 0 deletions pixel_driller/.env.example
@@ -0,0 +1,2 @@
DATA_SOURCE_DIR=../tileserver/raster/data
ADMIN_CODE=something_very_secret_but_url_encodable
5 changes: 5 additions & 0 deletions pixel_driller/.gitignore
@@ -0,0 +1,5 @@
.venv/*
dummy_data/*
tests/fixtures/output/*
tests/fixtures/test.zarr
.idea/*
12 changes: 12 additions & 0 deletions pixel_driller/Dockerfile
@@ -0,0 +1,12 @@
FROM python:3.12

WORKDIR /code

COPY . /code

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# If running behind a proxy like Nginx or Traefik add --proxy-headers
CMD ["fastapi", "run", "main.py", "--port", "80", "--proxy-headers"]
44 changes: 44 additions & 0 deletions pixel_driller/README.md
@@ -0,0 +1,44 @@
# Prototyping a pixel query endpoint

Run tests:

```bash
pushd tests/fixtures
python make_fixtures.py
popd

python -m unittest tests/test_*.py
```

Run ingest:

```bash
mkdir -p ../tileserver/stacks
python ingest.py /path/to/jamaica-infrastructure/processed_data/ ../tileserver/stacks
```

Backup to tar:

```bash
tar cvf 2024-12-05_jamaica.infrastructureresilience.org_tileserver_stacks.tar tileserver/stacks
```

Run service:

```bash
docker compose -f docker-compose.dev.yml up pixel-driller
```

```bash
curl http://localhost:5080/-78.0/18.5
```

Run the whole app:

```bash
docker compose -f docker-compose.dev.yml up
```

```bash
curl http://localhost/pixel/-78.0/18.5
```
Empty file added pixel_driller/__init__.py
108 changes: 108 additions & 0 deletions pixel_driller/ingest.py
@@ -0,0 +1,108 @@
"""Stack rasters into queryable format
"""

import sys
from pathlib import Path

import pandas as pd
import snail.intersection
import xarray as xr
from tqdm.auto import tqdm


def read_grids(
    source_path: Path, layers: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read grid metadata for each raster layer, assigning a shared id to
    each unique grid definition."""
    # First read metadata from each raster file
    # keep track of unique grid definitions, give them ids
grid_lookup: dict[snail.intersection.GridDefinition, str] = {}
layer_grid_ids: list[str] = []

for layer in tqdm(
layers.itertuples(), total=len(layers), desc="Reading layer metadata"
):
grid_path = source_path / layer.path
grid = snail.intersection.GridDefinition.from_raster(grid_path)
if grid not in grid_lookup:
grid_id = f"grid_{len(grid_lookup)}"
grid_lookup[grid] = grid_id
else:
grid_id = grid_lookup[grid]
layer_grid_ids.append(grid_id)

# Transform unique grid definitions into data table for reference as metadata
grid_data = []
for grid, grid_id in grid_lookup.items():
grid_data.append(
{
"crs": str(grid.crs),
"width": grid.width,
"height": grid.height,
"transform": grid.transform,
"grid_id": grid_id,
}
)
grids = pd.DataFrame(grid_data)
layers["grid_id"] = layer_grid_ids

return layers, grids


def stack(
source_path: Path, target_path: Path, layers: pd.DataFrame, grids: pd.DataFrame
):
grid_fname_lookup = grids.set_index("grid_id")

for grid_id, grid_layers in layers.groupby("grid_id"):
var = xr.Variable("key", grid_layers.key.tolist())
layer_paths = grid_layers.path.tolist()
print("Processing", len(layer_paths), "layers for", grid_id)
ds = (
xr.concat(
[
xr.open_dataset(source_path / layer_path, engine="rasterio")
for layer_path in layer_paths
],
dim=var,
)
.squeeze("band", drop=True)
.drop_vars("spatial_ref")
)
        # Trade-off in chunk size vs number of files
        # (smaller chunks -> more files -> slower to write, unknown effect on reads)
        # chunks of   10,   10, 100: wrote in 1m10s, chunk files of 180 B to 22 kB
        # chunks of  100,  100, 100: wrote in 1.9s, chunk files of 11 kB to 1.1 MB
        # chunks of 1000, 1000, 100: wrote in 1.5s, chunk files of 1.1 MB to 9.4 MB
dsc = ds.chunk({"x": 100, "y": 100, "key": 1000})

grid_fname = grid_fname_lookup.loc[grid_id, "fname"]

dsc.to_zarr(target_path / grid_fname, mode="w-")


if __name__ == "__main__":
try:
source_path = Path(sys.argv[1])
target_path = Path(sys.argv[2])
    except IndexError:
        print("Usage: python ingest.py <source_path> <target_path>")
        sys.exit(1)

# CSV is structured like this:
# hazard,path,rp,rcp,epoch,confidence,key
# coastal,hazards/Coastal_flood_data/Flood_maps_future_climate/RCP26_2050/JamaicaJAM001RCP262050_epsg_32618_RP_1.tif,1,2.6,2050,,coastal__rp_1__rcp_2x6__epoch_2050__conf_None
# path is relative to "source_path"
# key is a unique compound string key that encodes (hazard,rp,rcp,epoch,confidence)
layers_without_grid_ids = pd.read_csv(
Path(__file__).parent / ".." / "etl" / "hazard_layers.csv"
)
layers, grids = read_grids(source_path, layers_without_grid_ids)

    # Conventional filename using grid_id - this could instead be driven by
    # the data, e.g. naming stacks after datasets/hazards
grids["fname"] = grids.grid_id.apply(lambda grid_id: f"{grid_id}.zarr")

stack(source_path, target_path, layers, grids)

layers.to_csv(target_path / "layers.csv")
grids.to_csv(target_path / "stacks.csv")
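As a sanity check after ingest, a minimal sketch for inspecting one of the written stacks is below; the paths are assumptions based on the README's ingest invocation, and `stacks.csv` is written by `ingest.py` above.

```python
# Minimal sketch: inspect an ingested stack. Paths are assumptions taken
# from the README's ingest invocation; stacks.csv is written by ingest.py.
from pathlib import Path

import pandas as pd
import xarray as xr

target_path = Path("../tileserver/stacks")
grids = pd.read_csv(target_path / "stacks.csv")
print(grids[["grid_id", "crs", "width", "height"]])

# Open the first stack and check its dimensions and layer keys
ds = xr.open_zarr(target_path / grids.fname.iloc[0])
print(ds.sizes)
print(ds.key.values[:5])
```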
26 changes: 26 additions & 0 deletions pixel_driller/main.py
@@ -0,0 +1,26 @@
import os
from pathlib import Path

import pandas as pd
from fastapi import FastAPI
from pyproj import CRS

from .query import point_query, RasterStackMetadata


def read_metadata(target_path: Path) -> list[RasterStackMetadata]:
datasets = pd.read_csv(target_path / "stacks.csv")
return [
RasterStackMetadata(ds.grid_id, target_path / ds.fname, CRS(ds.crs))
for ds in datasets.itertuples()
]


DATA_PATH = os.getenv("PIXEL_STACK_DATA_DIR", "/data")
DATASETS = read_metadata(Path(DATA_PATH))
app = FastAPI()


@app.get("/{lon}/{lat}")
async def get_values_at_point(lon: float, lat: float):
return point_query(DATASETS, lon, lat)
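To exercise the endpoint in-process without the compose stack, a minimal sketch using FastAPI's TestClient is below. Note that `main.py` reads stack metadata at import time, so `PIXEL_STACK_DATA_DIR` must be set first; the data path here is an assumption.

```python
# Minimal sketch: hit the endpoint in-process with FastAPI's TestClient.
# PIXEL_STACK_DATA_DIR must point at ingested stacks before the import,
# since main.py builds DATASETS at import time. The path is an assumption.
import os

os.environ["PIXEL_STACK_DATA_DIR"] = "../tileserver/stacks"

from fastapi.testclient import TestClient

from pixel_driller.main import app

client = TestClient(app)
response = client.get("/-78.0/18.5")
assert response.status_code == 200
print(response.json())
```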
45 changes: 45 additions & 0 deletions pixel_driller/query.py
@@ -0,0 +1,45 @@
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
import xarray as xr
from pyproj import CRS
from pyproj.transformer import Transformer


@dataclass
class RasterStackMetadata:
"""Metadata about each stack of rasters"""

name: str
path: Path
crs: CRS


def point_query(datasets: list[RasterStackMetadata], lon: float, lat: float):
    """
    Query each raster stack for the values at a specific (lon, lat) coordinate.

    Parameters:
        datasets (list[RasterStackMetadata]): Metadata about the raster stacks.
        lon (float): longitude coordinate
        lat (float): latitude coordinate

    Returns:
        list[dict]: one record per layer, each holding the layer key and the
        pixel value at the queried point.
    """
dfs = []
    for dataset in datasets:
        # always_xy=True so the transformer takes (lon, lat) and returns (x, y),
        # regardless of the axis order each CRS declares
        t = Transformer.from_crs("EPSG:4326", dataset.crs, always_xy=True)
        tx, ty = t.transform(lon, lat)
df = (
xr.open_zarr(dataset.path)
.sel(x=tx, y=ty, method="nearest")
.drop_vars(["x", "y"])
.to_dataframe()
.fillna(0)
.reset_index()
)
dfs.append(df)
out = pd.concat(dfs)
return out.to_dict(orient="records") # could go straight .to_json(orient="records")
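For reference, a minimal sketch of calling `point_query` directly against an ingested stack is below; the stack name, path and CRS are assumptions for illustration, and in practice come from `stacks.csv`.

```python
# Minimal sketch: call point_query directly. The stack name, path and CRS
# are illustrative assumptions; real values come from stacks.csv.
from pathlib import Path

from pyproj import CRS

from pixel_driller.query import RasterStackMetadata, point_query

stack = RasterStackMetadata(
    name="grid_0",
    path=Path("../tileserver/stacks/grid_0.zarr"),
    crs=CRS("EPSG:32618"),
)
records = point_query([stack], lon=-78.0, lat=18.5)
for record in records:
    # "band_data" is the variable name the rasterio engine assigns on ingest
    print(record["key"], record["band_data"])
```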
5 changes: 5 additions & 0 deletions pixel_driller/requirements.txt
@@ -0,0 +1,5 @@
nismod-snail==0.5.3
fastapi[standard]==0.115.6
xarray==2023.10.1
zarr==2.18.3
rasterio==1.3.9
22 changes: 22 additions & 0 deletions pixel_driller/test_main.http
@@ -0,0 +1,22 @@
# Test your FastAPI endpoints

GET http://localhost:5080/1/1
Accept: application/json

###

GET http://localhost:5080/docs
Accept: application/json

###

GET http://localhost/pixel/docs
Accept: application/json

###


GET http://localhost/pixel/1/1
Accept: application/json

###
Empty file added pixel_driller/tests/__init__.py
60 changes: 60 additions & 0 deletions pixel_driller/tests/fixtures/make_fixtures.py
@@ -0,0 +1,60 @@
from pathlib import Path

import numpy as np
import rasterio
from rasterio.transform import from_origin
import xarray as xr


def create_raster(
file_path,
width,
height,
value=0,
crs="EPSG:4326",
):
"""
Creates a single-band raster with constant or random values, occasionally introducing nodata values.

Parameters:
file_path (str): Path to save the raster.
width (int): Width of the raster.
height (int): Height of the raster.
value (int or None): Constant value for the raster. If None, random values are used.
crs (str): Coordinate Reference System for the raster.
nodata_value (float or None): Value to use as nodata. If None, no nodata values are set.
nodata_frequency (float): Probability of a cell being assigned the nodata value.
"""
transform = from_origin(0, 0, 1, 1) # Arbitrary transform
data = np.full((height, width), value, dtype=np.float32)

with rasterio.open(
file_path,
"w",
driver="GTiff",
height=height,
width=width,
count=1,
dtype=np.float32,
crs=crs,
transform=transform,
) as dst:
dst.write(data, 1)


def create_stack():
"""Creates a zarr dataset for testing"""
data = np.arange(16).reshape(2, 2, 4)
da = xr.DataArray(
data,
dims=("x", "y", "key"),
coords={"x": [0.0, 0.1], "y": [0.0, 0.1], "key": ["a", "b", "c", "d"]},
)
ds = xr.Dataset({"band_data": da})
ds.to_zarr("test.zarr", mode="w-")


if __name__ == "__main__":
    # Generate single-band rasters in a directory next to this script
    Path("single_band").mkdir(exist_ok=True)
    for i, letter in enumerate(["a", "b", "c", "d"]):
        create_raster(f"single_band/{letter}.tif", 10, 10, value=i)

create_stack()
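The PR's actual test modules are not shown in this hunk; as an illustration, below is a minimal sketch of a unittest against the `test.zarr` fixture, with expected values derived from `make_fixtures.py` above.

```python
# Minimal sketch of a test against the test.zarr fixture. Expected values
# are derived from make_fixtures.py; the PR's real tests are not shown here.
import unittest
from pathlib import Path

from pyproj import CRS

from pixel_driller.query import RasterStackMetadata, point_query


class TestPointQuery(unittest.TestCase):
    def test_values_at_origin(self):
        stack = RasterStackMetadata(
            name="test",
            path=Path("tests/fixtures/test.zarr"),
            crs=CRS("EPSG:4326"),
        )
        records = point_query([stack], lon=0.0, lat=0.0)
        # make_fixtures.py writes np.arange(16), so keys a-d at (0, 0)
        # hold values 0-3
        self.assertEqual([r["band_data"] for r in records], [0, 1, 2, 3])


if __name__ == "__main__":
    unittest.main()
```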