diff --git a/README.md b/README.md
index 31188e7..0c4a8ab 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,9 @@ Here's the relational schema of the `.warcdb` file.
 
 ![WarcDB Schema](schema.png)
 
+In addition to the core tables that map to the WARC record types, there are also helper views that make it a bit easier to query data:
+
+- *http_header*: A view of HTTP headers in responses where each row is a tuple of `(warc_record_id, name, value)`; `name` is lower-cased.
 
 ## Motivation
 
diff --git a/warcdb/migrations.py b/warcdb/migrations.py
index 64925ba..43f3fab 100644
--- a/warcdb/migrations.py
+++ b/warcdb/migrations.py
@@ -95,3 +95,25 @@ def m001_initial(db):
             ("warc_concurrent_to", "metadata", "warc_record_id"),
         ],
     )
+
+
+@migration()
+def m002_headers(db):
+    """Create the ``http_header`` helper view.
+
+    The view has one row per element that ``JSON_EACH`` yields from
+    ``response.http_headers``, shaped ``(warc_record_id, name, value)``.
+    Header names are lower-cased; values are left as stored.
+    """
+    db.create_view(
+        "http_header",
+        # json_extract() rather than the ``->>`` operator: ``->>`` needs
+        # SQLite 3.38.0+, and the two agree for a single path argument.
+        """
+        SELECT
+            response.warc_record_id AS warc_record_id,
+            LOWER(JSON_EXTRACT(JSON_EACH.VALUE, '$.header')) AS name,
+            JSON_EXTRACT(JSON_EACH.VALUE, '$.value') AS value
+        FROM response, JSON_EACH(response.http_headers)
+        """,
+    )