From 41848dd3434697c57aa4bd401c2472ee902d8789 Mon Sep 17 00:00:00 2001
From: Ed Summers <ehs@pobox.com>
Date: Sat, 28 Oct 2023 15:49:33 -0400
Subject: [PATCH] Add a view for HTTP headers

This commit adds a migration that creates a view of the HTTP headers in the response table. Once the view is in place you can run a query like this without requiring JSON parsing:

```sql
SELECT warc_record_id, name, value FROM http_header;
```

It can be helpful for identifying things like the most common content types:

```sql
SELECT
  value,
  COUNT(*) AS count
FROM http_header
WHERE name = 'content-type'
GROUP BY value
ORDER BY count DESC;

value                              count
---------------------------------  -----
application/javascript             57
image/png                          11
text/css                           7
text/html; charset=utf-8           6
image/jpeg                         4
image/gif                          4
text/fragment+html; charset=utf-8  3
image/svg+xml                      3
text/plain                         2
text/html; charset=UTF-8           1
```

Closes #24
---
 README.md            |  3 +++
 tests/test_warcdb.py | 16 ++++++++++++++++
 warcdb/migrations.py | 14 ++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/README.md b/README.md
index 31188e7..0c4a8ab 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,9 @@ Here's the relational schema of the `.warcdb` file.
 
 ![WarcDB Schema](schema.png)
 
+In addition to the core tables that map to the WARC record types there are also helper views that make it a bit easier to query data:
+
+- *http_header*: A view of HTTP headers in responses where each row is a tuple of `(warc_record_id, name, value)` 
 
 ## Motivation
 
diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py
index bc6b41c..dc2be10 100644
--- a/tests/test_warcdb.py
+++ b/tests/test_warcdb.py
@@ -64,3 +64,19 @@ def test_column_names():
             assert re.match(r"^[a-z_]+", col.name), f"column {col.name} named correctly"
 
     os.remove(db_file)
+
+
+def test_http_header():  # Import tests/google.warc and check the http_header view.
+    runner = CliRunner()
+    runner.invoke(
+        warcdb_cli, ["import", db_file, str(pathlib.Path("tests/google.warc"))]
+    )
+
+    db = sqlite_utils.Database(db_file)
+    headers = list(db["http_header"].rows)
+    assert len(headers) == 43
+    assert {
+        "name": "content-type",
+        "value": "text/html; charset=UTF-8",
+        "warc_record_id": "<urn:uuid:2008CBED-030B-435B-A4DF-09A842DDB764>",
+    } in headers  # NOTE(review): no os.remove(db_file) cleanup here -- confirm
diff --git a/warcdb/migrations.py b/warcdb/migrations.py
index 64925ba..f5f2582 100644
--- a/warcdb/migrations.py
+++ b/warcdb/migrations.py
@@ -95,3 +95,17 @@ def m001_initial(db):
             ("warc_concurrent_to", "metadata", "warc_record_id"),
         ],
     )
+
+
+@migration()
+def m002_headers(db):  # Add the http_header view (warc_record_id, name, value).
+    db.create_view(  # one view row per JSON_EACH element of response.http_headers
+        "http_header",
+        """
+            SELECT
+                response.warc_record_id AS warc_record_id,
+                LOWER(JSON_EXTRACT(JSON_EACH.VALUE, '$.header')) AS name,
+                JSON_EXTRACT(JSON_EACH.VALUE, '$.value') AS value
+            FROM response, JSON_EACH(response.http_headers)
+        """,
+    )