From d41118dd45eaf594dd63b38f7bc72c8152190c81 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Sat, 21 Oct 2023 09:51:34 -0400 Subject: [PATCH] Implicitly migrate on import In order not to require an extra `init` step, and for the user to know when to update their database with a `migrate`, we can migrate the database every time a user does an import. This will: 1. ensure that there is a complete database schema to import into 2. ensure that it is up to date with the latest installed version of warcdb Closes #18 --- README.md | 7 +------ tests/test_warcdb.py | 6 ------ warcdb/__init__.py | 18 ++++-------------- 3 files changed, 5 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 4e7f2f9..eaee421 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,6 @@ pip install warcdb ``` ```shell - -# Create the database `archive.warcdb`. -warcdb init archive.warcdb - # Load the `archive.warcdb` file with data. warcdb import archive.warcdb ./tests/google.warc ./tests/frontpages.warc.gz "https://tselai.com/data/google.warc" @@ -44,10 +40,9 @@ Individual `.warc` files are read and parsed and their data is inserted into an ## Schema -If there is a new major or minor version of warcdb you may need to migrate existing databases to use the new database schema (if there have been any changes). To do this you first upgrade warcdb, and then migrate the database: +If there is a new major or minor version of warcdb you may need to migrate existing databases to use the new database schema (if there have been any changes). To do this you first upgrade warcdb, and then import into the database, which will make sure all migrations have been run. 
If you want to migrate the database explicitly you can: ```shell -pip install --upgrade warcdb warcdb migrate archive.warcdb ``` diff --git a/tests/test_warcdb.py b/tests/test_warcdb.py index 4842fef..c4c12fe 100644 --- a/tests/test_warcdb.py +++ b/tests/test_warcdb.py @@ -24,11 +24,6 @@ def test_import(warc_path): runner = CliRunner() - - # initialize db - result = runner.invoke(warcdb_cli, ['init', db_file]) - assert result.exit_code == 0 - args = ["import", db_file, warc_path] result = runner.invoke(warcdb_cli, args) assert result.exit_code == 0 @@ -46,7 +41,6 @@ def test_import(warc_path): def test_column_names(): runner = CliRunner() - runner.invoke(warcdb_cli, ['init', db_file]) runner.invoke(warcdb_cli, ["import", db_file, str(pathlib.Path('tests/google.warc'))]) # make sure that the columns are named correctly (lowercase with underscores) diff --git a/warcdb/__init__.py b/warcdb/__init__.py index f712baf..315d412 100644 --- a/warcdb/__init__.py +++ b/warcdb/__init__.py @@ -225,23 +225,10 @@ def __iadd__(self, r: ArcWarcRecord): "Commands for interacting with .warcdb files\n\nBased on SQLite-Utils" -@warcdb_cli.command('init') -@click.argument( - "db_path", - type=click.Path(file_okay=True, dir_okay=False, exists=False, allow_dash=False), -) -def init (db_path): - """ - Initialize a new warcdb database - """ - db = WarcDB(db_path) - migration.apply(db.db) - - @warcdb_cli.command('import') @click.argument( "db_path", - type=click.Path(file_okay=True, dir_okay=False, exists=True, allow_dash=False), + type=click.Path(file_okay=True, dir_okay=False, allow_dash=False), ) @click.argument('warc_path', type=click.STRING, @@ -256,6 +243,9 @@ def import_(db_path, warc_path, batch_size): """ db = WarcDB(db_path, batch_size=batch_size) + # ensure the schema is there and up to date + migration.apply(db.db) + # if batch_size: # warnings.warn("--batch-size has been temporarily disabled")