Merge pull request #1879 from alphagov/remove-dataset-duplicates
Remove dataset duplicates
kentsanggds authored Nov 14, 2024
2 parents ccf107e + 4785561 · commit 65ad16f
Showing 5 changed files with 76 additions and 30 deletions.
@@ -1,11 +1,9 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Set up environment from ckan.ini
# export POSTGRES_URL=<sqlalchemy.url from ckan.ini>
#
# Execute script like this -
# python remove_march2019_duplicates.py
# python remove_duplicates.py - to show impacted datasets
# python remove_duplicates.py run - to delete duplicate datasets
#

import os
@@ -15,7 +13,7 @@

import logging

POSTGRES_URL = os.environ.get('POSTGRES_URL')
POSTGRES_URL = os.environ.get('CKAN_SQLALCHEMY_URL')

logger = logging.getLogger(__name__)
connection = psycopg2.connect(POSTGRES_URL)
@@ -40,9 +38,7 @@ def is_local():
return '@localhost' in POSTGRES_URL


def get_duplicate_datasets():
cursor = connection.cursor()
sql = """
MARCH_2019_SQL = """
WITH duplicates AS
(SELECT COUNT(*) AS duplicate_count, owner_org, title, notes,
package_extra.value AS metadata_date, package.state AS package_state
@@ -74,19 +70,51 @@ def get_duplicate_datasets():
ORDER BY publisher, package.title, pkg_created
""" % ("AND package.metadata_created BETWEEN '2019-03-01' AND '2019-04-01'" if not is_local() else '')

cursor.execute(sql)

NOV_2024_TITLES_SQL = "SELECT title FROM package WHERE state = 'active' GROUP BY title, owner_org " \
"HAVING COUNT(*) > 100;"


# retrieve active package_ids which are matching titles from 2 publishers with more than 100 duplicate datasets
NOV_2024_PACKAGE_IDS_SQL = "SELECT package_extra.package_id FROM package_extra, harvest_object WHERE " \
"harvest_object.id = value AND key = 'harvest_object_id' AND value IN (" \
"SELECT id FROM harvest_object WHERE id IN (" \
"SELECT value FROM package_extra WHERE key = 'harvest_object_id' AND package_id IN (" \
"SELECT id FROM package WHERE title = '%s' AND state = 'active' AND owner_org IN " \
"('c924c995-e063-4f30-bbd3-61418486f0a9', 'b6b50d70-9d5c-4fef-9135-7756cca343c3')))) " \
"ORDER BY metadata_modified_date DESC;"


def get_duplicate_datasets(sql, token=None):
cursor = connection.cursor()
_sql = globals()[sql]
_sql = _sql % token if token else _sql

cursor.execute(_sql)

return cursor


def delete_dataset(dataset):
paster_command = 'paster --plugin=ckan dataset delete {} -c /{}/ckan/ckan.ini'.format(
dataset[0], 'etc' if is_local() else 'var')
command = 'ckan dataset delete {}'.format(
dataset[0])

logger.info('CKAN delete dataset - Running command: %s', command)

try:
subprocess.call(command, shell=True)
except Exception as exception:
logger.error('Subprocess Failed, exception occured: %s', exc_info=exception)


def reindex_dataset(dataset):
command = 'ckan search-index rebuild {}'.format(
dataset[0])

logger.info('CKAN delete dataset - Running command: %s', paster_command)
logger.info('CKAN reindex dataset - Running command: %s', command)

try:
subprocess.call(paster_command, shell=True)
subprocess.call(command, shell=True)
except Exception as exception:
logger.error('Subprocess Failed, exception occured: %s', exc_info=exception)

@@ -110,20 +138,20 @@ def reindex_solr():
for line in f.readlines():
fields = line.split(',')

paster_command = 'paster --plugin=ckan search-index rebuild {} -c /{}/ckan/ckan.ini'.format(
fields[0], 'etc' if is_local() else 'var')
command = 'ckan search-index rebuild {}'.format(
fields[0])

logger.info('CKAN reindex - Running command: %s', paster_command)
logger.info('CKAN reindex - Running command: %s', command)

try:
subprocess.call(paster_command, shell=True)
subprocess.call(command, shell=True)
except Exception as exception:
logger.error('Subprocess Failed, exception occured: %s', exc_info=exception)


def main(command=None):
def main(command=None, sql="NOV_2024_TITLES_SQL", subset_sql="NOV_2024_PACKAGE_IDS_SQL"):
while command not in ['show', 'run', 'reindex']:
command = raw_input('(Options: show, run, reindex) show? ')
command = input('(Options: show, run, reindex) show? ')
if not command:
command = 'show'

@@ -147,11 +175,29 @@ def main(command=None):
csv_rows = ''

logger.info('Delete duplicate datasets')
for i, dataset in enumerate(get_duplicate_datasets()):
logger.info('%d - %r', i, dataset)
if run:
csv_rows += ','.join(dataset) + '\n'
delete_dataset(dataset)
counter = 0
for dataset in get_duplicate_datasets(sql):
if subset_sql:
reindexed_dataset = False
for subset_dataset in get_duplicate_datasets(subset_sql, token=dataset):
# reindex the latest dataset to make it available
if not reindexed_dataset:
reindex_dataset(subset_dataset)
reindexed_dataset = True
continue

counter += 1

logger.info('%d - %r', counter, f"{dataset}-{subset_dataset}")
if run:
csv_rows += ','.join(subset_dataset) + '\n'
delete_dataset(subset_dataset)
else:
counter += 1
logger.info('%d - %r', counter, dataset)
if run:
csv_rows += ','.join(dataset) + '\n'
delete_dataset(dataset)

if run:
create_csv(csv_rows)
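
For reference, a condensed sketch of the flow the updated remove_duplicates.py implements: find titles with more than 100 active copies, then for each title fetch the matching package ids newest-first, reindex the most recent copy so it stays searchable, and delete the rest through the ckan CLI. The environment variable, the owner_org ids and the two ckan commands are taken from the diff above; the helper names and the simplified ORDER BY (the shipped query joins package_extra and harvest_object and sorts on metadata_modified_date) are illustrative assumptions, not the committed code.

# Illustrative sketch only -- see the full diff above for the committed script.
import os
import subprocess

import psycopg2

# Same environment variable the updated script reads.
POSTGRES_URL = os.environ.get('CKAN_SQLALCHEMY_URL')

# Titles duplicated more than 100 times among active packages (as in NOV_2024_TITLES_SQL).
TITLES_SQL = (
    "SELECT title FROM package WHERE state = 'active' "
    "GROUP BY title, owner_org HAVING COUNT(*) > 100;"
)

# Simplified stand-in for NOV_2024_PACKAGE_IDS_SQL: newest copies first for the
# two affected publishers (ids from the diff). The shipped query resolves the
# harvest_object record and orders on its metadata_modified_date instead.
PACKAGE_IDS_SQL = (
    "SELECT id FROM package "
    "WHERE title = %s AND state = 'active' "
    "AND owner_org IN ('c924c995-e063-4f30-bbd3-61418486f0a9', "
    "'b6b50d70-9d5c-4fef-9135-7756cca343c3') "
    "ORDER BY metadata_modified DESC;"
)


def rows(connection, sql, params=None):
    cursor = connection.cursor()
    if params is None:
        cursor.execute(sql)
    else:
        cursor.execute(sql, params)
    return cursor


def run_cli(command):
    # The script shells out to the CKAN 2.10 CLI (this replaces the old paster commands).
    subprocess.call(command, shell=True)


def dedupe(dry_run=True):
    connection = psycopg2.connect(POSTGRES_URL)
    for (title,) in rows(connection, TITLES_SQL):
        kept_latest = False
        for (package_id,) in rows(connection, PACKAGE_IDS_SQL, (title,)):
            if not kept_latest:
                # Keep the most recent copy and rebuild its search-index entry.
                run_cli('ckan search-index rebuild {}'.format(package_id))
                kept_latest = True
                continue
            print('would delete' if dry_run else 'deleting', package_id)
            if not dry_run:
                run_cli('ckan dataset delete {}'.format(package_id))


if __name__ == '__main__':
    # dry_run=True mirrors "python remove_duplicates.py" (show); False mirrors "run".
    dedupe(dry_run=True)
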
4 changes: 2 additions & 2 deletions build-config.yaml
@@ -2,11 +2,11 @@ apps:
ckan: &app_ckan
name: ckan
version: "2.10.4"
patch: f
patch: g
pycsw: &app_pycsw
name: pycsw
version: "2.6.1"
patch: l
patch: m
solr: &app_solr
name: solr
version: "2.10"
4 changes: 2 additions & 2 deletions docker/ckan/2.10.4-base.Dockerfile
@@ -1,4 +1,4 @@
ARG BASE_IMAGE=ghcr.io/alphagov/ckan:2.10.4-f-core
ARG BASE_IMAGE=ghcr.io/alphagov/ckan:2.10.4-g-core
FROM --platform=$TARGETPLATFORM ${BASE_IMAGE}

COPY production.ini $CKAN_CONFIG/production.ini
@@ -15,7 +15,7 @@ ENV ckan_harvest_sha='9fb44f79809a1c04dfeb0e1ca2540c5ff3cacef4'
ENV ckan_dcat_fork='ckan'
ENV ckan_dcat_sha='618928be5a211babafc45103a72b6aab4642e964'

ENV ckan_spatial_sha='23f9e5d0d07fa411ffea56498167da6a2c9a7df6'
ENV ckan_spatial_sha='1eded8ad2236b3d885e56f9c39ffab52294fd4d0'
ENV ckan_spatial_fork='alphagov'

RUN echo "pip install DGU extensions..." && \
2 changes: 1 addition & 1 deletion docker/ckan/2.10.4.Dockerfile
@@ -1,4 +1,4 @@
FROM ghcr.io/alphagov/ckan:2.10.4-f-base
FROM ghcr.io/alphagov/ckan:2.10.4-g-base

USER root

2 changes: 1 addition & 1 deletion docker/pycsw/2.6.1.Dockerfile
@@ -98,7 +98,7 @@ WORKDIR $CKAN_VENV/src
USER ckan
EXPOSE 5000

ENV ckan_spatial_sha='23f9e5d0d07fa411ffea56498167da6a2c9a7df6'
ENV ckan_spatial_sha='1eded8ad2236b3d885e56f9c39ffab52294fd4d0'
ENV ckan_spatial_fork='alphagov'

ENV ckan_harvest_fork='ckan'
