
Retire essay fetch (#274)
Change `title_sync` and `load_essays` to skip essays by default, and add a `--no-skip-essays` flag to opt back in. The scheduled `load_essays` Celery task and its beat entry are removed.

Co-authored-by: Chris Adams <cadams@loc.gov>
myusuf and acdha authored Apr 2, 2024
1 parent 6ed48d9 commit 004b098
Showing 5 changed files with 93 additions and 76 deletions.
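
In practice this flips the opt-in: essays are skipped unless a caller asks for them. A minimal usage sketch (assuming the project's standard manage.py entry point and a configured settings module; the flag and option names come straight from the diff below):

    # Shell: essays are skipped by default; the new flag opts back in.
    #   python manage.py title_sync
    #   python manage.py title_sync --no-skip-essays

    # From Python, mirroring how chronam_sync invokes title_sync below:
    from django.core import management
    management.call_command("title_sync", skip_essays=False)  # load essays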
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -55,3 +55,4 @@ repos:
           - flake8-builtins
           - flake8-comprehensions
           - flake8-logging-format
+          - setuptools
5 changes: 0 additions & 5 deletions conf/celeryconfig.py
@@ -16,11 +16,6 @@
 CELERYD_CONCURRENCY = 2

 CELERYBEAT_SCHEDULE = {
-    "load_essays": {
-        "task": "chronam.core.tasks.load_essays",
-        "schedule": crontab(hour=0, minute=0),
-        "args": (),
-    },
     "delete_django_cache": {
         "task": "chronam.core.tasks.delete_django_cache",
         "schedule": crontab(hour=5, minute=0),
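
With that beat entry gone, essay loading no longer runs on a nightly schedule. For context, a sketch of how a beat entry wires a dotted task path to a cron-style schedule (Celery 3.x-era setting names, matching this config; the retired entry fired daily at midnight via crontab(hour=0, minute=0)):

    from celery.schedules import crontab

    CELERYBEAT_SCHEDULE = {
        "delete_django_cache": {
            "task": "chronam.core.tasks.delete_django_cache",  # dotted path to the task
            "schedule": crontab(hour=5, minute=0),  # once a day at 05:00
        },
    }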
56 changes: 36 additions & 20 deletions core/management/commands/chronam_sync.py
@@ -13,23 +13,37 @@


 class Command(LoggingCommand):
-    verbose = make_option('--verbose', action='store_true', dest='verbose', default=False, help='')
+    verbose = make_option("--verbose", action="store_true", dest="verbose", default=False, help="")

     skip_essays = make_option(
-        '--skip-essays', action='store_true', dest='skip_essays', default=False, help='Skip essay loading.'
+        "--skip-essays",
+        action="store_true",
+        dest="skip_essays",
+        default=True,
+        help="Skip essay loading.",
     )
+    no_skip_essays = make_option(
+        "--no-skip-essays",
+        action="store_false",
+        dest="skip_essays",
+        help="Do not skip essay loading.",
+    )

     pull_title_updates = make_option(
-        '--pull-title-updates',
-        action='store_true',
-        dest='pull_title_updates',
+        "--pull-title-updates",
+        action="store_true",
+        dest="pull_title_updates",
         default=False,
-        help='Pull down a new set of titles.',
+        help="Pull down a new set of titles.",
     )

-    option_list = LoggingCommand.option_list + (verbose, skip_essays, pull_title_updates)
-    help = ''  # NOQA: A003
-    args = ''
+    option_list = LoggingCommand.option_list + (
+        verbose,
+        skip_essays,
+        pull_title_updates,
+    )
+    help = ""  # NOQA: A003
+    args = ""

     def handle(self, **options):
         if not (
@@ -46,27 +60,29 @@ def handle(self, **options):
             return

         start = datetime.now()
-        management.call_command('loaddata', 'languages.json')
-        management.call_command('loaddata', 'institutions.json')
-        management.call_command('loaddata', 'ethnicities.json')
-        management.call_command('loaddata', 'labor_presses.json')
-        management.call_command('loaddata', 'countries.json')
+        management.call_command("loaddata", "languages.json")
+        management.call_command("loaddata", "institutions.json")
+        management.call_command("loaddata", "ethnicities.json")
+        management.call_command("loaddata", "labor_presses.json")
+        management.call_command("loaddata", "countries.json")

         bib_in_settings = validate_bib_dir()
         if bib_in_settings:
             # look in BIB_STORAGE for original titles to load
             for filename in os.listdir(bib_in_settings):
-                if filename.startswith('titles-') and filename.endswith('.xml'):
+                if filename.startswith("titles-") and filename.endswith(".xml"):
                     filepath = os.path.join(bib_in_settings, filename)
-                    management.call_command('load_titles', filepath, skip_index=True)
+                    management.call_command("load_titles", filepath, skip_index=True)

         management.call_command(
-            'title_sync', skip_essays=options['skip_essays'], pull_title_updates=options['pull_title_updates']
+            "title_sync",
+            skip_essays=options["skip_essays"],
+            pull_title_updates=options["pull_title_updates"],
         )

         end = datetime.now()
         total_time = end - start
-        self.stdout.write('start time: %s' % start)
-        self.stdout.write('end time: %s' % end)
-        self.stdout.write('total time: %s' % total_time)
+        self.stdout.write("start time: %s" % start)
+        self.stdout.write("end time: %s" % end)
+        self.stdout.write("total time: %s" % total_time)
+        self.stdout.write("chronam_sync done.")
99 changes: 56 additions & 43 deletions core/management/commands/title_sync.py
@@ -27,21 +27,31 @@

 class Command(LoggingCommand):
     skip_essays = make_option(
-        '--skip-essays', action='store_true', dest='skip_essays', default=False, help='Skip essay loading.'
+        "--skip-essays",
+        action="store_true",
+        dest="skip_essays",
+        default=True,
+        help="Skip essay loading.",
     )
+    no_skip_essays = make_option(
+        "--no-skip-essays",
+        action="store_false",
+        dest="skip_essays",
+        help="Do not skip essay loading.",
+    )

     pull_title_updates = make_option(
-        '--pull-title-updates',
-        action='store_true',
-        dest='pull_title_updates',
+        "--pull-title-updates",
+        action="store_true",
+        dest="pull_title_updates",
         default=False,
-        help='Pull down a new set of titles.',
+        help="Pull down a new set of titles.",
     )

     option_list = LoggingCommand.option_list + (skip_essays, pull_title_updates)

-    help = 'Runs title pull and title load for a complete title refresh.'  # NOQA: A003
-    args = ''
+    help = "Runs title pull and title load for a complete title refresh."  # NOQA: A003
+    args = ""

     def find_titles_not_updated(self, limited=True):
         LOGGER.info("Looking for titles not yet updated.")
@@ -50,25 +60,25 @@ def find_titles_not_updated(self, limited=True):
             LOGGER.info("Total number of titles not updated: 0")
             return Title.objects.values()
         elif limited:
-            titles = Title.objects.order_by('-version').values('lccn_orig', 'oclc', 'version')
-            end = titles[0]['version']
+            titles = Title.objects.order_by("-version").values("lccn_orig", "oclc", "version")
+            end = titles[0]["version"]
         else:
-            titles = Title.objects.order_by('-version')
+            titles = Title.objects.order_by("-version")
             end = titles[0].version

         start = end - timedelta(weeks=2)
         titles = titles.exclude(version__range=(start, end))

-        LOGGER.info("Total number of titles not updated: %s" % len(titles))
+        LOGGER.info("Total number of titles not updated: %s", len(titles))
         return titles

     def pull_lccn_updates(self, titles):
         start = datetime.now()
         for t in titles:
-            call_command('pull_titles', lccn=t['lccn_orig'], oclc=t['oclc'])
+            call_command("pull_titles", lccn=t["lccn_orig"], oclc=t["oclc"])
         end = datetime.now()
         total_time = end - start
-        LOGGER.info('total time for pull_lccn_updates: %s' % total_time)
+        LOGGER.info("total time for pull_lccn_updates: %s", total_time)
         return

     def handle(self, *args, **options):
@@ -79,16 +89,16 @@ def handle(self, *args, **options):
         # for folks in the opensource world
         bib_in_settings = validate_bib_dir()
         if bib_in_settings:
-            worldcat_dir = bib_in_settings + '/worldcat_titles/'
+            worldcat_dir = bib_in_settings + "/worldcat_titles/"

-        pull_titles = bool(options['pull_title_updates'] and hasattr(settings, "WORLDCAT_KEY"))
+        pull_titles = bool(options["pull_title_updates"] and hasattr(settings, "WORLDCAT_KEY"))
         if pull_titles:
-            call_command('pull_titles')
+            call_command("pull_titles")

         LOGGER.info("Starting load of OCLC titles.")
-        bulk_dir = worldcat_dir + 'bulk'
+        bulk_dir = worldcat_dir + "bulk"
         if os.path.isdir(bulk_dir):
-            call_command('load_titles', bulk_dir, skip_index=True)
+            call_command("load_titles", bulk_dir, skip_index=True)

         tnu = self.find_titles_not_updated()

@@ -98,15 +108,15 @@ def handle(self, *args, **options):
             self.pull_lccn_updates(tnu)

         LOGGER.info("Loading titles from second title pull.")
-        lccn_dir = worldcat_dir + 'lccn'
+        lccn_dir = worldcat_dir + "lccn"
         if os.path.isdir(lccn_dir):
-            call_command('load_titles', lccn_dir, skip_index=True)
+            call_command("load_titles", lccn_dir, skip_index=True)

         tnu = self.find_titles_not_updated(limited=False)
         LOGGER.info("Running pre-deletion checks for these titles.")

         # Make sure that our essays are up to date
-        if not options['skip_essays']:
+        if not options["skip_essays"]:
             load_essays(settings.ESSAYS_FEED)

         if bib_in_settings:
@@ -116,51 +126,54 @@ def handle(self, *args, **options):
             essays = title.essays.all()
             issues = title.issues.all()

-            error = "DELETION ERROR: Title %s has " % title
-            error_end = "It will not be deleted."
+            error = "DELETION ERROR: Title %s has %s. It will not be deleted."

             if not essays or not issues:
-                delete_txt = (title.name, title.lccn, title.oclc)
-                LOGGER.info('TITLE DELETED: %s, lccn: %s, oclc: %s' % delete_txt)
+                LOGGER.info(
+                    "TITLE DELETED: %s, lccn: %s, oclc: %s",
+                    title.name,
+                    title.lccn,
+                    title.oclc,
+                )
                 title.delete()
             elif essays:
-                LOGGER.warning(error + 'essays.' + error_end)
+                LOGGER.warning(error, title, "essays")
                 continue
             elif issues:
-                LOGGER.warning(error + 'issues.' + error_end)
+                LOGGER.warning(error, title, "issues")
                 continue

         # Load holdings for all remaining titles.
-        call_command('load_holdings')
+        call_command("load_holdings")

         # overlay place info harvested from dbpedia onto the places table
         try:
             self.load_place_links()
-        except Exception as e:
-            LOGGER.exception(e)
+        except Exception:
+            LOGGER.exception("Unhandled exception loading place links")

         index.index_titles()

         # Time of full process run
         end = datetime.now()
         total_time = end - start
-        LOGGER.info('start time: %s' % start)
-        LOGGER.info('end time: %s' % end)
-        LOGGER.info('total time: %s' % total_time)
+        LOGGER.info("start time: %s", start)
+        LOGGER.info("end time: %s", end)
+        LOGGER.info("total time: %s", total_time)
+        LOGGER.info("title_sync done.")

     def load_place_links(self):
-        LOGGER.info('loading place links')
+        LOGGER.info("loading place links")
         _CORE_ROOT = os.path.abspath(os.path.dirname(core.__file__))
-        filename = os.path.join(_CORE_ROOT, './fixtures/place_links.json')
+        filename = os.path.join(_CORE_ROOT, "./fixtures/place_links.json")
         for p in json.load(open(filename)):
             try:
-                place = Place.objects.get(name=p['name'])
-            except (Place.DoesNotExist):
-                place = Place(name=p['name'])
-            place.longitude = p['longitude']
-            place.latitude = p['latitude']
-            place.geonames = p['geonames']
-            place.dbpedia = p['dbpedia']
+                place = Place.objects.get(name=p["name"])
+            except Place.DoesNotExist:
+                place = Place(name=p["name"])
+            place.longitude = p["longitude"]
+            place.latitude = p["latitude"]
+            place.geonames = p["geonames"]
+            place.dbpedia = p["dbpedia"]
             place.save()
-        LOGGER.info('finished loading place links')
+        LOGGER.info("finished loading place links")
8 changes: 0 additions & 8 deletions core/tasks.py
@@ -37,14 +37,6 @@ def load_batch(batch_dir, service_request=None, process_coordinates=True):
             service_request.fail(str(e))


-@task
-def load_essays():
-    try:
-        management.call_command("load_essays")
-    except Exception:
-        logger.exception("Unable to load essays")
-
-
 @task
 def purge_batch(batch, service_request=None):
     try:
