
Retire essay fetch (#274)
Change `title_sync` and `load_essays` to skip essays by default, and add a `--no-skip-essays` flag to opt back in. The scheduled `load_essays` Celery task and its beat entry are removed.

Co-authored-by: Chris Adams <cadams@loc.gov>
myusuf and acdha authored Apr 2, 2024
1 parent 6ed48d9 commit 004b098
Showing 5 changed files with 93 additions and 76 deletions.
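
In practice this flips the opt-in: essays are skipped unless a caller asks for them. A minimal usage sketch (assuming the project's standard manage.py entry point and a configured settings module; the flag and option names come straight from the diff below):

    # Shell: essays are skipped by default; the new flag opts back in.
    #   python manage.py title_sync
    #   python manage.py title_sync --no-skip-essays

    # From Python, mirroring how chronam_sync invokes title_sync below:
    from django.core import management
    management.call_command("title_sync", skip_essays=False)  # load essays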
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -55,3 +55,4 @@ repos:
           - flake8-builtins
           - flake8-comprehensions
           - flake8-logging-format
+          - setuptools
5 changes: 0 additions & 5 deletions conf/celeryconfig.py
@@ -16,11 +16,6 @@
 CELERYD_CONCURRENCY = 2

 CELERYBEAT_SCHEDULE = {
-    "load_essays": {
-        "task": "chronam.core.tasks.load_essays",
-        "schedule": crontab(hour=0, minute=0),
-        "args": (),
-    },
     "delete_django_cache": {
         "task": "chronam.core.tasks.delete_django_cache",
         "schedule": crontab(hour=5, minute=0),
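
With that beat entry gone, essay loading no longer runs on a nightly schedule. For context, a sketch of how a beat entry wires a dotted task path to a cron-style schedule (Celery 3.x-era setting names, matching this config; the retired entry fired daily at midnight via crontab(hour=0, minute=0)):

    from celery.schedules import crontab

    CELERYBEAT_SCHEDULE = {
        "delete_django_cache": {
            "task": "chronam.core.tasks.delete_django_cache",  # dotted path to the task
            "schedule": crontab(hour=5, minute=0),  # once a day at 05:00
        },
    }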
56 changes: 36 additions & 20 deletions core/management/commands/chronam_sync.py
@@ -13,23 +13,37 @@


 class Command(LoggingCommand):
-    verbose = make_option('--verbose', action='store_true', dest='verbose', default=False, help='')
+    verbose = make_option("--verbose", action="store_true", dest="verbose", default=False, help="")

     skip_essays = make_option(
-        '--skip-essays', action='store_true', dest='skip_essays', default=False, help='Skip essay loading.'
+        "--skip-essays",
+        action="store_true",
+        dest="skip_essays",
+        default=True,
+        help="Skip essay loading.",
     )
+    no_skip_essays = make_option(
+        "--no-skip-essays",
+        action="store_false",
+        dest="skip_essays",
+        help="Do not skip essay loading.",
+    )

     pull_title_updates = make_option(
-        '--pull-title-updates',
-        action='store_true',
-        dest='pull_title_updates',
+        "--pull-title-updates",
+        action="store_true",
+        dest="pull_title_updates",
         default=False,
-        help='Pull down a new set of titles.',
+        help="Pull down a new set of titles.",
     )

-    option_list = LoggingCommand.option_list + (verbose, skip_essays, pull_title_updates)
-    help = ''  # NOQA: A003
-    args = ''
+    option_list = LoggingCommand.option_list + (
+        verbose,
+        skip_essays,
+        pull_title_updates,
+    )
+    help = ""  # NOQA: A003
+    args = ""

     def handle(self, **options):
         if not (
@@ -46,27 +60,29 @@ def handle(self, **options):
             return

         start = datetime.now()
-        management.call_command('loaddata', 'languages.json')
-        management.call_command('loaddata', 'institutions.json')
-        management.call_command('loaddata', 'ethnicities.json')
-        management.call_command('loaddata', 'labor_presses.json')
-        management.call_command('loaddata', 'countries.json')
+        management.call_command("loaddata", "languages.json")
+        management.call_command("loaddata", "institutions.json")
+        management.call_command("loaddata", "ethnicities.json")
+        management.call_command("loaddata", "labor_presses.json")
+        management.call_command("loaddata", "countries.json")

         bib_in_settings = validate_bib_dir()
         if bib_in_settings:
             # look in BIB_STORAGE for original titles to load
             for filename in os.listdir(bib_in_settings):
-                if filename.startswith('titles-') and filename.endswith('.xml'):
+                if filename.startswith("titles-") and filename.endswith(".xml"):
                     filepath = os.path.join(bib_in_settings, filename)
-                    management.call_command('load_titles', filepath, skip_index=True)
+                    management.call_command("load_titles", filepath, skip_index=True)

         management.call_command(
-            'title_sync', skip_essays=options['skip_essays'], pull_title_updates=options['pull_title_updates']
+            "title_sync",
+            skip_essays=options["skip_essays"],
+            pull_title_updates=options["pull_title_updates"],
         )

         end = datetime.now()
         total_time = end - start
-        self.stdout.write('start time: %s' % start)
-        self.stdout.write('end time: %s' % end)
-        self.stdout.write('total time: %s' % total_time)
+        self.stdout.write("start time: %s" % start)
+        self.stdout.write("end time: %s" % end)
+        self.stdout.write("total time: %s" % total_time)
+        self.stdout.write("chronam_sync done.")
99 changes: 56 additions & 43 deletions core/management/commands/title_sync.py
@@ -27,21 +27,31 @@

 class Command(LoggingCommand):
     skip_essays = make_option(
-        '--skip-essays', action='store_true', dest='skip_essays', default=False, help='Skip essay loading.'
+        "--skip-essays",
+        action="store_true",
+        dest="skip_essays",
+        default=True,
+        help="Skip essay loading.",
     )
+    no_skip_essays = make_option(
+        "--no-skip-essays",
+        action="store_false",
+        dest="skip_essays",
+        help="Do not skip essay loading.",
+    )

     pull_title_updates = make_option(
-        '--pull-title-updates',
-        action='store_true',
-        dest='pull_title_updates',
+        "--pull-title-updates",
+        action="store_true",
+        dest="pull_title_updates",
         default=False,
-        help='Pull down a new set of titles.',
+        help="Pull down a new set of titles.",
     )

     option_list = LoggingCommand.option_list + (skip_essays, pull_title_updates)

-    help = 'Runs title pull and title load for a complete title refresh.'  # NOQA: A003
-    args = ''
+    help = "Runs title pull and title load for a complete title refresh."  # NOQA: A003
+    args = ""

     def find_titles_not_updated(self, limited=True):
         LOGGER.info("Looking for titles not yet updated.")
@@ -50,25 +60,25 @@ def find_titles_not_updated(self, limited=True):
             LOGGER.info("Total number of titles not updated: 0")
             return Title.objects.values()
         elif limited:
-            titles = Title.objects.order_by('-version').values('lccn_orig', 'oclc', 'version')
-            end = titles[0]['version']
+            titles = Title.objects.order_by("-version").values("lccn_orig", "oclc", "version")
+            end = titles[0]["version"]
         else:
-            titles = Title.objects.order_by('-version')
+            titles = Title.objects.order_by("-version")
             end = titles[0].version

         start = end - timedelta(weeks=2)
         titles = titles.exclude(version__range=(start, end))

-        LOGGER.info("Total number of titles not updated: %s" % len(titles))
+        LOGGER.info("Total number of titles not updated: %s", len(titles))
         return titles

     def pull_lccn_updates(self, titles):
         start = datetime.now()
         for t in titles:
-            call_command('pull_titles', lccn=t['lccn_orig'], oclc=t['oclc'])
+            call_command("pull_titles", lccn=t["lccn_orig"], oclc=t["oclc"])
         end = datetime.now()
         total_time = end - start
-        LOGGER.info('total time for pull_lccn_updates: %s' % total_time)
+        LOGGER.info("total time for pull_lccn_updates: %s", total_time)
         return

     def handle(self, *args, **options):
@@ -79,16 +89,16 @@ def handle(self, *args, **options):
         # for folks in the opensource world
         bib_in_settings = validate_bib_dir()
         if bib_in_settings:
-            worldcat_dir = bib_in_settings + '/worldcat_titles/'
+            worldcat_dir = bib_in_settings + "/worldcat_titles/"

-        pull_titles = bool(options['pull_title_updates'] and hasattr(settings, "WORLDCAT_KEY"))
+        pull_titles = bool(options["pull_title_updates"] and hasattr(settings, "WORLDCAT_KEY"))
         if pull_titles:
-            call_command('pull_titles')
+            call_command("pull_titles")

         LOGGER.info("Starting load of OCLC titles.")
-        bulk_dir = worldcat_dir + 'bulk'
+        bulk_dir = worldcat_dir + "bulk"
         if os.path.isdir(bulk_dir):
-            call_command('load_titles', bulk_dir, skip_index=True)
+            call_command("load_titles", bulk_dir, skip_index=True)

         tnu = self.find_titles_not_updated()

@@ -98,15 +108,15 @@ def handle(self, *args, **options):
             self.pull_lccn_updates(tnu)

         LOGGER.info("Loading titles from second title pull.")
-        lccn_dir = worldcat_dir + 'lccn'
+        lccn_dir = worldcat_dir + "lccn"
         if os.path.isdir(lccn_dir):
-            call_command('load_titles', lccn_dir, skip_index=True)
+            call_command("load_titles", lccn_dir, skip_index=True)

         tnu = self.find_titles_not_updated(limited=False)
         LOGGER.info("Running pre-deletion checks for these titles.")

         # Make sure that our essays are up to date
-        if not options['skip_essays']:
+        if not options["skip_essays"]:
             load_essays(settings.ESSAYS_FEED)

         if bib_in_settings:
@@ -116,51 +126,54 @@ def handle(self, *args, **options):
             essays = title.essays.all()
             issues = title.issues.all()

-            error = "DELETION ERROR: Title %s has " % title
-            error_end = "It will not be deleted."
+            error = "DELETION ERROR: Title %s has %s. It will not be deleted."

             if not essays or not issues:
-                delete_txt = (title.name, title.lccn, title.oclc)
-                LOGGER.info('TITLE DELETED: %s, lccn: %s, oclc: %s' % delete_txt)
+                LOGGER.info(
+                    "TITLE DELETED: %s, lccn: %s, oclc: %s",
+                    title.name,
+                    title.lccn,
+                    title.oclc,
+                )
                 title.delete()
             elif essays:
-                LOGGER.warning(error + 'essays.' + error_end)
+                LOGGER.warning(error, title, "essays")
                 continue
             elif issues:
-                LOGGER.warning(error + 'issues.' + error_end)
+                LOGGER.warning(error, title, "issues")
                 continue

         # Load holdings for all remaining titles.
-        call_command('load_holdings')
+        call_command("load_holdings")

         # overlay place info harvested from dbpedia onto the places table
         try:
             self.load_place_links()
-        except Exception as e:
-            LOGGER.exception(e)
+        except Exception:
+            LOGGER.exception("Unhandled exception loading place links")

         index.index_titles()

         # Time of full process run
         end = datetime.now()
         total_time = end - start
-        LOGGER.info('start time: %s' % start)
-        LOGGER.info('end time: %s' % end)
-        LOGGER.info('total time: %s' % total_time)
+        LOGGER.info("start time: %s", start)
+        LOGGER.info("end time: %s", end)
+        LOGGER.info("total time: %s", total_time)
+        LOGGER.info("title_sync done.")

     def load_place_links(self):
-        LOGGER.info('loading place links')
+        LOGGER.info("loading place links")
         _CORE_ROOT = os.path.abspath(os.path.dirname(core.__file__))
-        filename = os.path.join(_CORE_ROOT, './fixtures/place_links.json')
+        filename = os.path.join(_CORE_ROOT, "./fixtures/place_links.json")
         for p in json.load(open(filename)):
             try:
-                place = Place.objects.get(name=p['name'])
-            except (Place.DoesNotExist):
-                place = Place(name=p['name'])
-            place.longitude = p['longitude']
-            place.latitude = p['latitude']
-            place.geonames = p['geonames']
-            place.dbpedia = p['dbpedia']
+                place = Place.objects.get(name=p["name"])
+            except Place.DoesNotExist:
+                place = Place(name=p["name"])
+            place.longitude = p["longitude"]
+            place.latitude = p["latitude"]
+            place.geonames = p["geonames"]
+            place.dbpedia = p["dbpedia"]
             place.save()
-        LOGGER.info('finished loading place links')
+        LOGGER.info("finished loading place links")
8 changes: 0 additions & 8 deletions core/tasks.py
@@ -37,14 +37,6 @@ def load_batch(batch_dir, service_request=None, process_coordinates=True):
             service_request.fail(str(e))


-@task
-def load_essays():
-    try:
-        management.call_command("load_essays")
-    except Exception:
-        logger.exception("Unable to load essays")
-
-
 @task
 def purge_batch(batch, service_request=None):
     try:
