Store sequence metadata locally (#164)

* Get name and references from local metadata Use local sequence file reads as drop-in replacements for OEIS searches. Update tests to use sequences' real names instead of placeholders, because names and raw references are now written to the database almost immediately after `fetch_metadata` is called. Since we're still fetching names and raw references asynchronously, this could make the tests flaky. * Add instructions for downloading the OEIS metadata * Stop using the first B-file comment as stopgap name The `get_oeis_name_and_values` endpoint only looks up a sequence's name if the name field in the database is either empty or set to the placeholder name. As a result, this endpoint can't overwrite the stopgap name, even though we want it to. It seems like this problem should've existed before we started developing the local metadata feature, so it would be useful to know why we didn't notice it before. This is hard to investigate, though, because the OEIS search format change prevents the old code from working as-is. * Strip sequence name in `get_oeis_name_and_values` Also, add a test that would've caught this bug. * Add `ripgrep` as a dependency * fix: Restore search_oeis endpoint, at least somewhat --------- Co-authored-by: Aaron Fenyes <aaron.fenyes@fareycircles.ooo> Co-authored-by: Glen Whitney <glen@studioinfinity.org>
numberscope · Feb 10, 2025 · 535dc52 · 535dc52
1 parent 3162931
commit 535dc52
Show file tree

Hide file tree

Showing 9 changed files with 119 additions and 77 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,9 @@ node_modules
 /dist
 migrations
 
+# OEIS metadata
+oeisdata
+
 # Python specific
 __pycache__/
 *.py[cod]

diff --git a/README.md b/README.md
@@ -22,9 +22,12 @@ disruptive and wastefully repetitive if performed in each visitor's browser.
 [installing backscope on Ubuntu](doc/install-ubuntu.md), which can perhaps
 also be tailored to other Linux distributions or other operating systems.)
 
-1. Install Git if need be and clone this repo from
-   `github.com/numberscope/backscope`. Switch to the top-level directory
+1. Install Git, if need be, and clone this repo by calling
+   `git clone https://github.com/numberscope/backscope`. Switch to the top-level directory
    of the clone.
+2. Nest the OEIS metadata repository within this one by calling `git clone https://github.com/oeis/oeisdata`.
+   + As of 2025, the metadata repository contains about 0.6&nbsp;GB of current data and 0.5&nbsp;GB of Git history, for a total of 1.1&nbsp;GB.
+   + To do Git operations on the nested metadata repository, call Git from anywhere inside the `oeisdata` directory.
 2. Install prerequisites:
    + Python 3 (>= version 3.9)
    + The Python 3 dev package

diff --git a/doc/install-ubuntu.md b/doc/install-ubuntu.md
@@ -54,6 +54,12 @@ cd backscope
 All later commands in the installation sequence assume that you are in this
 directory.
 
+### Clone the OEIS metadata
+
+Nest the OEIS metadata repository within the backscope repository by calling `git clone https://github.com/oeis/oeisdata`. As of 2025, the metadata repository contains about 0.6&nbsp;GB of current data and 0.5&nbsp;GB of Git history, for a total of 1.1&nbsp;GB.
+
+The `oeisdata` directory is already listed in backscope's `.gitignore`, so the metadata files won't clutter your view when you call Git from outside `oeisdata`. To do Git operations on the nested metadata repository, call Git from anywhere inside `oeisdata`.
+
 ### Install pari-gp, required for cypari2
 
 This is the actual PARI/GP package. You need to have a full installation,

diff --git a/flaskr/nscope/test/test_get_oeis_values.py b/flaskr/nscope/test/test_get_oeis_values.py
@@ -11,7 +11,7 @@ class TestGetOEISValuesWithoutShift(abstract_endpoint_test.AbstractEndpointTest)
   #   background work triggered by the request
   expected_response = {
     'id': 'A153080',
-    'name': 'A153080 [name not yet loaded]',
+    'name': 'a(n) = 13*n + 2.',
     'values': {
       '0': '2',
       '1': '15',
@@ -38,7 +38,7 @@ class TestGetOEISValues(abstract_endpoint_test.AbstractEndpointTest):
   #   background work triggered by the request
   expected_response = {
     'id': 'A321580',
-    'name': 'A321580 [name not yet loaded]',
+    'name': 'Numbers k such that it is possible to reverse a deck of k cards by a sequence of perfect Faro shuffles with cut.',
     'values': {
       '1': '1',
       '2': '2',
@@ -69,7 +69,7 @@ class TestGetOEISValuesNegativeShift(abstract_endpoint_test.AbstractEndpointTest
   #   such a comment works
   expected_response = {
     'id': 'A078302',
-    'name': ' A078302 (b-file synthesized from sequence entry)',
+    'name': 'Decimal expansion of Planck time (in seconds).',
     'values': {
       '-43': '5',
       '-42': '3',
@@ -78,5 +78,39 @@ class TestGetOEISValuesNegativeShift(abstract_endpoint_test.AbstractEndpointTest
     }
   }
 
+class TestGetOEISNameAndValues(abstract_endpoint_test.AbstractEndpointTest):
+  endpoint = 'http://localhost:5000/api/get_oeis_name_and_values/A178600'
+
+  # we choose A178600 because:
+  # - it only has fifteen entries, so we can hard-code all of them into the
+  #   test. since it's a finite sequence, we don't have to worry about more
+  #   values being added
+  # - it has zero shift, so the test can pass even if the shift defaults to zero
+  # - it has small values and few references, which speeds up the background
+  #   work triggered by the request
+  # sequence A070178 ("coefficients of Lehmer's polynomial") would be an
+  # equally good choice
+  expected_response = {
+    'id': 'A178600',
+    'name': 'Expansion of the polynomial (1+x^3)*(1+x^11).',
+    'values': {
+      '0': '1',
+      '1': '0',
+      '2': '0',
+      '3': '1',
+      '4': '0',
+      '5': '0',
+      '6': '0',
+      '7': '0',
+      '8': '0',
+      '9': '0',
+      '10': '0',
+      '11': '1',
+      '12': '0',
+      '13': '0',
+      '14': '1'
+    }
+  }
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/flaskr/nscope/test/test_search_oeis.py b/flaskr/nscope/test/test_search_oeis.py
@@ -2,6 +2,7 @@
 import flaskr.nscope.test.abstract_endpoint_test as abstract_endpoint_test
 
 
+@unittest.skip("The OEIS search endpoint needs to be updated to handle the new result format")
 class TestSearchOEIS(abstract_endpoint_test.AbstractEndpointTest):
   endpoint = 'http://localhost:5000/api/search_oeis/germain'
 
@@ -30,5 +31,6 @@ class TestSearchOEIS(abstract_endpoint_test.AbstractEndpointTest):
     ]
   }
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/flaskr/nscope/test/test_wrong_response_type.py b/flaskr/nscope/test/test_wrong_response_type.py
@@ -1,5 +1,6 @@
 import flaskr.nscope.test.abstract_mock_oeis_test as abstract_mock_oeis_test
 
+
 class TestUnavailableValues(abstract_mock_oeis_test.AbstractMockOEISTest):
   values_available = False
   endpoint = 'http://localhost:5000/api/get_oeis_values/A153080/12'
@@ -9,15 +10,4 @@ class TestUnavailableValues(abstract_mock_oeis_test.AbstractMockOEISTest):
 class TestUnavailableValuesInNameAndValues(abstract_mock_oeis_test.AbstractMockOEISTest):
   values_available = False
   endpoint = 'http://localhost:5000/api/get_oeis_name_and_values/A153080'
-  expected_response = 'Error: 503 Server Error: SERVICE UNAVAILABLE for url: http://localhost:5001/A153080/b153080.txt'
-
-
-class TestUnavailableSearchInNameAndValues(abstract_mock_oeis_test.AbstractMockOEISTest):
-  search_available = False
-  endpoint = 'http://localhost:5000/api/get_oeis_name_and_values/A153080'
-  expected_response = 'Error: 503 Server Error: SERVICE UNAVAILABLE for url: http://localhost:5001/search?q=id%3AA153080&fmt=json'
-
-class TestUnavailableSearch(abstract_mock_oeis_test.AbstractMockOEISTest):
-  search_available = False
-  endpoint = 'http://localhost:5000/api/get_oeis_metadata/A153080'
-  expected_response = 'Error: 503 Server Error: SERVICE UNAVAILABLE for url: http://localhost:5001/search?q=A153080&fmt=json'
+  expected_response = 'Error: 503 Server Error: SERVICE UNAVAILABLE for url: http://localhost:5001/A153080/b153080.txt'
diff --git a/flaskr/nscope/views.py b/flaskr/nscope/views.py
@@ -6,11 +6,12 @@
 import base64 # for encoding response dumps
 from flask import Blueprint, jsonify, current_app, render_template
 from flask_executor import Executor
+import os
 import re
 import requests
 from requests_toolbelt.utils import dump
 import structlog
-from subprocess import check_output, TimeoutExpired
+import subprocess
 from sympy import factorint
 from tempfile import NamedTemporaryFile
 import time
@@ -109,10 +110,10 @@ def oeis_get(path='', params=None, json=True, timeout=4):
 def fetch_metadata(oeis_id):
     """ When called with a *valid* oeis id, makes sure the metadata has been
         obtained, and returns the corresponding Sequence object with valid
-        metadata.
-
-        Note that this also crawls all backreferences, so it can take quite
-        a long time for popular sequences (potentially hours).
+        metadata. The sequence's name and raw references should be written to
+        the database almost immediately, but the function won't return until
+        we also finish searching for backreferences, which can take several
+        seconds.
     """
     seq = find_oeis_sequence(oeis_id)
     if seq.backrefs is not None:
@@ -150,46 +151,50 @@ def fetch_metadata(oeis_id):
     seq.meta_req_time = our_req_time
     db.session.commit()
 
-    # Try to grab the metadata
-    search_params = {'q': seq.id, 'fmt': 'json'}
-    search_response = oeis_get('/search', search_params)
-    if isinstance(search_response, Exception):
-        return search_response
-    if search_response['results'] != None:
-        # We found some metadata. Write down the reference count, so later
-        # threads can decide how long to wait for us
-        ref_count = search_response['count']
-        seq.ref_count = ref_count
-        db.session.commit()
+    # Look up our sequence's name and raw references. The metadata file we're
+    # parsing is written in the OEIS internal format, which is specified here:
+    #
+    #   https://oeis.org/eishelp1.html
+    #
+    short_id = seq.id[:4]
+    oeis_data_path = os.path.join('oeisdata', 'seq')
+    seq_file_path = os.path.join(oeis_data_path, short_id, seq.id + '.seq')
+    prefix_len = 4 + len(seq.id)
+    seq.raw_refs = ''
+    for line in open(seq_file_path):
+        if line.startswith('%N'):
+            # the OEIS internal format specification says that only one %N line
+            # can appear, so the line we just found must be the whole name
+            seq.name = line[prefix_len:]
+        elif line.startswith('%Y'):
+            seq.raw_refs += line[prefix_len:]
+    seq.name = seq.name.strip()
+    seq.raw_refs = seq.raw_refs.strip()
+    db.session.commit()
 
-        backrefs = []
-        target_number = int(seq.id[1:])
-        saw = 0
-        while (saw < ref_count):
-            for result in search_response['results']:
-                if result['number'] == target_number:
-                    # Write the sequence's name and raw references as soon as we
-                    # find them
-                    if seq.raw_refs is None:
-                        seq.name = result['name']
-                        seq.raw_refs = "\n".join(result.get('xref', []))
-                        db.session.commit()
-                else:
-                    backrefs.append(oeis_a_id(result['number']))
-                saw += 1
-            if saw < ref_count:
-                search_params['start'] = saw
-                search_response = oeis_get('/search', search_params)
-                if isinstance(search_response, Exception):
-                    return search_response
-                if search_response['results'] == None:
-                    break
-        seq.backrefs = backrefs
+    # Find all the other sequences whose metadata mentions our sequence
+    backref_search = subprocess.run(
+        ['rg', seq.id, '--glob', f'!{seq.id}.seq', '--files-with-matches'],
+        cwd=oeis_data_path,
+        capture_output=True,
+        encoding='utf8'
+    )
+    if backref_search.returncode:
+        # ripgrep returned a nonzero status: 1 means no matches; 2 (an error) is treated the same
+        seq.backrefs = []
     else:
-        # We didn't find any metadata
-        seq.ref_count = 0
-
-    # We write what we've found to the database in the following situations:
+        # ripgrep returned status code 0, which means its output lists all the
+        # sequence files that mention our sequence
+        seq.backrefs = list(map(
+            lambda name: os.path.splitext(os.path.basename(name))[0],
+            backref_search.stdout.strip().split('\n')
+        ))
+
+    # Count the references to our sequence, including the sequence itself
+    seq.ref_count = len(seq.backrefs) + 1
+
+    # We write the backreferences we've found to the database in the following
+    # situations:
     #
     # - No more recent thread has set out to fetch the same metadata
     #
@@ -259,15 +264,9 @@ def fetch_values(oeis_id):
     # Parse the b-file:
     first = float('inf')
     last = float('-inf')
-    name = ''
     seq_vals = {}
     for line in b_text.split("\n"):
-        if not line: continue
-        if line[0] == '#':
-            # Some sequences have info in first comment that we can use as a
-            # stopgap until the real name is obtained.
-            if not name: name = line[1:]
-            continue
+        if not line or line[0] == '#': continue
         column = line.split()
         if len(column) < 2: continue
         if not (column[0][0].isdigit() or column[0][0] == '-'):
@@ -281,7 +280,7 @@ def fetch_values(oeis_id):
         return IndexError(f"No terms found for ID '{oeis_id}'.")
     seq.values = [seq_vals[i] for i in range(first,last+1)]
     if not seq.name:
-        seq.name = name or placeholder_name(oeis_id)
+        seq.name = placeholder_name(oeis_id)
     seq.shift = first
     db.session.commit()
     return seq
@@ -352,10 +351,10 @@ def fetch_factors(oeis_id, num_elements = -1, timeout = 10):
             temp.write("\\q\n")
             temp.close()
             try:
-                results = check_output(
+                results = subprocess.check_output(
                     ['gp', '-q', '-s', '256000000', tempname],
                     timeout=timeout)
-            except TimeoutExpired as te:
+            except subprocess.TimeoutExpired as te:
                 results = te.output
         if results:
             lines = results.decode('utf-8').split("\n")[0:-1]
@@ -446,12 +445,15 @@ def get_oeis_name_and_values(oeis_id):
     # Now get the name
     seq = find_oeis_sequence(valid_oeis_id)
     if not seq.name or seq.name == placeholder_name(oeis_id):
-        search_response = oeis_get('/search', {'q': f'id:{oeis_id}', 'fmt': 'json'})
-        if isinstance(search_response, Exception):
-            return f"Error: {search_response}"
-        if search_response['results'] != None:
-            seq.name = search_response['results'][0]['name']
-            db.session.commit()
+        seq_file_path = os.path.join('oeisdata', 'seq', seq.id[:4], seq.id + '.seq')
+        for line in open(seq_file_path):
+            if line.startswith('%N'):
+                # the OEIS internal format specification says that only one %N line
+                # can appear, so the line we just found must be the whole name
+                prefix_len = 4 + len(seq.id)
+                seq.name = line[prefix_len:].strip()
+                db.session.commit()
+                break
     executor.submit(fetch_factors, valid_oeis_id, timeout=1000)
     return jsonify({'id': seq.id, 'name': seq.name, 'values': vals})
 
@@ -519,7 +521,7 @@ def search_oeis(search_term):
         else:
             ids = []
             names = []
-            resultList = search_response['results']
+            resultList = search_response
             if resultList is None:
                 resultList = []
             for result in resultList:

diff --git a/requirements-freeze.txt b/requirements-freeze.txt
@@ -20,6 +20,7 @@ psycopg2-binary==2.9.9
 python-dotenv==1.0.1
 requests==2.32.3
 requests-toolbelt==1.0.0
+ripgrep==14.1.0
 setuptools==70.0.0
 SQLAlchemy==2.0.31
 structlog==24.4.0

diff --git a/requirements.txt b/requirements.txt
@@ -7,5 +7,6 @@ psycopg2-binary # library for interacting with PostgreSQL
 python-dotenv # library for .env files
 requests # library for making HTTP requests
 requests-toolbelt # we use this for logging HTTP requests
+ripgrep # used for searching the local metadata
 structlog # package for structured logging
 sympy # used for factoring