[u r] Eliminate RepositoryPlugin.list_partitions (#6531)

DataBiosphere · Sep 26, 2024 · 77afdb0 · 77afdb0
1 parent 0959f94
commit 77afdb0
Show file tree

Hide file tree

Showing 21 changed files with 2,695 additions and 2,797 deletions.
diff --git a/OPERATOR.rst b/OPERATOR.rst
@@ -440,9 +440,14 @@ Adding snapshots to ``dev``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 When adding a new snapshot to ``dev`` or ``anvildev``, the operator should also
-add the snapshot to ``sandbox`` or ``anvilbox``, respectively. To determine the
-subgraph counts and any explicit prefixes that are needed, use the
-``update_subgraph_counts.py`` script.
+add the snapshot to ``sandbox`` or ``anvilbox``, respectively.
+
+The ``post_deploy_tdr.py`` script will fail if the computed common prefix
+contains an unacceptable number of subgraphs. If the script reports that the
+common prefix is too long, truncate it by 1 character. If it's too short, append
+1 arbitrary hexadecimal character. Pass the updated prefix as a keyword argument
+to the ``mksrc`` function for the affected source(s), including a partition
+prefix length of 1. Then refresh the environment and re-attempt the deployment.
 
 Adding snapshots to ``prod``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/UPGRADING.rst b/UPGRADING.rst
@@ -19,6 +19,17 @@ branch that does not have the listed changes, the steps would need to be
 reverted. This is all fairly informal and loosely defined. Hopefully we won't
 have too many entries in this file.
 
+#6531 Eliminate RepositoryPlugin.list_partitions
+================================================
+
+The subgraph counts of indexed sources are no longer tracked in the source tree.
+For each of your personal deployments, in ``environment.py``: update the
+``mksrc`` function, remove the ``subgraphs`` parameter from all of its call
+sites, update the ``prefix`` parameter where it is passed, and remove any functions
+used to construct prefixes, e.g. ``common_prefix()``. Be careful to preserve any
+flags such as ``ma`` or ``pop``. As always, use the sandbox deployment's
+``environment.py`` as a model when upgrading personal deployments.
+
 
 #6570 Upgrade dependencies 2024-09-16
 =====================================

diff --git a/scripts/update_subgraph_counts.py → attic/scripts/update_subgraph_counts.py b/scripts/update_subgraph_counts.py → attic/scripts/update_subgraph_counts.py
diff --git a/deployments/anvilbox/environment.py b/deployments/anvilbox/environment.py
@@ -9,44 +9,26 @@
 
 is_sandbox = True
 
-
-def common_prefix(n: int) -> str:
-    """
-    For a given number of subgraphs, return a common prefix that yields around
-    16 subgraphs.
-
-    >>> [common_prefix(n) for n in (0, 1, 31, 32, 33, 512+15, 512+16, 512+17)]
-    ['', '', '', '', '1', 'f', '01', '11']
-    """
-    hex_digits = '0123456789abcdef'
-    m = len(hex_digits)
-    # Double threshold to lower probability that no subgraphs match the prefix
-    return hex_digits[n % m] + common_prefix(n // m) if n > 2 * m else ''
-
-
 ma = 1  # managed access
 pop = 2  # remove snapshot
 
 
 def mksrc(source_type: Literal['bigquery', 'parquet'],
           google_project,
           snapshot,
-          subgraphs,
           flags: int = 0,
           /,
-          prefix: Optional[str] = None
+          prefix: str = ''
           ) -> tuple[str, str | None]:
     project = '_'.join(snapshot.split('_')[1:-3])
     assert flags <= ma | pop
-    if prefix is None:
-        prefix = common_prefix(subgraphs)
     source = None if flags & pop else ':'.join([
         'tdr',
         source_type,
         'gcp',
         google_project,
         snapshot,
-        prefix + '/0'
+        prefix
     ])
     return project, source
 
@@ -73,9 +55,9 @@ def mkdict(previous_catalog: dict[str, str],
 
 
 anvil_sources = mkdict({}, 3, mkdelta([
-    mksrc('bigquery', 'datarepo-dev-e53e74aa', 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732', 6804),
-    mksrc('bigquery', 'datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520', 28),
-    mksrc('bigquery', 'datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509', 25)
+    mksrc('bigquery', 'datarepo-dev-e53e74aa', 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732'),
+    mksrc('bigquery', 'datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520'),
+    mksrc('bigquery', 'datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509')
 ]))
 
 

diff --git a/deployments/anvildev/environment.py b/deployments/anvildev/environment.py
@@ -7,27 +7,16 @@
     Optional,
 )
 
-
-def partition_prefix_length(n: int) -> int:
-    """
-    For a given number of subgraphs, return a partition prefix length that is
-    expected to rarely exceed 512 subgraphs per partition.
-
-    >>> [partition_prefix_length(n) for n in (0, 1, 512, 513, 16 * 512, 16 * 513 )]
-    [0, 0, 0, 1, 1, 2]
-    """
-    return 1 + partition_prefix_length(n // 16) if n > 512 else 0
-
-
 ma = 1  # managed access
 pop = 2  # remove snapshot
 
 
 def mksrc(source_type: Literal['bigquery', 'parquet'],
           google_project,
           snapshot,
-          subgraphs,
-          flags: int = 0
+          flags: int = 0,
+          /,
+          prefix: str = ''
           ) -> tuple[str, str | None]:
     project = '_'.join(snapshot.split('_')[1:-3])
     assert flags <= ma | pop
@@ -37,7 +26,7 @@ def mksrc(source_type: Literal['bigquery', 'parquet'],
         'gcp',
         google_project,
         snapshot,
-        '/' + str(partition_prefix_length(subgraphs))
+        prefix
     ])
     return project, source
 
@@ -64,9 +53,9 @@ def mkdict(previous_catalog: dict[str, str],
 
 
 anvil_sources = mkdict({}, 3, mkdelta([
-    mksrc('bigquery', 'datarepo-dev-e53e74aa', 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732', 6804),
-    mksrc('bigquery', 'datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520', 28),
-    mksrc('bigquery', 'datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509', 25)
+    mksrc('bigquery', 'datarepo-dev-e53e74aa', 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732'),
+    mksrc('bigquery', 'datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520'),
+    mksrc('bigquery', 'datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509')
 ]))