Skip to content

Commit

Permalink
[u r] Eliminate RepositoryPlugin.list_partitions (#6531)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Sep 26, 2024
1 parent 0959f94 commit 77afdb0
Show file tree
Hide file tree
Showing 21 changed files with 2,695 additions and 2,797 deletions.
11 changes: 8 additions & 3 deletions OPERATOR.rst
Original file line number Diff line number Diff line change
Expand Up @@ -440,9 +440,14 @@ Adding snapshots to ``dev``
^^^^^^^^^^^^^^^^^^^^^^^^^^^

When adding a new snapshot to ``dev`` or ``anvildev``, the operator should also
add the snapshot to ``sandbox`` or ``anvilbox``, respectively. To determine the
subgraph counts and any explicit prefixes that are needed, use the
``update_subgraph_counts.py`` script.
add the snapshot to ``sandbox`` or ``anvilbox``, respectively.

The ``post_deploy_tdr.py`` script will fail if the computed common prefix
contains an unacceptable number of subgraphs. If the script reports that the
common prefix is too long, truncate it by 1 character. If it's too short, append
1 arbitrary hexadecimal character. Pass the updated prefix as a keyword argument
to the ``mksrc`` function for the affected source(s), including a partition
prefix length of 1. Then refresh the environment and re-attempt the deployment.

Adding snapshots to ``prod``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
11 changes: 11 additions & 0 deletions UPGRADING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@ branch that does not have the listed changes, the steps would need to be
reverted. This is all fairly informal and loosely defined. Hopefully we won't
have too many entries in this file.

#6531 Eliminate RepositoryPlugin.list_partitions
================================================

The subgraph counts of indexed sources are no longer tracked in the source tree.
For each of your personal deployments, in ``environment.py``: update the
``mksrc`` function, remove the ``subgraphs`` parameter from all of its call
sites, update the ``prefix`` parameter where it is passed, and remove any functions
used to construct prefixes, e.g. ``common_prefix()``. Be careful to preserve any
flags such as ``ma`` or ``pop``. As always, use the sandbox deployment's
``environment.py`` as a model when upgrading personal deployments.


#6570 Upgrade dependencies 2024-09-16
=====================================
Expand Down
File renamed without changes.
28 changes: 5 additions & 23 deletions deployments/anvilbox/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,44 +9,26 @@

is_sandbox = True


def common_prefix(n: int) -> str:
"""
For a given number of subgraphs, return a common prefix that yields around
16 subgraphs.
>>> [common_prefix(n) for n in (0, 1, 31, 32, 33, 512+15, 512+16, 512+17)]
['', '', '', '', '1', 'f', '01', '11']
"""
hex_digits = '0123456789abcdef'
m = len(hex_digits)
# Double threshold to lower probability that no subgraphs match the prefix
return hex_digits[n % m] + common_prefix(n // m) if n > 2 * m else ''


ma = 1 # managed access
pop = 2 # remove snapshot


def mksrc(source_type: Literal['bigquery', 'parquet'],
google_project,
snapshot,
subgraphs,
flags: int = 0,
/,
prefix: Optional[str] = None
prefix: str = ''
) -> tuple[str, str | None]:
project = '_'.join(snapshot.split('_')[1:-3])
assert flags <= ma | pop
if prefix is None:
prefix = common_prefix(subgraphs)
source = None if flags & pop else ':'.join([
'tdr',
source_type,
'gcp',
google_project,
snapshot,
prefix + '/0'
prefix
])
return project, source

Expand All @@ -73,9 +55,9 @@ def mkdict(previous_catalog: dict[str, str],


anvil_sources = mkdict({}, 3, mkdelta([
mksrc('bigquery', 'datarepo-dev-e53e74aa', 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732', 6804),
mksrc('bigquery', 'datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520', 28),
mksrc('bigquery', 'datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509', 25)
mksrc('bigquery', 'datarepo-dev-e53e74aa', 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732'),
mksrc('bigquery', 'datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520'),
mksrc('bigquery', 'datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509')
]))


Expand Down
25 changes: 7 additions & 18 deletions deployments/anvildev/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,16 @@
Optional,
)


def partition_prefix_length(n: int) -> int:
    """
    Compute, from a number of subgraphs, a partition prefix length under
    which a partition is expected to rarely hold more than 512 subgraphs.

    The length is the number of times the count has to be integer-divided by
    16 before it no longer exceeds 512.

    >>> [partition_prefix_length(n) for n in (0, 1, 512, 513, 16 * 512, 16 * 513 )]
    [0, 0, 0, 1, 1, 2]
    """
    length = 0
    remaining = n
    # Each additional prefix character splits a partition 16 ways
    while remaining > 512:
        remaining //= 16
        length += 1
    return length


ma = 1 # managed access
pop = 2 # remove snapshot


def mksrc(source_type: Literal['bigquery', 'parquet'],
google_project,
snapshot,
subgraphs,
flags: int = 0
flags: int = 0,
/,
prefix: str = ''
) -> tuple[str, str | None]:
project = '_'.join(snapshot.split('_')[1:-3])
assert flags <= ma | pop
Expand All @@ -37,7 +26,7 @@ def mksrc(source_type: Literal['bigquery', 'parquet'],
'gcp',
google_project,
snapshot,
'/' + str(partition_prefix_length(subgraphs))
prefix
])
return project, source

Expand All @@ -64,9 +53,9 @@ def mkdict(previous_catalog: dict[str, str],


anvil_sources = mkdict({}, 3, mkdelta([
mksrc('bigquery', 'datarepo-dev-e53e74aa', 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732', 6804),
mksrc('bigquery', 'datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520', 28),
mksrc('bigquery', 'datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509', 25)
mksrc('bigquery', 'datarepo-dev-e53e74aa', 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732'),
mksrc('bigquery', 'datarepo-dev-42c70e6a', 'ANVIL_CCDG_Sample_1_20230228_ANV5_202302281520'),
mksrc('bigquery', 'datarepo-dev-97ad270b', 'ANVIL_CMG_Sample_1_20230225_ANV5_202302281509')
]))


Expand Down
Loading

0 comments on commit 77afdb0

Please sign in to comment.