diff --git a/.gitignore b/.gitignore index d12b6a4c27d..bdec059923d 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,8 @@ GTAGS *.dylib */hail.jar infra/.terraform.lock.hcl +hail/python/hail/docs/experimental/hail.experimental.DB.rst +hail/python/hailtop/batch/docs/api/ +web_common/web_common/static/css/ +website/docs.tar.gz +website/website/static/css/ diff --git a/auth/auth/auth.py b/auth/auth/auth.py index 90c8c8c0323..1ab47f4a995 100644 --- a/auth/auth/auth.py +++ b/auth/auth/auth.py @@ -8,7 +8,6 @@ import google.auth.transport.requests import google.oauth2.id_token import google_auth_oauthlib.flow -from hailtop.auth import async_get_userinfo from hailtop.config import get_deploy_config from hailtop.tls import internal_server_ssl_context from hailtop.hail_logging import AccessLogger @@ -526,18 +525,7 @@ async def rest_logout(request, userdata): return web.Response(status=200) -@routes.get('/api/v1alpha/userinfo') -async def userinfo(request): - if 'Authorization' not in request.headers: - log.info('Authorization not in request.headers') - raise web.HTTPUnauthorized() - - auth_header = request.headers['Authorization'] - session_id = maybe_parse_bearer_header(auth_header) - if not session_id: - log.info('Bearer not in Authorization header') - raise web.HTTPUnauthorized() - +async def get_userinfo(request, session_id): # b64 encoding of 32-byte session ID is 44 bytes if len(session_id) != 44: log.info('Session id != 44 bytes') @@ -554,18 +542,41 @@ async def userinfo(request): if len(users) != 1: log.info(f'Unknown session id: {session_id}') raise web.HTTPUnauthorized() - user = users[0] + return users[0] + + +@routes.get('/api/v1alpha/userinfo') +async def userinfo(request): + if 'Authorization' not in request.headers: + log.info('Authorization not in request.headers') + raise web.HTTPUnauthorized() + + auth_header = request.headers['Authorization'] + session_id = maybe_parse_bearer_header(auth_header) + if not session_id: + log.info('Bearer not in Authorization header') + raise web.HTTPUnauthorized() + + return web.json_response(await get_userinfo(request, session_id)) + - return web.json_response(user) +async def get_session_id(request): + if 'X-Hail-Internal-Authorization' in request.headers: + return maybe_parse_bearer_header(request.headers['X-Hail-Internal-Authorization']) + + if 'Authorization' in request.headers: + return maybe_parse_bearer_header(request.headers['Authorization']) + + session = await aiohttp_session.get_session(request) + return session.get('session_id') @routes.get('/api/v1alpha/verify_dev_credentials') async def verify_dev_credentials(request): - session = await aiohttp_session.get_session(request) - session_id = session.get('session_id') + session_id = await get_session_id(request) if not session_id: raise web.HTTPUnauthorized() - userdata = await async_get_userinfo(session_id=session_id) + userdata = await get_userinfo(request, session_id) is_developer = userdata is not None and userdata['is_developer'] == 1 if not is_developer: raise web.HTTPUnauthorized() diff --git a/batch/Dockerfile.worker b/batch/Dockerfile.worker index 2ef29ed6aed..5680613518c 100644 --- a/batch/Dockerfile.worker +++ b/batch/Dockerfile.worker @@ -10,7 +10,7 @@ RUN hail-apt-get-install \ COPY docker/hail-ubuntu/pip.conf /root/.config/pip/pip.conf COPY docker/hail-ubuntu/hail-pip-install /bin/hail-pip-install COPY docker/requirements.txt . 
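The auth.py change above factors the old userinfo endpoint into two reusable pieces: `get_userinfo`, which validates a session ID against the database, and `get_session_id`, which pulls credentials out of a request, preferring the internal service header over the standard one and falling back to the session cookie. A minimal sketch of that precedence, using a hypothetical `extract_session_id` that takes a plain header dict and a stubbed `maybe_parse_bearer_header` (the real helper comes from the services' shared code; nothing below is part of the diff itself):

```python
def maybe_parse_bearer_header(value):
    # Stub of the shared helper: accept 'Bearer <token>' and return the token.
    prefix = 'Bearer '
    return value[len(prefix):] if value.startswith(prefix) else None


def extract_session_id(headers):
    # Same precedence as get_session_id in the diff: the internal service
    # header wins, then the standard Authorization header. (The real function
    # also falls back to the aiohttp session cookie, omitted here.)
    if 'X-Hail-Internal-Authorization' in headers:
        return maybe_parse_bearer_header(headers['X-Hail-Internal-Authorization'])
    if 'Authorization' in headers:
        return maybe_parse_bearer_header(headers['Authorization'])
    return None


assert extract_session_id({'Authorization': 'Bearer abc'}) == 'abc'
assert extract_session_id({'X-Hail-Internal-Authorization': 'Bearer internal',
                           'Authorization': 'Bearer abc'}) == 'internal'
assert extract_session_id({}) is None
```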
-RUN hail-pip-install -r requirements.txt pyspark==2.4.0 +RUN hail-pip-install -r requirements.txt pyspark==3.1.1 ENV SPARK_HOME /usr/local/lib/python3.7/site-packages/pyspark ENV PATH "$PATH:$SPARK_HOME/sbin:$SPARK_HOME/bin" diff --git a/batch/batch/batch.py b/batch/batch/batch.py index 0a1201070de..db0eb257166 100644 --- a/batch/batch/batch.py +++ b/batch/batch/batch.py @@ -41,6 +41,7 @@ def _time_msecs_str(t): d = { 'id': record['id'], + 'user': record['user'], 'billing_project': record['billing_project'], 'token': record['token'], 'state': state, @@ -85,6 +86,8 @@ def job_record_to_dict(record, name): 'batch_id': record['batch_id'], 'job_id': record['job_id'], 'name': name, + 'user': record['user'], + 'billing_project': record['billing_project'], 'state': record['state'], 'exit_code': exit_code, 'duration': duration diff --git a/batch/batch/front_end/front_end.py b/batch/batch/front_end/front_end.py index f3202fa158b..a8721383aa7 100644 --- a/batch/batch/front_end/front_end.py +++ b/batch/batch/front_end/front_end.py @@ -222,7 +222,8 @@ async def _query_batch_jobs(request, batch_id): where_args.extend(args) sql = f''' -SELECT jobs.*, batches.format_version, job_attributes.value AS name, SUM(`usage` * rate) AS cost +SELECT jobs.*, batches.user, batches.billing_project, batches.format_version, + job_attributes.value AS name, SUM(`usage` * rate) AS cost FROM jobs INNER JOIN batches ON jobs.batch_id = batches.id LEFT JOIN job_attributes @@ -1150,7 +1151,7 @@ async def _get_job(app, batch_id, job_id): db: Database = app['db'] record = await db.select_and_fetchone(''' -SELECT jobs.*, ip_address, format_version, SUM(`usage` * rate) AS cost +SELECT jobs.*, user, billing_project, ip_address, format_version, SUM(`usage` * rate) AS cost FROM jobs INNER JOIN batches ON jobs.batch_id = batches.id @@ -1252,28 +1253,31 @@ async def ui_get_job(request, userdata, batch_id): app = request.app job_id = int(request.match_info['job_id']) - job_status, attempts, job_log = await asyncio.gather(_get_job(app, batch_id, job_id), - _get_attempts(app, batch_id, job_id), - _get_job_log(app, batch_id, job_id)) + job, attempts, job_log = await asyncio.gather(_get_job(app, batch_id, job_id), + _get_attempts(app, batch_id, job_id), + _get_job_log(app, batch_id, job_id)) - job_status_status = job_status['status'] + job['duration'] = humanize_timedelta_msecs(job['duration']) + job['cost'] = cost_str(job['cost']) + + job_status = job['status'] container_status_spec = dictfix.NoneOr({ 'name': str, 'timing': {'pulling': dictfix.NoneOr({'duration': dictfix.NoneOr(Number)}), 'running': dictfix.NoneOr({'duration': dictfix.NoneOr(Number)})}, 'container_status': {'out_of_memory': False}, 'state': str}) - job_status_status_spec = { + job_status_spec = { 'container_statuses': {'input': container_status_spec, 'main': container_status_spec, 'output': container_status_spec}} - job_status_status = dictfix.dictfix(job_status_status, job_status_status_spec) - container_statuses = job_status_status['container_statuses'] + job_status = dictfix.dictfix(job_status, job_status_spec) + container_statuses = job_status['container_statuses'] step_statuses = [container_statuses['input'], container_statuses['main'], container_statuses['output']] - job_specification = job_status['spec'] + job_specification = job['spec'] if 'process' in job_specification: process_specification = job_specification['process'] process_type = process_specification['type'] @@ -1289,11 +1293,12 @@ async def ui_get_job(request, userdata, batch_id): page_context = { 
'batch_id': batch_id, 'job_id': job_id, + 'job': job, 'job_log': job_log, 'attempts': attempts, 'step_statuses': step_statuses, 'job_specification': job_specification, - 'job_status_str': json.dumps(job_status, indent=2) + 'job_status_str': json.dumps(job, indent=2) } return await render_template('batch', request, userdata, 'job.html', page_context) diff --git a/batch/batch/front_end/templates/batch.html b/batch/batch/front_end/templates/batch.html index 2bd63c82ef7..107061262ab 100644 --- a/batch/batch/front_end/templates/batch.html +++ b/batch/batch/front_end/templates/batch.html @@ -4,7 +4,28 @@ {% endblock %} {% block content %} +

 Batch {{ batch['id'] }}
+Properties
+[property table markup lost in extraction]
 Attributes
 {% if 'attributes' in batch %}
 {% for name, value in batch['attributes'].items() %}
 {{ name }}: {{ value }}
@@ -64,7 +85,7 @@
 Jobs
 {% for job in batch['jobs'] %}
-[old job link markup lost in extraction]
+[new job link markup lost in extraction]
 {{ job['job_id'] }}
diff --git a/batch/batch/front_end/templates/batches.html b/batch/batch/front_end/templates/batches.html
index ef5ee38890a..799f3463216 100644
--- a/batch/batch/front_end/templates/batches.html
+++ b/batch/batch/front_end/templates/batches.html
@@ -52,6 +52,8 @@
 Batches
 ID
+User
+Billing Project
 Name
 Submitted
 Completed
@@ -68,7 +70,11 @@
 Batches
 {% for batch in batches %}
-{{ batch['id'] }}
+{{ batch['id'] }}
+{{ batch['user'] }}
+{{ batch['billing_project'] }}
 {% if 'attributes' in batch and 'name' in batch['attributes'] and batch['attributes']['name'] is not none %}
 {{ batch['attributes']['name'] }}
diff --git a/batch/batch/front_end/templates/job.html b/batch/batch/front_end/templates/job.html
index 2d66b461303..f9e8ec79ddb 100644
--- a/batch/batch/front_end/templates/job.html
+++ b/batch/batch/front_end/templates/job.html
@@ -3,6 +3,18 @@
 {% block content %}
 Batch {{ batch_id }} Job {{ job_id }}
+Properties
+[property table markup lost in extraction]
 Attempts
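The HTML for these three template hunks did not survive extraction, so the exact markup is unknown. As a rough illustration of what one batches.html row renders with the new columns, here is a hypothetical Jinja2 snippet exercised from Python (Jinja2 is pinned in docker/requirements.txt in this same diff): only the expressions `{{ batch['id'] }}`, `{{ batch['user'] }}`, and `{{ batch['billing_project'] }}` come from the diff; the table markup and the link target are guesses.

```python
import jinja2

# Hypothetical reconstruction of one batches.html table row; the <tr>/<td>
# structure and the href are assumptions, not recovered from the diff.
row = jinja2.Template(
    "<tr>"
    "<td><a href=\"/batches/{{ batch['id'] }}\">{{ batch['id'] }}</a></td>"
    "<td>{{ batch['user'] }}</td>"
    "<td>{{ batch['billing_project'] }}</td>"
    "</tr>"
)

print(row.render(batch={'id': 7, 'user': 'test', 'billing_project': 'test'}))
```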

{% if attempts %} diff --git a/batch/test/test_dag.py b/batch/test/test_dag.py index 3adae54d1c6..42c8d286604 100644 --- a/batch/test/test_dag.py +++ b/batch/test/test_dag.py @@ -156,6 +156,7 @@ def test(): callback_body.pop('duration') assert (callback_body == { 'id': b.id, + 'user': 'test', 'billing_project': 'test', 'token': token, 'state': 'success', diff --git a/benchmark-service/Dockerfile.test b/benchmark-service/Dockerfile.test index 8629c8d2b96..f739af15f07 100644 --- a/benchmark-service/Dockerfile.test +++ b/benchmark-service/Dockerfile.test @@ -1,6 +1,3 @@ FROM {{ service_base_image.image }} COPY benchmark-service/test/ /test/ -RUN python3 -m pip install --no-cache-dir \ - pytest-instafail==0.4.1 \ - pytest-asyncio==0.10.0 diff --git a/benchmark-service/test/test_update_commits.py b/benchmark-service/test/test_update_commits.py index 5ab7ecb8dfe..39b904d95ae 100644 --- a/benchmark-service/test/test_update_commits.py +++ b/benchmark-service/test/test_update_commits.py @@ -9,14 +9,13 @@ from hailtop.httpx import client_session import hailtop.utils as utils -pytestmark = pytest.mark.asyncio - logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) sha = 'd626f793ad700c45a878d192652a0378818bbd8b' +@pytest.mark.asyncio async def test_update_commits(): deploy_config = get_deploy_config() headers = service_auth_headers(deploy_config, 'benchmark') diff --git a/build.yaml b/build.yaml index 43ecc016348..c884d9d830f 100644 --- a/build.yaml +++ b/build.yaml @@ -684,26 +684,6 @@ steps: to: /cluster-tests.tar.gz dependsOn: - hail_build_image - - kind: runImage - name: build_hail_spark3 - image: - valueFrom: hail_build_image.image - resources: - memory: "7.5G" - cpu: "4" - script: | - set -ex - cd / - rm -rf repo - mkdir repo - cd repo - {{ code.checkout_script }} - cd hail - time retry ./gradlew --version - export SPARK_VERSION="3.0.1" SCALA_VERSION="2.12.12" - time retry make jars python-version-info wheel - dependsOn: - - hail_build_image - kind: buildImage name: batch_worker_image dockerFile: batch/Dockerfile.worker @@ -2830,6 +2810,8 @@ steps: mkdir -p ./ci/test ./hail/python cp /repo/hail/ci/test/resources/build.yaml ./ cp -R /repo/hail/ci/test/resources ./ci/test/ + cp /repo/hail/tls/Dockerfile ./ci/test/resources/Dockerfile.certs + cp /repo/hail/tls/create_certs.py ./ci/test/resources/ cp /repo/hail/pylintrc ./ cp /repo/hail/setup.cfg ./ cp -R /repo/hail/docker ./ @@ -3289,7 +3271,7 @@ steps: script: | set -ex gcloud auth activate-service-account --key-file=/secrets/ci-deploy-0-1--hail-is-hail.json - SPARK_VERSION=2.4.5 + SPARK_VERSION=3.1.1 BRANCH=0.2 SHA="{{ code.sha }}" GS_JAR=gs://hail-common/builds/${BRANCH}/jars/hail-${BRANCH}-${SHA}-Spark-${SPARK_VERSION}.jar diff --git a/ci/Dockerfile.test b/ci/Dockerfile.test index 4f2dfbe8150..15ea0b73172 100644 --- a/ci/Dockerfile.test +++ b/ci/Dockerfile.test @@ -4,4 +4,3 @@ COPY hail/python/setup-hailtop.py /hailtop/setup.py COPY hail/python/hailtop /hailtop/hailtop/ RUN hail-pip-install /hailtop && rm -rf /hailtop COPY ci/test/ /test/ -RUN hail-pip-install pytest-instafail==0.4.1 pytest-asyncio==0.10.0 diff --git a/ci/test/resources/build.yaml b/ci/test/resources/build.yaml index 7078e450185..93bbee7ec08 100644 --- a/ci/test/resources/build.yaml +++ b/ci/test/resources/build.yaml @@ -52,6 +52,36 @@ steps: publishAs: service-base dependsOn: - base_image + - kind: buildImage + name: create_certs_image + dockerFile: ci/test/resources/Dockerfile.certs + contextPath: ci/test/resources + publishAs: 
test_hello_create_certs_image + dependsOn: + - service_base_image + - kind: runImage + name: create_certs + image: + valueFrom: create_certs_image.image + script: | + set -ex + python3 create_certs.py \ + {{ default_ns.name }} \ + config.yaml \ + /ssl-config-hail-root/hail-root-key.pem \ + /ssl-config-hail-root/hail-root-cert.pem + serviceAccount: + name: admin + namespace: + valueFrom: default_ns.name + secrets: + - name: ssl-config-hail-root + namespace: + valueFrom: default_ns.name + mountPath: /ssl-config-hail-root + dependsOn: + - default_ns + - create_certs_image - kind: buildImage name: hello_image dockerFile: ci/test/resources/Dockerfile diff --git a/ci/test/resources/config.yaml b/ci/test/resources/config.yaml new file mode 100644 index 00000000000..5a67fd4191a --- /dev/null +++ b/ci/test/resources/config.yaml @@ -0,0 +1,4 @@ +principals: +- name: hello + domain: hello + kind: json diff --git a/ci/test/resources/deployment.yaml b/ci/test/resources/deployment.yaml index 4ef6e318c76..95c7dedee2d 100644 --- a/ci/test/resources/deployment.yaml +++ b/ci/test/resources/deployment.yaml @@ -50,6 +50,9 @@ spec: - name: session-secret-key mountPath: /session-secret-key readOnly: true + - name: ssl-config + mountPath: /ssl-config + readOnly: true env: - name: HAIL_IP valueFrom: @@ -74,6 +77,10 @@ spec: secret: optional: false secretName: session-secret-key + - name: ssl-config + secret: + optional: false + secretName: ssl-config-hello --- apiVersion: v1 kind: Service @@ -83,8 +90,7 @@ metadata: app: hello spec: ports: - - name: http - port: 80 + - port: 443 protocol: TCP targetPort: 5000 selector: diff --git a/ci/test/resources/hello.py b/ci/test/resources/hello.py index 0e914e097e0..5bfc3fded0a 100644 --- a/ci/test/resources/hello.py +++ b/ci/test/resources/hello.py @@ -2,6 +2,7 @@ from aiohttp import web from hailtop.config import get_deploy_config +from hailtop.tls import internal_server_ssl_context from gear import setup_aiohttp_session @@ -14,15 +15,17 @@ @routes.get('/healthcheck') -async def get_healthcheck(request): # pylint: disable=W0613 +async def get_healthcheck(request): # pylint: disable=unused-argument return web.Response() @routes.get('/sha') -async def get_sha(request): +async def get_sha(request): # pylint: disable=unused-argument return web.Response(text=SHA) setup_aiohttp_session(app) app.add_routes(routes) -web.run_app(deploy_config.prefix_application(app, 'hello'), host='0.0.0.0', port=5000) +web.run_app( + deploy_config.prefix_application(app, 'hello'), host='0.0.0.0', port=5000, ssl_context=internal_server_ssl_context() +) diff --git a/ci/test/resources/statefulset.yaml b/ci/test/resources/statefulset.yaml index 0bdd9a37796..1b9b9a9f2f9 100644 --- a/ci/test/resources/statefulset.yaml +++ b/ci/test/resources/statefulset.yaml @@ -49,6 +49,9 @@ spec: - name: session-secret-key mountPath: /session-secret-key readOnly: true + - name: ssl-config + mountPath: /ssl-config + readOnly: true env: - name: HAIL_IP value: "{{ global.ip }}" @@ -67,6 +70,10 @@ spec: secret: optional: false secretName: session-secret-key + - name: ssl-config + secret: + optional: false + secretName: ssl-config-hello --- apiVersion: v1 kind: Service @@ -76,8 +83,7 @@ metadata: app: hello-stateful-set spec: ports: - - name: http - port: 80 + - port: 443 protocol: TCP targetPort: 5000 selector: diff --git a/ci/test/test_ci.py b/ci/test/test_ci.py index bd4be73cd7a..b712ffd1a87 100644 --- a/ci/test/test_ci.py +++ b/ci/test/test_ci.py @@ -9,12 +9,11 @@ from hailtop.httpx import client_session import 
hailtop.utils as utils -pytestmark = pytest.mark.asyncio - logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) +@pytest.mark.asyncio async def test_deploy(): deploy_config = get_deploy_config() ci_deploy_status_url = deploy_config.url('ci', '/api/v1alpha/deploy_status') diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base index 1e899fab56c..11d470f54ee 100644 --- a/docker/Dockerfile.base +++ b/docker/Dockerfile.base @@ -26,7 +26,7 @@ RUN /bin/sh -c 'curl https://sdk.cloud.google.com | bash' && \ ENV PATH $PATH:/google-cloud-sdk/bin COPY docker/requirements.txt . -RUN hail-pip-install -r requirements.txt pyspark==2.4.0 +RUN hail-pip-install -r requirements.txt pyspark==3.1.1 ENV SPARK_HOME /usr/local/lib/python3.7/dist-packages/pyspark ENV PATH "$PATH:$SPARK_HOME/sbin:$SPARK_HOME/bin" diff --git a/docker/Dockerfile.service-java-run-base b/docker/Dockerfile.service-java-run-base index 4ca7f6dd789..c05ae40e118 100644 --- a/docker/Dockerfile.service-java-run-base +++ b/docker/Dockerfile.service-java-run-base @@ -8,7 +8,7 @@ RUN hail-apt-get-install \ liblapack3 COPY docker/requirements.txt . -RUN hail-pip-install -r requirements.txt pyspark==2.4.0 +RUN hail-pip-install -r requirements.txt pyspark==3.1.1 ENV SPARK_HOME /usr/local/lib/python3.7/dist-packages/pyspark ENV PYSPARK_PYTHON python3 diff --git a/docker/requirements.txt b/docker/requirements.txt index 5b3c1ca3394..13d0ec46796 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -24,7 +24,7 @@ google-cloud-logging==1.12.1 google-cloud-storage==1.25.0 humanize==1.0.0 hurry.filesize==0.9 -Jinja2==2.10.1 +Jinja2==2.11.3 # keyrings.alt>3.1: https://bugs.launchpad.net/usd-importer/+bug/1794041/comments/6 keyrings.alt>=3.1 kubernetes-asyncio==9.1.0 @@ -38,11 +38,11 @@ pyjwt==1.7.1 pylint==2.6.0 astroid<2.5 # https://github.com/PyCQA/pylint/issues/4131 PyMySQL==0.9.2 -pytest==4.6.3 -pytest-asyncio==0.10.0 +pytest==6.2.2 +pytest-asyncio==0.14.0 pytest-html==1.20.0 -pytest-instafail==0.4.1 -pytest-xdist==1.28 +pytest-instafail==0.4.2 +pytest-xdist==2.2.1 python-dateutil==2.8.1 python-json-logger==0.1.11 requests==2.22.0 diff --git a/gateway/gateway.nginx.conf b/gateway/gateway.nginx.conf index b21f4d0cd13..3408533c769 100644 --- a/gateway/gateway.nginx.conf +++ b/gateway/gateway.nginx.conf @@ -17,11 +17,6 @@ server { } } -map $maybe_router_scheme $router_scheme { - default $maybe_router_scheme; - '' http; -} - server { server_name internal.hail.populationgenomics.org.au; client_max_body_size 8m; @@ -29,7 +24,7 @@ server { location = /auth { internal; resolver kube-dns.kube-system.svc.cluster.local; - proxy_pass https://router-resolver.default.svc.cluster.local/auth/$namespace; + proxy_pass https://auth/api/v1alpha/verify_dev_credentials; include /ssl-config/ssl-config-proxy.conf; } @@ -38,10 +33,9 @@ server { set $service $2; auth_request /auth; - auth_request_set $router_ip $upstream_http_x_router_ip; - auth_request_set $maybe_router_scheme $upstream_http_x_router_scheme; - proxy_pass $router_scheme://$router_ip$request_uri; + resolver kube-dns.kube-system.svc.cluster.local; + proxy_pass https://$service.$namespace.svc.cluster.local; proxy_set_header Host $service.internal; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; diff --git a/hail/Makefile b/hail/Makefile index 36c57a99717..9596c2997fa 100644 --- a/hail/Makefile +++ b/hail/Makefile @@ -10,8 +10,8 @@ MAKEFLAGS += --no-builtin-rules REVISION := $(shell git rev-parse HEAD) SHORT_REVISION := $(shell git rev-parse --short=12 HEAD) 
BRANCH := $(shell git rev-parse --abbrev-ref HEAD) -SCALA_VERSION ?= 2.11.12 -SPARK_VERSION ?= 2.4.5 +SCALA_VERSION ?= 2.12.13 +SPARK_VERSION ?= 3.1.1 HAIL_MAJOR_MINOR_VERSION := 0.2 HAIL_PATCH_VERSION := 64 HAIL_PIP_VERSION := $(HAIL_MAJOR_MINOR_VERSION).$(HAIL_PATCH_VERSION) @@ -59,12 +59,23 @@ SHADOW_TEST_JAR := build/libs/hail-all-spark-test.jar PYTHON_JAR := python/hail/backend/hail-all-spark.jar WHEEL := build/deploy/dist/hail-$(HAIL_PIP_VERSION)-py3-none-any.whl EGG := build/deploy/dist/hail-$(HAIL_PIP_VERSION)-py3.6.egg +ELASTICSEARCH_JAR := libs/elasticsearch-spark-30_2.12-8.0.0-SNAPSHOT-custom-hail-spark311.jar GRADLE_ARGS += -Dscala.version=$(SCALA_VERSION) -Dspark.version=$(SPARK_VERSION) -Delasticsearch.major-version=$(ELASTIC_MAJOR_VERSION) +.PHONY: elasticsearchJar +elasticsearchJar: $(ELASTICSEARCH_JAR) + +$(ELASTICSEARCH_JAR): + @mkdir -p libs + gsutil cp gs://hail-common/elasticsearch-libs/elasticsearch-spark-30_2.12-8.0.0-SNAPSHOT-custom-hail-spark311.jar libs/ + .PHONY: shadowJar shadowJar: $(SHADOW_JAR) +$(JAR_SOURCES): $(ELASTICSEARCH_JAR) +$(JAR_TEST_SOURCES): $(ELASTICSEARCH_JAR) + ifdef HAIL_COMPILE_NATIVES $(SHADOW_JAR): native-lib-prebuilt endif @@ -372,7 +383,10 @@ native-lib-prebuilt: native-lib-reset-prebuilt: $(MAKE) -C src/main/c reset-prebuilt -clean: clean-env native-lib-clean +clean-libs: + rm -rf libs + +clean: clean-env clean-libs native-lib-clean $(MAKE) -C python/hail/docs clean $(MAKE) -C python/hailtop/batch/docs clean ./gradlew clean $(GRADLE_ARGS) diff --git a/hail/build.gradle b/hail/build.gradle index 91202a23138..9a3593eb394 100644 --- a/hail/build.gradle +++ b/hail/build.gradle @@ -18,6 +18,9 @@ plugins { import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar repositories { + flatDir { + dirs 'libs' + } mavenCentral() jcenter() maven { url "https://repository.cloudera.com/artifactory/cloudera-repos/" } @@ -46,8 +49,17 @@ tasks.withType(JavaCompile) { project.ext { cachedBreezeVersion = null - sparkVersion = System.getProperty("spark.version", "2.4.5") - scalaVersion = System.getProperty("scala.version", "2.11.12") + sparkVersion = System.getProperty("spark.version", "3.1.1") + if (sparkVersion.startsWith("2.")) { + throw new UnsupportedOperationException("Hail no longer supports Spark 2.") + } + else if (sparkVersion != "3.1.1") { + project.logger.lifecycle("WARNING: Hail only tested with Spark 3.1.1, use other versions at your own risk.") + } + scalaVersion = System.getProperty("scala.version", "2.12.13") + if (!scalaVersion.startsWith("2.12.")) { + throw new UnsupportedOperationException("Hail currently only supports Scala 2.12") + } scalaMajorVersion = (scalaVersion =~ /^\d+.\d+/)[0] } @@ -100,19 +112,6 @@ String breezeVersion() { return cachedBreezeVersion } -String elasticHadoopVersion() { - def elasticMajorVersion = System.getProperty("elasticsearch.major-version", "7") - if (elasticMajorVersion == "6") { - return "6.8.13" - } - else if (elasticMajorVersion == "7") { - return "7.8.1" - } - else { - throw new UnsupportedOperationException("elasticsearch.major-version must be 6 or 7") - } -} - configurations { justSpark @@ -126,9 +125,8 @@ configurations { } } eachDependency { DependencyResolveDetails details -> - if (details.requested.group == 'org.json4s') { - // JSON4S 3.6.0+ contain a known bug (https://github.com/json4s/json4s/issues/507) - details.useVersion('3.5.3') + if (details.requested.group == 'org.apache.spark') { + details.useVersion(sparkVersion) } else if (details.requested.group == 'org.scalanlp' && 
details.requested.version == '1.0') { // Breeze 1.0 contains a known bug (https://github.com/scalanlp/breeze/issues/772) details.useVersion('1.1') @@ -181,7 +179,12 @@ dependencies { bundled group: 'org.slf4j', name: 'slf4j-api', version: '1.7.25' - bundled 'org.elasticsearch:elasticsearch-spark-20_2.11:' + elasticHadoopVersion() + def elasticMajorVersion = System.getProperty("elasticsearch.major-version", "7") + if (elasticMajorVersion != "7") { + throw new UnsupportedOperationException("elasticsearch.major-version must be 7") + } + // This comes from a local libs directory, see Makefile + bundled 'org.elasticsearch:elasticsearch-spark-30_2.12-8.0.0-SNAPSHOT-custom-hail-spark311' bundled 'com.google.cloud:google-cloud-storage:1.106.0' @@ -310,7 +313,6 @@ tasks.withType(ShadowJar) { // we should really shade indeed, but it has native libraries // relocate 'com.indeed', 'is.hail.relocated.com.indeed' relocate 'com.google.cloud', 'is.hail.relocated.com.google.cloud' - relocate 'org.elasticsearch', 'is.hail.relocated.org.elasticsearch' relocate 'com.github.samtools', 'is.hail.relocated.com.github.samtools' relocate 'org.lz4', 'is.hail.relocated.org.lz4' relocate 'org.freemarker', 'is.hail.relocated.org.freemarker' diff --git a/hail/python/dev-requirements.txt b/hail/python/dev-requirements.txt index 039b6b4412c..a2f706f9483 100644 --- a/hail/python/dev-requirements.txt +++ b/hail/python/dev-requirements.txt @@ -5,10 +5,10 @@ astroid<2.5 # https://github.com/PyCQA/pylint/issues/4131 pre-commit==2.9.2 black==20.8b1 curlylint==0.12.0 -pytest==4.6.3 +pytest==6.2.2 pytest-html==1.20.0 -pytest-xdist==1.28 -pytest-instafail==0.4.1 +pytest-xdist==2.2.1 +pytest-instafail==0.4.2 sphinx==3.2.1 sphinx-autodoc-typehints==1.11.0 nbsphinx==0.7.1 diff --git a/hail/python/hail/backend/local_backend.py b/hail/python/hail/backend/local_backend.py index c246b401498..6ab7220e6ed 100644 --- a/hail/python/hail/backend/local_backend.py +++ b/hail/python/hail/backend/local_backend.py @@ -126,7 +126,7 @@ def __init__(self, tmpdir, log, quiet, append, branching_factor, port = launch_gateway( redirect_stdout=sys.stdout, redirect_stderr=sys.stderr, - jarpath=f'{spark_home}/jars/py4j-0.10.7.jar', + jarpath=f'{spark_home}/jars/py4j-0.10.9.jar', classpath=f'{spark_home}/jars/*:{hail_jar_path}', die_on_exit=True) self._gateway = JavaGateway( diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_AFR.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_AFR.rst new file mode 100644 index 00000000000..a308200981c --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_AFR.rst @@ -0,0 +1,28 @@ +.. _panukb_ld_scores_AFR: + +panukb_ld_scores_AFR +==================== + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + None + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'rsid': str + 'varid': str + 'AF': float64 + 'ld_score': float64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_AMR.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_AMR.rst new file mode 100644 index 00000000000..759c06d15ff --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_AMR.rst @@ -0,0 +1,28 @@ +.. 
_panukb_ld_scores_AMR: + +panukb_ld_scores_AMR +==================== + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + None + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'rsid': str + 'varid': str + 'AF': float64 + 'ld_score': float64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_CSA.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_CSA.rst new file mode 100644 index 00000000000..c0d11c300bf --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_CSA.rst @@ -0,0 +1,28 @@ +.. _panukb_ld_scores_CSA: + +panukb_ld_scores_CSA +==================== + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + None + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'rsid': str + 'varid': str + 'AF': float64 + 'ld_score': float64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_EAS.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_EAS.rst new file mode 100644 index 00000000000..c036bb98686 --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_EAS.rst @@ -0,0 +1,28 @@ +.. _panukb_ld_scores_EAS: + +panukb_ld_scores_EAS +==================== + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + None + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'rsid': str + 'varid': str + 'AF': float64 + 'ld_score': float64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_EUR.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_EUR.rst new file mode 100644 index 00000000000..3697ceb561e --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_EUR.rst @@ -0,0 +1,28 @@ +.. _panukb_ld_scores_EUR: + +panukb_ld_scores_EUR +==================== + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + None + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'rsid': str + 'varid': str + 'AF': float64 + 'ld_score': float64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_MID.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_MID.rst new file mode 100644 index 00000000000..de24078b6cd --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_scores_MID.rst @@ -0,0 +1,28 @@ +.. 
_panukb_ld_scores_MID: + +panukb_ld_scores_MID +==================== + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + None + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'rsid': str + 'varid': str + 'AF': float64 + 'ld_score': float64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_AFR.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_AFR.rst new file mode 100644 index 00000000000..b82c3826cf1 --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_AFR.rst @@ -0,0 +1,26 @@ +.. _panukb_ld_variant_indices_AFR: + +panukb_ld_variant_indices_AFR +============================= + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + 'n_samples': int32 + 'pop': str + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'idx': int64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_AMR.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_AMR.rst new file mode 100644 index 00000000000..09445a9e971 --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_AMR.rst @@ -0,0 +1,26 @@ +.. _panukb_ld_variant_indices_AMR: + +panukb_ld_variant_indices_AMR +============================= + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + 'n_samples': int32 + 'pop': str + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'idx': int64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_CSA.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_CSA.rst new file mode 100644 index 00000000000..7c0d9e8c8ca --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_CSA.rst @@ -0,0 +1,26 @@ +.. _panukb_ld_variant_indices_CSA: + +panukb_ld_variant_indices_CSA +============================= + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: text + + ---------------------------------------- + Global fields: + 'n_samples': int32 + 'pop': str + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'idx': int64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_EAS.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_EAS.rst new file mode 100644 index 00000000000..583575bf111 --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_EAS.rst @@ -0,0 +1,26 @@ +.. _panukb_ld_variant_indices_EAS: + +panukb_ld_variant_indices_EAS +============================= + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + 'n_samples': int32 + 'pop': str + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'idx': int64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_EUR.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_EUR.rst new file mode 100644 index 00000000000..97c87a3cb93 --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_EUR.rst @@ -0,0 +1,26 @@ +.. _panukb_ld_variant_indices_EUR: + +panukb_ld_variant_indices_EUR +============================= + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + 'n_samples': int32 + 'pop': str + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'idx': int64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_MID.rst b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_MID.rst new file mode 100644 index 00000000000..b6a7969b6f1 --- /dev/null +++ b/hail/python/hail/docs/datasets/schemas/panukb_ld_variant_indices_MID.rst @@ -0,0 +1,26 @@ +.. _panukb_ld_variant_indices_MID: + +panukb_ld_variant_indices_MID +============================= + +* **Versions:** 0.2 +* **Reference genome builds:** GRCh37 +* **Type:** :class:`hail.Table` + +Schema (0.2, GRCh37) +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: text + + ---------------------------------------- + Global fields: + 'n_samples': int32 + 'pop': str + ---------------------------------------- + Row fields: + 'locus': locus + 'alleles': array + 'idx': int64 + ---------------------------------------- + Key: ['locus', 'alleles'] + ---------------------------------------- diff --git a/hail/python/hail/docs/install/linux.rst b/hail/python/hail/docs/install/linux.rst index 66eb949ccc4..8af0649b4ae 100644 --- a/hail/python/hail/docs/install/linux.rst +++ b/hail/python/hail/docs/install/linux.rst @@ -3,7 +3,7 @@ Install Hail on GNU/Linux ========================= - Install Java 8. -- Install Python 3.6 or 3.7. +- Install Python 3.6+. - Install a recent version of the C and C++ standard libraries. GCC 5.0, LLVM version 3.4, or any later versions suffice. 
- Install BLAS and LAPACK. diff --git a/hail/python/hail/docs/install/macosx.rst b/hail/python/hail/docs/install/macosx.rst index 0a8cf198408..a826a4520d4 100644 --- a/hail/python/hail/docs/install/macosx.rst +++ b/hail/python/hail/docs/install/macosx.rst @@ -3,6 +3,6 @@ Install Hail on Mac OS X ======================== - Install `Java 8 `__. -- Install Python 3.6 or 3.7. We recommend `Miniconda `__; however, the latest version of Miniconda installs Python 3.8 by default. Please follow `these instructions `__ to create a Python 3.6 or 3.7 environment. +- Install Python 3.6+. We recommend `Miniconda `__. - Open Terminal.app and execute ``pip install hail``. - `Run your first Hail query! `__ diff --git a/hail/python/hail/docs/install/other-cluster.rst b/hail/python/hail/docs/install/other-cluster.rst index b294e46bb1e..af7514b9114 100644 --- a/hail/python/hail/docs/install/other-cluster.rst +++ b/hail/python/hail/docs/install/other-cluster.rst @@ -5,13 +5,13 @@ Install Hail on a Spark Cluster If you are using Google Dataproc, please see `these simpler instructions `__. -Hail should work with any Spark 2.4.x cluster built with Scala 2.11. +Hail should work with any Spark 3.1.1 cluster built with Scala 2.12. Hail needs to be built from source on the leader node. Building Hail from source requires: - Java 8 JDK. -- Python 3.6 or 3.7. +- Python 3.6+. - A recent C and a C++ compiler, GCC 5.0, LLVM 3.4, or later versions of either suffice. - BLAS and LAPACK. diff --git a/hail/python/hail/experimental/datasets.json b/hail/python/hail/experimental/datasets.json index 59507dd4fdc..f6cac416040 100644 --- a/hail/python/hail/experimental/datasets.json +++ b/hail/python/hail/experimental/datasets.json @@ -1581,5 +1581,311 @@ "version": "1.1" } ] + }, + "panukb_ld_block_matrix_AFR": { + "description": "Pan-UKB: linkage disequilibrium (LD) matrix Hail BlockMatrix for African ancestry population. To determine which row/column corresponds to which variant, see the associated variant indices Hail Table: panukb_ld_variant_indices_AFR.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.AFR.ldadj.bm" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_block_matrix_AMR": { + "description": "Pan-UKB: linkage disequilibrium (LD) matrix Hail BlockMatrix for Admixed American ancestry population. To determine which row/column corresponds to which variant, see the associated variant indices Hail Table: panukb_ld_variant_indices_AMR.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.AMR.ldadj.bm" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_block_matrix_CSA": { + "description": "Pan-UKB: linkage disequilibrium (LD) matrix Hail BlockMatrix for Central/South Asian ancestry population. To determine which row/column corresponds to which variant, see the associated variant indices Hail Table: panukb_ld_variant_indices_CSA.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.CSA.ldadj.bm" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_block_matrix_EAS": { + "description": "Pan-UKB: linkage disequilibrium (LD) matrix Hail BlockMatrix for East Asian ancestry population. 
To determine which row/column corresponds to which variant, see the associated variant indices Hail Table: panukb_ld_variant_indices_EAS.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.EAS.ldadj.bm" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_block_matrix_EUR": { + "description": "Pan-UKB: linkage disequilibrium (LD) matrix Hail BlockMatrix for European ancestry population. To determine which row/column corresponds to which variant, see the associated variant indices Hail Table: panukb_ld_variant_indices_EUR.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.EUR.ldadj.bm" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_block_matrix_MID": { + "description": "Pan-UKB: linkage disequilibrium (LD) matrix Hail BlockMatrix for Middle Eastern ancestry population. To determine which row/column corresponds to which variant, see the associated variant indices Hail Table: panukb_ld_variant_indices_MID.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.MID.ldadj.bm" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_scores_AFR": { + "description": "Pan-UKB: linkage disequilibrium (LD) scores Hail Table for African ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.AFR.ldscore.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_scores_AMR": { + "description": "Pan-UKB: linkage disequilibrium (LD) scores Hail Table for Admixed American ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.AMR.ldscore.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_scores_CSA": { + "description": "Pan-UKB: linkage disequilibrium (LD) scores Hail Table for Central/South Asian ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.CSA.ldscore.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_scores_EAS": { + "description": "Pan-UKB: linkage disequilibrium (LD) scores Hail Table for East Asian ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.EAS.ldscore.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_scores_EUR": { + "description": "Pan-UKB: linkage disequilibrium (LD) scores Hail Table for European ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.EUR.ldscore.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_scores_MID": { + "description": "Pan-UKB: linkage disequilibrium (LD) scores Hail Table for Middle Eastern 
ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.MID.ldscore.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_variant_indices_AFR": { + "description": "Pan-UKB: variant indices Hail Table for African ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.AFR.ldadj.variant.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_variant_indices_AMR": { + "description": "Pan-UKB: variant indices Hail Table for Admixed American ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.AMR.ldadj.variant.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_variant_indices_CSA": { + "description": "Pan-UKB: variant indices Hail Table for Central/South Asian ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.CSA.ldadj.variant.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_variant_indices_EAS": { + "description": "Pan-UKB: variant indices Hail Table for East Asian ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.EAS.ldadj.variant.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_variant_indices_EUR": { + "description": "Pan-UKB: variant indices Hail Table for European ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.EUR.ldadj.variant.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_ld_variant_indices_MID": { + "description": "Pan-UKB: variant indices Hail Table for Middle Eastern ancestry population.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/ld_release/UKBB.MID.ldadj.variant.ht" + } + }, + "version": "0.2" + } + ] + }, + "panukb_meta_analysis": { + "description": "Pan-UKB: pan-ancestry GWAS of UK Biobank, full meta-analysis Hail MatrixTable.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview/index.html", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/sumstats_release/meta_analysis.mt" + }, + "gcp": { + "us": "gs://ukb-diverse-pops-public/sumstats_release/meta_analysis.mt" + } + }, + "version": "0.1" + } + ] + }, + "panukb_summary_stats": { + "description": "Pan-UKB: pan-ancestry GWAS of UK Biobank, full summary statistics Hail MatrixTable.", + "url": "https://pan.ukbb.broadinstitute.org/docs/technical-overview/index.html", + "versions": [ + { + "reference_genome": "GRCh37", + "url": { + "aws": { + "us": "s3://pan-ukb-us-east-1/sumstats_release/results_full.mt" + }, + "gcp": { + "us": "gs://ukb-diverse-pops-public/sumstats_release/results_full.mt" + } + }, + "version": 
"0.1" + } + ] } } diff --git a/hail/python/hail/experimental/table_ndarray_utils.py b/hail/python/hail/experimental/table_ndarray_utils.py index e107db97520..7d9fcef5cef 100644 --- a/hail/python/hail/experimental/table_ndarray_utils.py +++ b/hail/python/hail/experimental/table_ndarray_utils.py @@ -3,7 +3,7 @@ from hail.utils.java import Env -def mt_to_table_of_ndarray(entry_expr, block_size=16): +def mt_to_table_of_ndarray(entry_expr, block_size=16, return_checkpointed_table_also=False): check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr) mt = matrix_table_source('mt_to_table_of_ndarray/entry_expr', entry_expr) @@ -35,8 +35,11 @@ def get_even_partitioning(ht, partition_size, total_num_rows): ht = ht.checkpoint(temp_file_name) num_rows = ht.count() new_partitioning = get_even_partitioning(ht, block_size, num_rows) - ht = hl.read_table(temp_file_name, _intervals=new_partitioning) + new_part_ht = hl.read_table(temp_file_name, _intervals=new_partitioning) - grouped = ht._group_within_partitions("groups", block_size) + grouped = new_part_ht._group_within_partitions("groups", block_size) A = grouped.select(ndarray=hl.nd.array(grouped.groups.map(lambda group: group.xs))) + + if return_checkpointed_table_also: + return A, ht return A diff --git a/hail/python/hail/linalg/blockmatrix.py b/hail/python/hail/linalg/blockmatrix.py index 3190aa6ac8f..b7231575bfc 100644 --- a/hail/python/hail/linalg/blockmatrix.py +++ b/hail/python/hail/linalg/blockmatrix.py @@ -1375,6 +1375,8 @@ def __add__(self, b): ------- :class:`.BlockMatrix` """ + if isinstance(b, (int, float)): + return self._map_dense(lambda entry: entry + b) return self._apply_map2(BlockMatrix._binary_op('+'), b, sparsity_strategy="Union") @typecheck_method(b=oneof(numeric, np.ndarray, block_matrix_type)) @@ -1389,6 +1391,8 @@ def __sub__(self, b): ------- :class:`.BlockMatrix` """ + if isinstance(b, (int, float)): + return self._map_dense(lambda entry: entry - b) return self._apply_map2(BlockMatrix._binary_op('-'), b, sparsity_strategy="Union") @typecheck_method(b=oneof(numeric, np.ndarray, block_matrix_type)) @@ -1403,6 +1407,9 @@ def __mul__(self, b): ------- :class:`.BlockMatrix` """ + if isinstance(b, (int, float)): + # sparse since multiplying by zero is zero + return self._map_sparse(lambda entry: entry * b) return self._apply_map2(BlockMatrix._binary_op('*'), b, sparsity_strategy="Intersection") @typecheck_method(b=oneof(numeric, np.ndarray, block_matrix_type)) @@ -1417,6 +1424,9 @@ def __truediv__(self, b): ------- :class:`.BlockMatrix` """ + if isinstance(b, (int, float)): + # sparse since dividing by zero is zero + return self._map_sparse(lambda entry: entry / b) return self._apply_map2(BlockMatrix._binary_op('/'), b, sparsity_strategy="NeedsDense") @typecheck_method(b=numeric) @@ -1528,6 +1538,12 @@ def __pow__(self, x): """ return self._apply_map(lambda i: i ** x, needs_dense=False) + def _map_dense(self, func): + return self._apply_map(func, True) + + def _map_sparse(self, func): + return self._apply_map(func, False) + def sqrt(self): """Element-wise square root. diff --git a/hail/python/hail/methods/pca.py b/hail/python/hail/methods/pca.py index 53583baf123..db8f42769f4 100644 --- a/hail/python/hail/methods/pca.py +++ b/hail/python/hail/methods/pca.py @@ -300,10 +300,10 @@ def _blanczos_pca(entry_expr, k=10, compute_loadings=False, q_iterations=2, over (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`) List of eigenvalues, table with column scores, table with row loadings. 
""" - + check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr) mt = matrix_table_source('pca/entry_expr', entry_expr) - A = mt_to_table_of_ndarray(entry_expr, block_size) + A, ht = mt_to_table_of_ndarray(entry_expr, block_size, return_checkpointed_table_also=True) A = A.persist() # Set Parameters @@ -365,10 +365,12 @@ def hailBlanczos(A, G, k, q): cols_and_scores = hl.zip(A.index_globals().cols, hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1])) st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key)) - lt = mt.rows().select() + lt = ht.select() lt = lt.annotate_globals(U=U) - lt = lt.add_index() - lt = lt.annotate(loadings=lt.U[lt.idx, :]._data_array()).select_globals() + idx_name = '_tmp_pca_loading_index' + lt = lt.add_index(idx_name) + lt = lt.annotate(loadings=lt.U[lt[idx_name], :]._data_array()).select_globals() + lt = lt.drop(lt[idx_name]) if compute_loadings: return eigens, st, lt diff --git a/hail/python/hailtop/batch_client/aioclient.py b/hail/python/hailtop/batch_client/aioclient.py index 976cebd9ebf..0eb986ecbdf 100644 --- a/hail/python/hailtop/batch_client/aioclient.py +++ b/hail/python/hailtop/batch_client/aioclient.py @@ -203,6 +203,8 @@ async def is_complete(self): # { # batch_id: int # job_id: int + # user: str + # billing_project: str # name: optional(str) # state: str (Ready, Running, Success, Error, Failure, Cancelled) # exit_code: optional(int) @@ -349,7 +351,8 @@ async def get_job_log(self, job_id: int) -> Optional[Dict[str, Any]]: return await self._client.get_job_log(self.id, job_id) # { - # id: int, + # id: int + # user: str # billing_project: str # token: str # state: str, (open, failure, cancelled, success, running) diff --git a/hail/python/hailtop/batch_client/client.py b/hail/python/hailtop/batch_client/client.py index 530fe1592d2..d4a67d51839 100644 --- a/hail/python/hailtop/batch_client/client.py +++ b/hail/python/hailtop/batch_client/client.py @@ -82,6 +82,8 @@ def is_complete(self): # { # batch_id: int # job_id: int + # user: str + # billing_project: str # name: optional(str) # state: str (Ready, Running, Success, Error, Failure, Cancelled) # exit_code: optional(int) @@ -128,7 +130,8 @@ def cancel(self): async_to_blocking(self._async_batch.cancel()) # { - # id: int, + # id: int + # user: str # billing_project: str # token: str # state: str, (open, failure, cancelled, success, running) diff --git a/hail/python/hailtop/hailctl/dataproc/start.py b/hail/python/hailtop/hailctl/dataproc/start.py index b8aff6137fb..81e341f1e2f 100755 --- a/hail/python/hailtop/hailctl/dataproc/start.py +++ b/hail/python/hailtop/hailctl/dataproc/start.py @@ -136,7 +136,7 @@ ANNOTATION_DB_BUCKETS = ["hail-datasets-us", "hail-datasets-eu", "gnomad-public-requester-pays"] -IMAGE_VERSION = '1.4-debian9' +IMAGE_VERSION = '2.0.6-debian10' def init_parser(parser): diff --git a/hail/python/requirements.txt b/hail/python/requirements.txt index c3909bbd60f..86353224136 100644 --- a/hail/python/requirements.txt +++ b/hail/python/requirements.txt @@ -13,7 +13,7 @@ numpy<2 pandas>=1.1.0,<1.1.5 parsimonious<0.9 PyJWT -pyspark>=2.4,<2.4.2 +pyspark>=3.1.1,<3.2.0 python-json-logger==0.1.11 requests==2.22.0 scipy>1.2,<1.7 diff --git a/hail/python/test/hail/expr/test_ndarrays.py b/hail/python/test/hail/expr/test_ndarrays.py index f4f7743bdc4..715d3e6b4cb 100644 --- a/hail/python/test/hail/expr/test_ndarrays.py +++ b/hail/python/test/hail/expr/test_ndarrays.py @@ -59,7 +59,7 @@ def test_ndarray_ref(): with pytest.raises(HailUserError) as exc: 
hl.eval(hl.nd.array([1, 2, 3])[4]) - assert "Index 4 is out of bounds for axis 0 with size 3" in str(exc) + assert "Index 4 is out of bounds for axis 0 with size 3" in str(exc.value) @fails_service_backend() @@ -295,31 +295,31 @@ def test_ndarray_reshape(): with pytest.raises(FatalError) as exc: hl.eval(hl.literal(np_cube).reshape((-1, -1))) - assert "more than one -1" in str(exc) + assert "more than one -1" in str(exc.value) with pytest.raises(FatalError) as exc: hl.eval(hl.literal(np_cube).reshape((20,))) - assert "requested shape is incompatible with number of elements" in str(exc) + assert "requested shape is incompatible with number of elements" in str(exc.value) with pytest.raises(FatalError) as exc: hl.eval(a.reshape((3,))) - assert "requested shape is incompatible with number of elements" in str(exc) + assert "requested shape is incompatible with number of elements" in str(exc.value) with pytest.raises(FatalError) as exc: hl.eval(a.reshape(())) - assert "requested shape is incompatible with number of elements" in str(exc) + assert "requested shape is incompatible with number of elements" in str(exc.value) with pytest.raises(FatalError) as exc: hl.eval(hl.literal(np_cube).reshape((0, 2, 2))) - assert "requested shape is incompatible with number of elements" in str(exc) + assert "requested shape is incompatible with number of elements" in str(exc.value) with pytest.raises(FatalError) as exc: hl.eval(hl.literal(np_cube).reshape((2, 2, -2))) - assert "must contain only nonnegative numbers or -1" in str(exc) + assert "must contain only nonnegative numbers or -1" in str(exc.value) with pytest.raises(FatalError) as exc: hl.eval(shape_zero.reshape((0, -1))) - assert "Can't reshape" in str(exc) + assert "Can't reshape" in str(exc.value) with pytest.raises(TypeError): a.reshape(hl.tuple(['4', '5'])) @@ -616,11 +616,11 @@ def test_ndarray_matmul(): with pytest.raises(FatalError) as exc: hl.eval(r @ r) - assert "Matrix dimensions incompatible: 3 2" in str(exc) + assert "Matrix dimensions incompatible: 3 2" in str(exc.value) with pytest.raises(FatalError) as exc: hl.eval(hl.nd.array([1, 2]) @ hl.nd.array([1, 2, 3])) - assert "Matrix dimensions incompatible" in str(exc) + assert "Matrix dimensions incompatible" in str(exc.value) def test_ndarray_big(): @@ -651,7 +651,7 @@ def test_ndarray_arange(): with pytest.raises(FatalError) as exc: hl.eval(hl.nd.arange(5, 20, 0)) - assert "Array range cannot have step size 0" in str(exc) + assert "Array range cannot have step size 0" in str(exc.value) def test_ndarray_mixed(): @@ -680,7 +680,7 @@ def test_ndarray_diagonal(): with pytest.raises(AssertionError) as exc: hl.nd.diagonal(hl.nd.array([1, 2])) - assert "2 dimensional" in str(exc) + assert "2 dimensional" in str(exc.value) def test_ndarray_qr(): @@ -782,11 +782,11 @@ def assert_same_qr(hl_ndarray, np_ndarray): with pytest.raises(ValueError) as exc: hl.nd.qr(wiki_example, mode="invalid") - assert "Unrecognized mode" in str(exc) + assert "Unrecognized mode" in str(exc.value) with pytest.raises(AssertionError) as exc: hl.nd.qr(hl.nd.arange(6)) - assert "requires 2 dimensional" in str(exc) + assert "requires 2 dimensional" in str(exc.value) def test_svd(): @@ -1053,4 +1053,4 @@ def test_agg_ndarray_sum(): mismatched = hl.utils.range_table(5) mismatched = mismatched.annotate(x=hl.nd.ones((mismatched.idx,))) mismatched.aggregate(hl.agg.ndarray_sum(mismatched.x)) - assert "Can't sum" in str(exc) + assert "Can't sum" in str(exc.value) diff --git a/hail/python/test/hail/linalg/test_linalg.py 
diff --git a/hail/python/test/hail/linalg/test_linalg.py b/hail/python/test/hail/linalg/test_linalg.py
index 02d56855b30..289b284e44c 100644
--- a/hail/python/test/hail/linalg/test_linalg.py
+++ b/hail/python/test/hail/linalg/test_linalg.py
@@ -1097,11 +1097,11 @@ def test_filtering(self):
         with pytest.raises(ValueError) as exc:
             bm.filter_cols([0]).filter_cols([3]).to_numpy()
-        assert "index" in str(exc)
+        assert "index" in str(exc.value)

         with pytest.raises(ValueError) as exc:
             bm.filter_rows([0]).filter_rows([3]).to_numpy()
-        assert "index" in str(exc)
+        assert "index" in str(exc.value)

     @skip_unless_spark_backend()
     def test_sparsify_blocks(self):
diff --git a/hail/src/main/scala/is/hail/backend/service/ServiceBackend.scala b/hail/src/main/scala/is/hail/backend/service/ServiceBackend.scala
index 15fffb0d966..c94265dc509 100644
--- a/hail/src/main/scala/is/hail/backend/service/ServiceBackend.scala
+++ b/hail/src/main/scala/is/hail/backend/service/ServiceBackend.scala
@@ -4,7 +4,6 @@ import java.io._
 import java.net._
 import java.nio.charset.StandardCharsets
 import java.util.concurrent._
-
 import is.hail.HailContext
 import is.hail.annotations._
 import is.hail.asm4s._
@@ -33,6 +32,7 @@ import org.json4s.{DefaultFormats, Formats}
 import org.newsclub.net.unix.{AFUNIXSocket, AFUNIXSocketAddress, AFUNIXServerSocket}

+import java.nio.charset.Charset
 import scala.collection.mutable
 import scala.reflect.ClassTag
 import scala.annotation.switch
@@ -89,7 +89,7 @@ object Worker {
     val fs = retryTransientErrors {
       using(new FileInputStream(s"$scratchDir/gsa-key/key.json")) { is =>
-        new GoogleStorageFS(IOUtils.toString(is))
+        new GoogleStorageFS(IOUtils.toString(is, Charset.defaultCharset()))
       }
     }
diff --git a/hail/src/main/scala/is/hail/expr/AnnotationImpex.scala b/hail/src/main/scala/is/hail/expr/AnnotationImpex.scala
index 78da7ed33ab..c04d672e55d 100644
--- a/hail/src/main/scala/is/hail/expr/AnnotationImpex.scala
+++ b/hail/src/main/scala/is/hail/expr/AnnotationImpex.scala
@@ -140,10 +140,10 @@ object JSONAnnotationImpex {
     }
   }

-  def irImportAnnotation(s: String, t: Type): Row = {
+  def irImportAnnotation(s: String, t: Type, warnContext: mutable.HashSet[String]): Row = {
     try {
       // wraps in a Row to handle returned missingness
-      Row(importAnnotation(JsonMethods.parse(s), t, true, null))
+      Row(importAnnotation(JsonMethods.parse(s), t, true, warnContext))
     } catch {
       case e: Throwable =>
         fatal(s"Error parsing JSON:\n type: $t\n value: $s", e)
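Threading `warnContext` through `irImportAnnotation` above replaces the `null` previously handed to `importAnnotation`, letting it deduplicate warnings across every value parsed in the same context rather than warning per row. The shape of the idea in Python, with illustrative names rather than Hail's API:

    import warnings

    def import_annotation(value, typ, warn_context):
        # warn once per distinct message per shared context
        if not isinstance(value, typ):
            msg = f"coercing {type(value).__name__} to {typ.__name__}"
            if msg not in warn_context:
                warn_context.add(msg)
                warnings.warn(msg)
            return typ(value)
        return value

    ctx = set()
    for raw in ["1", "2", "3"]:
        import_annotation(raw, int, ctx)  # warns only for the first row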
diff --git a/hail/src/main/scala/is/hail/expr/ir/AbstractMatrixTableSpec.scala b/hail/src/main/scala/is/hail/expr/ir/AbstractMatrixTableSpec.scala
index cc2c4e57dd1..33338e79c4e 100644
--- a/hail/src/main/scala/is/hail/expr/ir/AbstractMatrixTableSpec.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/AbstractMatrixTableSpec.scala
@@ -21,8 +21,7 @@ object RelationalSpec {
   implicit val formats: Formats = new DefaultFormats() {
     override val typeHints = ShortTypeHints(List(
       classOf[ComponentSpec], classOf[RVDComponentSpec], classOf[PartitionCountsComponentSpec],
-      classOf[RelationalSpec], classOf[MatrixTableSpec], classOf[TableSpec]))
-    override val typeHintFieldName = "name"
+      classOf[RelationalSpec], classOf[MatrixTableSpec], classOf[TableSpec]), typeHintFieldName="name")
   } + new TableTypeSerializer + new MatrixTypeSerializer

@@ -150,8 +149,7 @@ object MatrixTableSpec {
   def fromJValue(fs: FS, path: String, jv: JValue): MatrixTableSpec = {
     implicit val formats: Formats = new DefaultFormats() {
       override val typeHints = ShortTypeHints(List(
-        classOf[ComponentSpec], classOf[RVDComponentSpec], classOf[PartitionCountsComponentSpec]))
-      override val typeHintFieldName = "name"
+        classOf[ComponentSpec], classOf[RVDComponentSpec], classOf[PartitionCountsComponentSpec]), typeHintFieldName = "name")
     } + new MatrixTypeSerializer

     val params = jv.extract[MatrixTableSpecParameters]
diff --git a/hail/src/main/scala/is/hail/expr/ir/BlockMatrixIR.scala b/hail/src/main/scala/is/hail/expr/ir/BlockMatrixIR.scala
index 4c4e866fe84..bbe0e9289d4 100644
--- a/hail/src/main/scala/is/hail/expr/ir/BlockMatrixIR.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/BlockMatrixIR.scala
@@ -92,8 +92,8 @@ case class BlockMatrixRead(reader: BlockMatrixReader) extends BlockMatrixIR {
 object BlockMatrixReader {
   implicit val formats: Formats = new DefaultFormats() {
     override val typeHints = ShortTypeHints(
-      List(classOf[BlockMatrixNativeReader], classOf[BlockMatrixBinaryReader], classOf[BlockMatrixPersistReader]))
-    override val typeHintFieldName: String = "name"
+      List(classOf[BlockMatrixNativeReader], classOf[BlockMatrixBinaryReader], classOf[BlockMatrixPersistReader]),
+      typeHintFieldName = "name")
   }

   def fromJValue(ctx: ExecuteContext, jv: JValue): BlockMatrixReader = {
@@ -270,6 +270,17 @@ case class BlockMatrixMap(child: BlockMatrixIR, eltName: String, f: IR, needsDen
   override protected[ir] def execute(ctx: ExecuteContext): BlockMatrix = {
     val prev = child.execute(ctx)

+    val functionArgs = f match {
+      case ApplyUnaryPrimOp(_, arg1) => IndexedSeq(arg1)
+      case Apply(_, _, args, _) => args
+      case ApplyBinaryPrimOp(_, l, r) => IndexedSeq(l, r)
+    }
+
+    assert(functionArgs.forall(ir => IsConstant(ir) || ir.isInstanceOf[Ref]),
+      "Spark backend without lowering does not support general mapping over " +
+      "BlockMatrix entries. Use predefined functions like `BlockMatrix.abs`.")
+
+
     val (name, breezeF): (String, DenseMatrix[Double] => DenseMatrix[Double]) = f match {
       case ApplyUnaryPrimOp(Negate(), _) => ("negate", BlockMatrix.negationOp)
       case Apply("abs", _, _, _) => ("abs", numerics.abs(_))
@@ -457,7 +468,7 @@ case class BlockMatrixDot(left: BlockMatrixIR, right: BlockMatrixIR) extends Blo
     val (tensorShape, isRowVector) = BlockMatrixIR.matrixShapeToTensorShape(lRows, rCols)
     val sparsity = if (left.typ.isSparse || right.typ.isSparse)
-      BlockMatrixSparsity(
+      BlockMatrixSparsity.constructFromShapeAndFunction(
         BlockMatrixType.numBlocks(lRows, blockSize),
         BlockMatrixType.numBlocks(rCols, blockSize)) { (i: Int, j: Int) =>
         Array.tabulate(BlockMatrixType.numBlocks(rCols, blockSize)) { k =>
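The new `functionArgs` guard in `BlockMatrixMap.execute` above makes the non-lowered Spark path fail fast: only the fixed menu of element-wise functions matched below it is supported, so every argument of the mapped IR must be a constant or a bare reference. Roughly, over a toy IR in Python (hypothetical classes, not Hail's):

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Ref:
        name: str

    @dataclass
    class Const:
        value: float

    @dataclass
    class Apply:
        function: str
        args: List[object]

    def assert_mappable(f: Apply):
        # mirror of the Scala assertion: constants and refs only
        assert all(isinstance(a, (Const, Ref)) for a in f.args), (
            "general mapping over BlockMatrix entries is not supported "
            "without lowering; use predefined functions like BlockMatrix.abs")

    assert_mappable(Apply("abs", [Ref("element")]))  # passes
    # Apply("pow", [Ref("element"), Apply("exp", [Ref("element")])]) would fail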
@@ -527,14 +538,14 @@ case class BlockMatrixBroadcast(
         BlockMatrixSparsity.dense
       case IndexedSeq(0) => // broadcast col vector
         assert(Set(1, shape(0)) == Set(child.typ.nRows, child.typ.nCols))
-        BlockMatrixSparsity(nRowBlocks, nColBlocks)((i: Int, j: Int) => child.typ.hasBlock(0 -> j))
+        BlockMatrixSparsity.constructFromShapeAndFunction(nRowBlocks, nColBlocks)((i: Int, j: Int) => child.typ.hasBlock(0 -> j))
       case IndexedSeq(1) => // broadcast row vector
         assert(Set(1, shape(1)) == Set(child.typ.nRows, child.typ.nCols))
-        BlockMatrixSparsity(nRowBlocks, nColBlocks)((i: Int, j: Int) => child.typ.hasBlock(i -> 0))
+        BlockMatrixSparsity.constructFromShapeAndFunction(nRowBlocks, nColBlocks)((i: Int, j: Int) => child.typ.hasBlock(i -> 0))
       case IndexedSeq(0, 0) => // diagonal as col vector
         assert(shape(0) == 1L)
         assert(shape(1) == java.lang.Math.min(child.typ.nRows, child.typ.nCols))
-        BlockMatrixSparsity(nRowBlocks, nColBlocks)((_, j: Int) => child.typ.hasBlock(j -> j))
+        BlockMatrixSparsity.constructFromShapeAndFunction(nRowBlocks, nColBlocks)((_, j: Int) => child.typ.hasBlock(j -> j))
       case IndexedSeq(1, 0) => // transpose
         assert(child.typ.blockSize == blockSize)
         assert(shape(0) == child.typ.nCols && shape(1) == child.typ.nRows)
@@ -613,11 +624,11 @@ case class BlockMatrixAgg(
     outIndexExpr match {
       case IndexedSeq() => BlockMatrixSparsity.dense
       case IndexedSeq(1) => // col vector result; agg over row
-        BlockMatrixSparsity(child.typ.nRowBlocks, 1) { (i, _) =>
+        BlockMatrixSparsity.constructFromShapeAndFunction(child.typ.nRowBlocks, 1) { (i, _) =>
           (0 until child.typ.nColBlocks).exists(j => child.typ.hasBlock(i -> j))
         }
       case IndexedSeq(0) => // row vector result; agg over col
-        BlockMatrixSparsity(1, child.typ.nColBlocks) { (_, j) =>
+        BlockMatrixSparsity.constructFromShapeAndFunction(1, child.typ.nColBlocks) { (_, j) =>
           (0 until child.typ.nRowBlocks).exists(i => child.typ.hasBlock(i -> j))
         }
     }
@@ -733,7 +744,7 @@ case class BandSparsifier(blocksOnly: Boolean, l: Long, u: Long) extends BlockMa
     val leftBuffer = java.lang.Math.floorDiv(-l, childType.blockSize)
     val rightBuffer = java.lang.Math.floorDiv(u, childType.blockSize)

-    BlockMatrixSparsity(childType.nRowBlocks, childType.nColBlocks) { (i, j) =>
+    BlockMatrixSparsity.constructFromShapeAndFunction(childType.nRowBlocks, childType.nColBlocks) { (i, j) =>
       j >= (i - leftBuffer) && j <= (i + rightBuffer) && childType.hasBlock(i -> j)
     }
   }
@@ -753,7 +764,7 @@ case class RowIntervalSparsifier(blocksOnly: Boolean, starts: IndexedSeq[Long],
     val blockStarts = starts.grouped(childType.blockSize).map(idxs => childType.getBlockIdx(idxs.min)).toArray
     val blockStops = stops.grouped(childType.blockSize).map(idxs => childType.getBlockIdx(idxs.max - 1)).toArray

-    BlockMatrixSparsity(childType.nRowBlocks, childType.nColBlocks) { (i, j) =>
+    BlockMatrixSparsity.constructFromShapeAndFunction(childType.nRowBlocks, childType.nColBlocks) { (i, j) =>
       blockStarts(i) <= j && blockStops(i) >= j && childType.hasBlock(i -> j)
     }
   }
@@ -800,7 +811,7 @@ case class PerBlockSparsifier(blocks: IndexedSeq[Int]) extends BlockMatrixSparsi
   val blockSet = blocks.toSet

   override def definedBlocks(childType: BlockMatrixType): BlockMatrixSparsity = {
-    BlockMatrixSparsity(childType.nRowBlocks, childType.nColBlocks){ case(i: Int, j: Int) =>
+    BlockMatrixSparsity.constructFromShapeAndFunction(childType.nRowBlocks, childType.nColBlocks){ case(i: Int, j: Int) =>
       blockSet.contains(i + j * childType.nRowBlocks)
     }
   }
diff --git a/hail/src/main/scala/is/hail/expr/ir/BlockMatrixWriter.scala b/hail/src/main/scala/is/hail/expr/ir/BlockMatrixWriter.scala
index bd81517bec7..cc8dc82003b 100644
--- a/hail/src/main/scala/is/hail/expr/ir/BlockMatrixWriter.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/BlockMatrixWriter.scala
@@ -22,8 +22,7 @@ object BlockMatrixWriter {
     override val typeHints = ShortTypeHints(
       List(classOf[BlockMatrixNativeWriter], classOf[BlockMatrixBinaryWriter], classOf[BlockMatrixRectanglesWriter],
         classOf[BlockMatrixBinaryMultiWriter], classOf[BlockMatrixTextMultiWriter],
-        classOf[BlockMatrixPersistWriter], classOf[BlockMatrixNativeMultiWriter]))
-    override val typeHintFieldName: String = "name"
+        classOf[BlockMatrixPersistWriter], classOf[BlockMatrixNativeMultiWriter]), typeHintFieldName = "name")
   }
 }
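`BlockMatrixSparsity.apply(nRows, nCols)(exists)` was easy to mistake for the `apply(definedBlocks)` overload; the new name `constructFromShapeAndFunction` says exactly what every call site in these hunks does: walk the block grid and keep the coordinates where the predicate holds. The same computation in Python, returning bare coordinates where the Scala wraps them in a `BlockMatrixSparsity`:

    def construct_from_shape_and_function(n_row_blocks, n_col_blocks, exists):
        return [(i, j)
                for i in range(n_row_blocks)
                for j in range(n_col_blocks)
                if exists(i, j)]

    # a BandSparsifier-style predicate: keep blocks near the diagonal
    band = construct_from_shape_and_function(4, 4, lambda i, j: abs(i - j) <= 1)
    assert (1, 2) in band and (0, 3) not in band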
diff --git a/hail/src/main/scala/is/hail/expr/ir/Emit.scala b/hail/src/main/scala/is/hail/expr/ir/Emit.scala
index 7035797f6e2..c905f5cd27b 100644
--- a/hail/src/main/scala/is/hail/expr/ir/Emit.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/Emit.scala
@@ -515,7 +515,7 @@ class Emit[C](
     case If(cond, cnsq, altr) =>
       assert(cnsq.typ == TVoid && altr.typ == TVoid)

-      emitI(cond).consume(cb, {}, m => cb.ifx(m.tcode[Boolean], emitVoid(cnsq), emitVoid(altr)))
+      emitI(cond).consume(cb, {}, m => cb.ifx(m.asBoolean.boolCode(cb), emitVoid(cnsq), emitVoid(altr)))

     case Let(name, value, body) =>
       value.pType match {
         case streamType: PCanonicalStream =>
@@ -1011,7 +1011,7 @@ class Emit[C](
         FastIndexedSeq(typeInfo[Region], keyValTyp.asEmitParam, keyValTyp.asEmitParam), BooleanInfo)
       isSame.emitWithBuilder { cb =>
-        emitInMethod(cb, compare2).consumeCode[Boolean](cb, true, _.tcode[Boolean])
+        emitInMethod(cb, compare2).consumeCode[Boolean](cb, true, _.asBoolean.boolCode(cb))
       }

       val eltIdx = mb.newLocal[Int]("groupByKey_eltIdx")
@@ -2186,7 +2186,10 @@ class Emit[C](
           val cmp2 = ApplyComparisonOp(EQWithNA(eltVType), In(0, eltType), In(1, eltType))
           InferPType(cmp2)
           val EmitCode(s, m, pv) = emitInMethod(cmp2, discardNext)
-          discardNext.emit(Code(s, m || pv.tcode[Boolean]))
+          discardNext.emitWithBuilder { cb =>
+            cb += s
+            m || pv.asBoolean.boolCode(cb)
+          }

           val lessThan = ApplyComparisonOp(Compare(eltVType), In(0, eltType), In(1, eltType)) < 0
           InferPType(lessThan)
           (a, lessThan, sorter.distinctFromSorted { (r, v1, m1, v2, m2) =>
@@ -2208,7 +2211,10 @@ class Emit[C](
           val cmp2 = ApplyComparisonOp(EQWithNA(keyType.virtualType), k0, k1).deepCopy()
           InferPType(cmp2)
           val EmitCode(s, m, pv) = emitInMethod(cmp2, discardNext)
-          discardNext.emit(Code(s, m || pv.tcode[Boolean]))
+          discardNext.emitWithBuilder { cb =>
+            cb += s
+            m || pv.asBoolean.boolCode(cb)
+          }

           val lessThan = (ApplyComparisonOp(Compare(keyType.virtualType), k0, k1) < 0).deepCopy()
           InferPType(lessThan)
           (a, lessThan, Code(sorter.pruneMissing, sorter.distinctFromSorted { (r, v1, m1, v2, m2) =>
@@ -2963,4 +2969,3 @@ abstract class NDArrayEmitter(val outputShape: IndexedSeq[Value[Long]])
     finish(cb)
   }
 }
-
diff --git a/hail/src/main/scala/is/hail/expr/ir/EmitStream.scala b/hail/src/main/scala/is/hail/expr/ir/EmitStream.scala
index b04c961a373..6425b263df6 100644
--- a/hail/src/main/scala/is/hail/expr/ir/EmitStream.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/EmitStream.scala
@@ -1954,7 +1954,7 @@ object EmitStream {
         emitIR(condIR).flatMap(cb) { cond =>
           val xCond = mb.genFieldThisRef[Boolean]("stream_if_cond")
-          cb += (xCond := cond.tcode[Boolean])
+          cb.assign(xCond, cond.asBoolean.boolCode(cb))
           var leftSS: SizedStream = null
           var rightSS: SizedStream = null
           val Lmissing = CodeLabel()
diff --git a/hail/src/main/scala/is/hail/expr/ir/IR.scala b/hail/src/main/scala/is/hail/expr/ir/IR.scala
index e0c06538520..8adf9632e48 100644
--- a/hail/src/main/scala/is/hail/expr/ir/IR.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/IR.scala
@@ -649,9 +649,8 @@ object PartitionReader {
       classOf[PartitionNativeReaderIndexed],
       classOf[PartitionZippedNativeReader],
       classOf[AbstractTypedCodecSpec],
-      classOf[TypedCodecSpec])
-    ) + BufferSpec.shortTypeHints
-    override val typeHintFieldName = "name"
+      classOf[TypedCodecSpec]),
+      typeHintFieldName = "name") + BufferSpec.shortTypeHints
   } + new TStructSerializer + new TypeSerializer +
@@ -664,9 +663,8 @@ object PartitionWriter {
     override val typeHints = ShortTypeHints(List(
       classOf[PartitionNativeWriter],
       classOf[AbstractTypedCodecSpec],
-      classOf[TypedCodecSpec])
+      classOf[TypedCodecSpec]), typeHintFieldName = "name"
     ) + BufferSpec.shortTypeHints
-    override val typeHintFieldName = "name"
   } + new TStructSerializer + new TypeSerializer +
@@ -683,9 +681,9 @@ object MetadataWriter {
       classOf[RelationalWriter],
       classOf[RVDSpecMaker],
       classOf[AbstractTypedCodecSpec],
-      classOf[TypedCodecSpec])
+      classOf[TypedCodecSpec]),
+      typeHintFieldName = "name"
     ) + BufferSpec.shortTypeHints
-    override val typeHintFieldName = "name"
   } + new TStructSerializer + new TypeSerializer +
diff --git a/hail/src/main/scala/is/hail/expr/ir/MatrixWriter.scala b/hail/src/main/scala/is/hail/expr/ir/MatrixWriter.scala
index e22acac94b8..e72c37ed32a 100644
--- a/hail/src/main/scala/is/hail/expr/ir/MatrixWriter.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/MatrixWriter.scala
@@ -28,8 +28,7 @@ object MatrixWriter {
   implicit val formats: Formats = new DefaultFormats() {
     override val typeHints = ShortTypeHints(
       List(classOf[MatrixNativeWriter], classOf[MatrixVCFWriter], classOf[MatrixGENWriter],
-        classOf[MatrixBGENWriter], classOf[MatrixPLINKWriter], classOf[WrappedMatrixWriter]))
-    override val typeHintFieldName = "name"
+        classOf[MatrixBGENWriter], classOf[MatrixPLINKWriter], classOf[WrappedMatrixWriter]), typeHintFieldName = "name")
   }
 }
@@ -339,8 +338,7 @@ case class MatrixPLINKWriter(
 object MatrixNativeMultiWriter {
   implicit val formats: Formats = new DefaultFormats() {
-    override val typeHints = ShortTypeHints(List(classOf[MatrixNativeMultiWriter]))
-    override val typeHintFieldName = "name"
+    override val typeHints = ShortTypeHints(List(classOf[MatrixNativeMultiWriter]), typeHintFieldName = "name")
   }
 }
diff --git a/hail/src/main/scala/is/hail/expr/ir/TableWriter.scala b/hail/src/main/scala/is/hail/expr/ir/TableWriter.scala
index 742443d35e9..76cd2ee8723 100644
--- a/hail/src/main/scala/is/hail/expr/ir/TableWriter.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/TableWriter.scala
@@ -21,10 +21,9 @@ import is.hail.variant.ReferenceGenome
 import org.json4s.{DefaultFormats, Formats, ShortTypeHints}

 object TableWriter {
-  implicit val formats: Formats = new DefaultFormats() {
+  implicit val formats: Formats = new DefaultFormats() {
     override val typeHints = ShortTypeHints(
-      List(classOf[TableNativeWriter], classOf[TableTextWriter]))
-    override val typeHintFieldName = "name"
+      List(classOf[TableNativeWriter], classOf[TableTextWriter]), typeHintFieldName = "name")
   }
 }
diff --git a/hail/src/main/scala/is/hail/expr/ir/functions/RelationalFunctions.scala b/hail/src/main/scala/is/hail/expr/ir/functions/RelationalFunctions.scala
index a8bd02549bb..9d36254542f 100644
--- a/hail/src/main/scala/is/hail/expr/ir/functions/RelationalFunctions.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/functions/RelationalFunctions.scala
@@ -5,6 +5,7 @@ import is.hail.types.virtual.Type
 import is.hail.types.{BlockMatrixType, MatrixType, RTable, TableType, TypeWithRequiredness}
 import is.hail.linalg.BlockMatrix
 import is.hail.methods._
+import is.hail.utils._
 import is.hail.rvd.RVDType
 import org.json4s.{Extraction, JValue, ShortTypeHints}
 import org.json4s.jackson.{JsonMethods, Serialization}
@@ -129,14 +130,16 @@ object RelationalFunctions {
       classOf[WrappedMatrixToTableFunction],
       classOf[WrappedMatrixToValueFunction],
       classOf[PCRelate]
-    ))
+    ), typeHintFieldName = "name")

   def extractTo[T: Manifest](ctx: ExecuteContext, config: String): T = {
     val jv = JsonMethods.parse(config)
     (jv \ "name").extract[String] match {
       case "VEP" => VEP.fromJValue(ctx.fs, jv).asInstanceOf[T]
-      case _ =>
+      case _ => {
+        log.info("JSON: " + jv.toString)
         jv.extract[T]
+      }
     }
   }
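All of these companion objects are moving the same override into the `ShortTypeHints` constructor (the json4s 3.7-style signature), and `extractTo` above shows the consuming side: the concrete class travels in the JSON under the field `name`, and readers dispatch on it. A hypothetical Python mirror of the convention (names and wire format illustrative only):

    import json

    REGISTRY = {}

    def register(cls):
        REGISTRY[cls.__name__] = cls
        return cls

    @register
    class TableNativeWriter:
        def __init__(self, path):
            self.path = path

    def to_json(obj):
        # the concrete class name travels under the type-hint field "name"
        return json.dumps({"name": type(obj).__name__, **vars(obj)})

    def from_json(s):
        d = json.loads(s)
        cls = REGISTRY[d.pop("name")]  # dispatch on the hint field
        return cls(**d)

    w = from_json(to_json(TableNativeWriter("gs://bucket/table.ht")))
    assert isinstance(w, TableNativeWriter)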
diff --git a/hail/src/main/scala/is/hail/expr/ir/functions/StringFunctions.scala b/hail/src/main/scala/is/hail/expr/ir/functions/StringFunctions.scala
index fcacbc5be04..e79b3e6752c 100644
--- a/hail/src/main/scala/is/hail/expr/ir/functions/StringFunctions.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/functions/StringFunctions.scala
@@ -299,8 +299,11 @@ object StringFunctions extends RegistryFunctions {
       (rType: Type, _: Seq[PType]) => PType.canonical(rType, true),
       typeParameters = Array(tv("T"))
     ) { case (er, cb, _, resultType, Array(s: PStringCode)) =>
-      val row = Code.invokeScalaObject2[String, Type, Row](JSONAnnotationImpex.getClass, "irImportAnnotation",
-        s.loadString(), er.mb.ecb.getType(resultType.virtualType.asInstanceOf[TTuple].types(0)))
+      val warnCtx = cb.emb.genFieldThisRef[mutable.HashSet[String]]("parse_json_context")
+      cb.ifx(warnCtx.load().isNull, cb.assign(warnCtx, Code.newInstance[mutable.HashSet[String]]()))
+
+      val row = Code.invokeScalaObject3[String, Type, mutable.HashSet[String], Row](JSONAnnotationImpex.getClass, "irImportAnnotation",
+        s.loadString(), er.mb.ecb.getType(resultType.virtualType.asInstanceOf[TTuple].types(0)), warnCtx)
       unwrapReturn(cb, er.region, resultType, row)
     }
diff --git a/hail/src/main/scala/is/hail/expr/ir/lowering/LowerBlockMatrixIR.scala b/hail/src/main/scala/is/hail/expr/ir/lowering/LowerBlockMatrixIR.scala
index 4c9bb06ab1a..4b7be878a58 100644
--- a/hail/src/main/scala/is/hail/expr/ir/lowering/LowerBlockMatrixIR.scala
+++ b/hail/src/main/scala/is/hail/expr/ir/lowering/LowerBlockMatrixIR.scala
@@ -311,8 +311,10 @@ object LowerBlockMatrixIR {
           MakeTuple.ordered(FastSeq(rows, cols))
         }.mapBody { (ctx, body) => NDArraySlice(body, GetField(ctx, "new")) }

-      case BlockMatrixDensify(child) => unimplemented(bmir)
-      case BlockMatrixSparsify(child, sparsifier) => unimplemented(bmir)
+      // Both densify and sparsify change the sparsity pattern tracked on the BlockMatrixType.
+      case BlockMatrixDensify(child) => lower(child)
+      case BlockMatrixSparsify(child, sparsifier) => lower(child)
+
       case RelationalLetBlockMatrix(name, value, body) => unimplemented(bmir)
       case ValueToBlockMatrix(child, shape, blockSize) if !child.typ.isInstanceOf[TArray] =>
         throw new LowererUnsupportedOperation("use explicit broadcast for scalars!")
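In the `parse_json` hunk above, the generated method caches the warn context in a class field and creates it on first use (the `isNull` check before `Code.newInstance`), so a single `HashSet` is shared by every row the compiled method touches. Stripped of the codegen machinery, the idiom is ordinary lazy initialization; a Python sketch with illustrative names:

    class ParseJson:
        def __init__(self):
            self._warn_ctx = None  # the field starts out null

        @property
        def warn_ctx(self):
            if self._warn_ctx is None:   # mirrors the isNull branch
                self._warn_ctx = set()   # ... and the newInstance
            return self._warn_ctx

    p = ParseJson()
    assert p.warn_ctx is p.warn_ctx  # one shared set per instance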
diff --git a/hail/src/main/scala/is/hail/io/BufferSpecs.scala b/hail/src/main/scala/is/hail/io/BufferSpecs.scala
index abe80042761..b98fca3aa4a 100644
--- a/hail/src/main/scala/is/hail/io/BufferSpecs.scala
+++ b/hail/src/main/scala/is/hail/io/BufferSpecs.scala
@@ -66,7 +66,7 @@ object BufferSpec {
       classOf[LEB128BufferSpec],
       classOf[BlockingBufferSpec],
      classOf[StreamBufferSpec]
-    ))
+    ), typeHintFieldName = "name")
 }

 trait BufferSpec extends Spec {
diff --git a/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala b/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala
index 657d303e583..e5cb8e3a029 100644
--- a/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala
+++ b/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala
@@ -30,9 +30,8 @@ object AbstractRVDSpec {
       classOf[compatibility.IndexSpec],
       classOf[compatibility.UnpartitionedRVDSpec],
       classOf[AbstractTypedCodecSpec],
-      classOf[TypedCodecSpec])
-    ) + BufferSpec.shortTypeHints
-    override val typeHintFieldName = "name"
+      classOf[TypedCodecSpec]),
+      typeHintFieldName = "name") + BufferSpec.shortTypeHints
   } + new TStructSerializer + new TypeSerializer +
diff --git a/hail/src/main/scala/is/hail/types/BlockMatrixType.scala b/hail/src/main/scala/is/hail/types/BlockMatrixType.scala
index ef05efc0f50..84e957b41c5 100644
--- a/hail/src/main/scala/is/hail/types/BlockMatrixType.scala
+++ b/hail/src/main/scala/is/hail/types/BlockMatrixType.scala
@@ -7,11 +7,11 @@ import is.hail.linalg.BlockMatrix
 object BlockMatrixSparsity {
   private val builder: BoxedArrayBuilder[(Int, Int)] = new BoxedArrayBuilder[(Int, Int)]

-  val dense: BlockMatrixSparsity = BlockMatrixSparsity(None)
+  val dense: BlockMatrixSparsity = new BlockMatrixSparsity(None: Option[IndexedSeq[(Int, Int)]])
   def apply(definedBlocks: IndexedSeq[(Int, Int)]): BlockMatrixSparsity = BlockMatrixSparsity(Some(definedBlocks))

-  def apply(nRows: Int, nCols: Int)(exists: (Int, Int) => Boolean): BlockMatrixSparsity = {
+  def constructFromShapeAndFunction(nRows: Int, nCols: Int)(exists: (Int, Int) => Boolean): BlockMatrixSparsity = {
     var i = 0
     builder.clear()
     while (i < nRows) {
@@ -47,7 +47,7 @@ case class BlockMatrixSparsity(definedBlocks: Option[IndexedSeq[(Int, Int)]]) {
   def condense(blockOverlaps: => (Array[Array[Int]], Array[Array[Int]])): BlockMatrixSparsity = {
     definedBlocks.map { _ =>
       val (ro, co) = blockOverlaps
-      BlockMatrixSparsity(ro.length, co.length) { (i, j) =>
+      BlockMatrixSparsity.constructFromShapeAndFunction(ro.length, co.length) { (i, j) =>
         ro(i).exists(ii => co(j).exists(jj => hasBlock(ii -> jj)))
       }
     }.getOrElse(BlockMatrixSparsity.dense)
diff --git a/monitoring/Dockerfile.test b/monitoring/Dockerfile.test
index 35290dacdd8..c5f31840903 100644
--- a/monitoring/Dockerfile.test
+++ b/monitoring/Dockerfile.test
@@ -1,4 +1,3 @@
 FROM {{ service_base_image.image }}

 COPY monitoring/test/ /test/
-RUN hail-pip-install pytest-instafail==0.4.1 pytest-asyncio==0.10.0
diff --git a/monitoring/test/test_monitoring.py b/monitoring/test/test_monitoring.py
index 81d695a6f4f..9952c0b9c9c 100644
--- a/monitoring/test/test_monitoring.py
+++ b/monitoring/test/test_monitoring.py
@@ -8,12 +8,11 @@ from hailtop.httpx import client_session
 import hailtop.utils as utils

-pytestmark = pytest.mark.asyncio
-
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(__name__)


+@pytest.mark.asyncio
 async def test_billing_monitoring():
     deploy_config = get_deploy_config()
     monitoring_deploy_config_url = deploy_config.url('monitoring', '/api/v1alpha/billing')
diff --git a/query/query/query.py b/query/query/query.py
index a6ed04646b9..00af2adb260 100644
--- a/query/query/query.py
+++ b/query/query/query.py
@@ -264,15 +264,11 @@ async def on_cleanup(app):
     )


-async def on_shutdown(app):
+async def on_shutdown(_):
     # Filter the asyncio.current_task(), because if we await
     # the current task we'll end up in a deadlock
-    remaining_tasks = [
-        t for t in asyncio.all_tasks() if t is not asyncio.current_task()
-    ]
-    log.info(
-        f"On shutdown request received, with {len(remaining_tasks)} remaining tasks"
-    )
+    remaining_tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
+    log.info(f"On shutdown request received, with {len(remaining_tasks)} remaining tasks")
     await asyncio.wait(*remaining_tasks)
     log.info("All tasks on shutdown have completed")
diff --git a/router/deployment.yaml b/router/deployment.yaml
index 59380108c13..7c9f2a1e32f 100644
--- a/router/deployment.yaml
+++ b/router/deployment.yaml
@@ -176,8 +176,7 @@ metadata:
     app: hello
 spec:
   ports:
-  - name: http
-    port: 80
+  - port: 443
     protocol: TCP
     targetPort: 5000
   selector:
diff --git a/router/router.nginx.conf.in b/router/router.nginx.conf.in
index 4eca32aec79..2ab84162c1d 100644
--- a/router/router.nginx.conf.in
+++ b/router/router.nginx.conf.in
@@ -66,7 +66,7 @@ server {
     server_name hello.*;

     location / {
-        proxy_pass http://hello/;
+        proxy_pass https://hello/;
         include /etc/nginx/proxy.conf;
     }
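One caution worth recording against the `on_shutdown` hunk: `asyncio.wait` expects an iterable of awaitables, so `asyncio.wait(*remaining_tasks)` only happens to work when exactly one task remains. A safe rendering of the same shutdown pattern (a sketch, not the service code):

    import asyncio

    async def on_shutdown(_):
        # exclude the current task so we do not await ourselves into a deadlock
        remaining = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
        if remaining:
            await asyncio.wait(remaining)  # pass the list itself, not *remaining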