Merge branch 'master' into add-oracle-lib-ingestion

david-leifker authored Jul 31, 2024
2 parents 0fcbcd4 + e14dc91 commit bb6879c
Showing 10 changed files with 115 additions and 105 deletions.
179 changes: 94 additions & 85 deletions docker/datahub-ingestion-base/base-requirements.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docker/datahub-ingestion-base/build.gradle
@@ -12,7 +12,7 @@ ext {
     docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
     docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
 
-    revision = 3 // increment to trigger rebuild
+    revision = 4 // increment to trigger rebuild
 }
 
 docker {
2 changes: 1 addition & 1 deletion docker/datahub-ingestion/build.gradle
@@ -12,7 +12,7 @@ ext {
     docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
     docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
 
-    revision = 3 // increment to trigger rebuild
+    revision = 4 // increment to trigger rebuild
 }
 
 dependencies {
6 changes: 3 additions & 3 deletions docker/docker-compose-with-cassandra.yml
@@ -144,7 +144,7 @@ services:
       - neo4jdata:/data
   schema-registry:
     hostname: schema-registry
-    image: confluentinc/cp-schema-registry:7.4.0
+    image: ${DATAHUB_CONFLUENT_SCHEMA_REGISTRY_IMAGE:-confluentinc/cp-schema-registry}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081
     env_file: schema-registry/env/docker.env
@@ -159,7 +159,7 @@ services:
         condition: service_healthy
   broker:
     hostname: broker
-    image: confluentinc/cp-kafka:7.4.0
+    image: ${DATAHUB_CONFLUENT_KAFKA_IMAGE:-confluentinc/cp-kafka}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - 29092:29092
       - 9092:9092
@@ -177,7 +177,7 @@ services:
       - broker:/var/lib/kafka/data/
   zookeeper:
     hostname: zookeeper
-    image: confluentinc/cp-zookeeper:7.4.0
+    image: ${DATAHUB_CONFLUENT_ZOOKEEPER_IMAGE:-confluentinc/cp-zookeeper}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - 2181:2181
     env_file: zookeeper/env/docker.env
6 changes: 3 additions & 3 deletions docker/docker-compose-without-neo4j.yml
@@ -123,7 +123,7 @@ services:
       - esdata:/usr/share/elasticsearch/data
   schema-registry:
     hostname: schema-registry
-    image: confluentinc/cp-schema-registry:7.4.0
+    image: ${DATAHUB_CONFLUENT_SCHEMA_REGISTRY_IMAGE:-confluentinc/cp-schema-registry}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081
     env_file: schema-registry/env/docker.env
@@ -138,7 +138,7 @@ services:
         condition: service_healthy
   broker:
     hostname: broker
-    image: confluentinc/cp-kafka:7.4.0
+    image: ${DATAHUB_CONFLUENT_KAFKA_IMAGE:-confluentinc/cp-kafka}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092
     env_file: broker/env/docker.env
@@ -155,7 +155,7 @@ services:
       - broker:/var/lib/kafka/data/
   zookeeper:
     hostname: zookeeper
-    image: confluentinc/cp-zookeeper:7.4.0
+    image: ${DATAHUB_CONFLUENT_ZOOKEEPER_IMAGE:-confluentinc/cp-zookeeper}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
     env_file: zookeeper/env/docker.env
6 changes: 3 additions & 3 deletions docker/docker-compose.yml
@@ -143,7 +143,7 @@ services:
       - neo4jdata:/data
   schema-registry:
     hostname: schema-registry
-    image: confluentinc/cp-schema-registry:7.4.0
+    image: ${DATAHUB_CONFLUENT_SCHEMA_REGISTRY_IMAGE:-confluentinc/cp-schema-registry}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - ${DATAHUB_MAPPED_SCHEMA_REGISTRY_PORT:-8081}:8081
     env_file: schema-registry/env/docker.env
@@ -158,7 +158,7 @@ services:
         condition: service_healthy
   broker:
     hostname: broker
-    image: confluentinc/cp-kafka:7.4.0
+    image: ${DATAHUB_CONFLUENT_KAFKA_IMAGE:-confluentinc/cp-kafka}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092
     env_file: broker/env/docker.env
@@ -175,7 +175,7 @@ services:
       - broker:/var/lib/kafka/data/
   zookeeper:
     hostname: zookeeper
-    image: confluentinc/cp-zookeeper:7.4.0
+    image: ${DATAHUB_CONFLUENT_ZOOKEEPER_IMAGE:-confluentinc/cp-zookeeper}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     ports:
       - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
     env_file: zookeeper/env/docker.env
2 changes: 1 addition & 1 deletion docker/profiles/docker-compose.prerequisites.yml
@@ -210,7 +210,7 @@ services:
       - neo4jdata:/data
   kafka-broker:
     hostname: broker
-    image: confluentinc/cp-kafka:7.4.0
+    image: ${DATAHUB_CONFLUENT_KAFKA_IMAGE:-confluentinc/cp-kafka}:${DATAHUB_CONFLUENT_VERSION:-7.4.0}
     command:
       - /bin/bash
       - -c
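
All of the compose files above make the same change: the Confluent image names and tag are now resolved from environment variables, falling back to the previously hardcoded values. As a minimal override sketch (the pinned version and mirror registry below are hypothetical), Docker Compose's standard `.env` interpolation can supply the values:

```
# .env placed next to the compose file (values are examples only)
DATAHUB_CONFLUENT_VERSION=7.4.4
DATAHUB_CONFLUENT_KAFKA_IMAGE=mirror.example.com/confluentinc/cp-kafka

# Bring the stack up as usual; any variable left unset falls back to the
# default baked into the compose file (e.g. confluentinc/cp-kafka:7.4.0).
docker compose -f docker-compose.yml up -d
```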
8 changes: 2 additions & 6 deletions docs/deploy/aws.md
@@ -76,7 +76,7 @@ First, if you did not use eksctl to setup the kubernetes cluster, make sure to g
 Download the IAM policy document for allowing the controller to make calls to AWS APIs on your behalf.
 
 ```
-curl -o iam_policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/v2.2.0/docs/install/iam_policy.json
+curl -o iam_policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json
 ```
 
 Create an IAM policy based on the policy document by running the following.
@@ -148,13 +148,9 @@ datahub-frontend:
       alb.ingress.kubernetes.io/certificate-arn: <<certificate-arn>>
       alb.ingress.kubernetes.io/inbound-cidrs: 0.0.0.0/0
       alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}, {"HTTPS":443}]'
-      alb.ingress.kubernetes.io/actions.ssl-redirect: '{"Type": "redirect", "RedirectConfig": { "Protocol": "HTTPS", "Port": "443", "StatusCode": "HTTP_301"}}'
+      alb.ingress.kubernetes.io/ssl-redirect: '443'
     hosts:
       - host: <<host-name>>
-        redirectPaths:
-          - path: /*
-            name: ssl-redirect
-            port: use-annotation
         paths:
           - /*
 ```
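
The new `alb.ingress.kubernetes.io/ssl-redirect: '443'` annotation replaces the older `actions.ssl-redirect` pattern and its `redirectPaths` plumbing. A quick way to check that HTTP traffic now redirects to HTTPS, assuming `<<host-name>>` already resolves to the provisioned ALB:

```
curl -sI http://<<host-name>>/ | head -n 3
# Expect output along the lines of:
# HTTP/1.1 301 Moved Permanently
# Location: https://<<host-name>>:443/
```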
3 changes: 2 additions & 1 deletion docs/how/updating-datahub.md
@@ -81,7 +81,8 @@ profiling:
 - #10498 - Tableau ingestion can now be configured to ingest multiple sites at once and add the sites as containers. The feature is currently only available for Tableau Server.
 - #10466 - Extends configuration in `~/.datahubenv` to match the `DatahubClientConfig` object definition. See the full configuration at https://datahubproject.io/docs/python-sdk/clients/. The CLI should now respect the updated configurations specified in `~/.datahubenv` across its functions and utilities. This means that for systems where SSL certificate verification is disabled, setting `disable_ssl_verification: true` in `~/.datahubenv` will apply to all CLI calls.
 - #11002 - We will no longer auto-generate a `~/.datahubenv` file. You must either run `datahub init` to create that file, or set environment variables so that the config is loaded.
-
+- #11023 - Added a new parameter to DataHub's `put` CLI command: `--run-id`. This parameter is useful for associating a given write with an ingestion process. One use case is to mimic transformers when a transformer for the aspect being written does not exist.
+- #11051 - Ingestion reports will now trim the summary text to a maximum of 800k characters to avoid generating `dataHubExecutionRequestResult` aspects that are too large for GMS to handle.
 ## 0.13.3
 
 ### Breaking Changes
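
As a usage sketch for the `--run-id` flag introduced in #11023 (the URN, aspect file, and run id below are hypothetical):

```
# Write an ownership aspect and stamp the write with an explicit run id,
# so it is associated with that ingestion run, much as a transformer's
# output would be.
datahub put --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users,PROD)" \
    --aspect ownership \
    -d ownership.json \
    --run-id sync-2024-07-31
```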
Changes to the `DatahubIngestionRunSummaryProvider` module (file path not rendered in the diff):
@@ -31,6 +31,7 @@
 from datahub.utilities.logging_manager import get_log_buffer
 from datahub.utilities.urns.urn import Urn
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -43,6 +44,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
     _EXECUTOR_ID: str = "__datahub_cli_"
     _EXECUTION_REQUEST_SOURCE_TYPE: str = "CLI_INGESTION_SOURCE"
     _INGESTION_TASK_NAME: str = "CLI Ingestion"
+    _MAX_SUMMARY_SIZE: int = 800000
 
     @staticmethod
     def get_cur_time_in_ms() -> int:
@@ -209,7 +211,9 @@ def on_completion(
             status=status,
             startTimeMs=self.start_time_ms,
             durationMs=self.get_cur_time_in_ms() - self.start_time_ms,
-            report=summary,
+            # Truncate the summary so the generated MCP does not exceed GMS's payload limit:
+            # keep dataHubExecutionRequestResult under 1MB by trimming the summary to its last 800,000 chars.
+            report=summary[-self._MAX_SUMMARY_SIZE:],
             structuredReport=structured_report,
         )
 
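
A small sketch of the truncation semantics from the hunk above (values are hypothetical): a negative-start slice keeps the last `_MAX_SUMMARY_SIZE` characters, so an oversized summary loses its oldest lines while a short one passes through unchanged.

```
python -c '
MAX = 800_000                     # mirrors _MAX_SUMMARY_SIZE
long_summary = "x" * 1_000_000
print(len(long_summary[-MAX:]))   # 800000 -> trimmed to the tail of the log
print("short summary"[-MAX:])     # unchanged when under the cap
'
```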
