From ddaac21b01b206436ed0fe562d73d04a29d41747 Mon Sep 17 00:00:00 2001 From: Allison Suarez Miranda <22477579+allisonsuarez@users.noreply.github.com> Date: Mon, 19 Oct 2020 18:00:27 -0700 Subject: [PATCH] Revert "refactor: Update ES index maps to use same maps of amundsen-common (#385)" This reverts commit 20c2fd2f57076ba40ed0552802be0acda8b362d1. Signed-off-by: Allison Suarez Miranda --- .../publisher/elasticsearch_constants.py | 247 ++++++++++++++++++ .../publisher/elasticsearch_publisher.py | 3 +- docs/dashboard_ingestion_guide.md | 2 +- example/scripts/sample_data_loader.py | 4 +- example/scripts/sample_tableau_data_loader.py | 2 +- requirements.txt | 4 - setup.py | 1 + 7 files changed, 253 insertions(+), 10 deletions(-) create mode 100644 databuilder/publisher/elasticsearch_constants.py diff --git a/databuilder/publisher/elasticsearch_constants.py b/databuilder/publisher/elasticsearch_constants.py new file mode 100644 index 000000000..f19f69387 --- /dev/null +++ b/databuilder/publisher/elasticsearch_constants.py @@ -0,0 +1,247 @@ +# Copyright Contributors to the Amundsen project. +# SPDX-License-Identifier: Apache-2.0 + +import textwrap + +# Documentation: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html +# Setting type to "text" for all fields that would be used in search +# Using Simple Analyzer to convert all text into search terms +# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simple-analyzer.html +# Standard Analyzer is used for all text fields that don't explicitly specify an analyzer +# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html +# TODO use amundsencommon for this when this project is updated to py3 +TABLE_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent( + """ + { + "mappings":{ + "table":{ + "properties": { + "name": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "schema": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "display_name": { + "type": "keyword" + }, + "last_updated_timestamp": { + "type": "date", + "format": "epoch_second" + }, + "description": { + "type": "text", + "analyzer": "simple" + }, + "column_names": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "column_descriptions": { + "type": "text", + "analyzer": "simple" + }, + "tags": { + "type": "keyword" + }, + "badges": { + "type": "keyword" + }, + "cluster": { + "type": "text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "database": { + "type": "text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "key": { + "type": "keyword" + }, + "total_usage":{ + "type": "long" + }, + "unique_usage": { + "type": "long" + }, + "programmatic_descriptions": { + "type": "text", + "analyzer": "simple" + } + } + } + } + } + """ +) + +DASHBOARD_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent( + """ + { + "settings": { + "analysis": { + "normalizer": { + "lowercase_normalizer": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase", "asciifolding"] + } + } + } + }, + "mappings":{ + "dashboard":{ + "properties": { + "group_name": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + } + } + }, + "name": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + } + } + }, + "description": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "group_description": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "query_names": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "chart_names": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "tags": { + "type": "keyword" + }, + "badges": { + "type": "keyword" + } + } + } + } + } + """ +) + +USER_ELASTICSEARCH_INDEX_MAPPING = textwrap.dedent( + """ + { + "mappings":{ + "user":{ + "properties": { + "email": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "first_name": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "last_name": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "full_name": { + "type":"text", + "analyzer": "simple", + "fields": { + "raw": { + "type": "keyword" + } + } + }, + "total_read":{ + "type": "long" + }, + "total_own": { + "type": "long" + }, + "total_follow": { + "type": "long" + } + } + } + } + } + """ +) diff --git a/databuilder/publisher/elasticsearch_publisher.py b/databuilder/publisher/elasticsearch_publisher.py index 4583df187..a89650515 100644 --- a/databuilder/publisher/elasticsearch_publisher.py +++ b/databuilder/publisher/elasticsearch_publisher.py @@ -8,9 +8,8 @@ from pyhocon import ConfigTree from typing import List -from amundsen_common.models.index_map import TABLE_INDEX_MAP as TABLE_ELASTICSEARCH_INDEX_MAPPING - from databuilder.publisher.base_publisher import Publisher +from databuilder.publisher.elasticsearch_constants import TABLE_ELASTICSEARCH_INDEX_MAPPING LOGGER = logging.getLogger(__name__) diff --git a/docs/dashboard_ingestion_guide.md b/docs/dashboard_ingestion_guide.md index ee7278be5..64e57c779 100644 --- a/docs/dashboard_ingestion_guide.md +++ b/docs/dashboard_ingestion_guide.md @@ -111,7 +111,7 @@ job = DefaultJob(conf=job_config, job.launch() ``` -*Note that `DASHBOARD_ELASTICSEARCH_INDEX_MAPPING` is defined [here](https://github.com/amundsen-io/amundsencommon/blob/master/amundsen_common/models/index_map.py). +*Note that `DASHBOARD_ELASTICSEARCH_INDEX_MAPPING` is defined [here](../databuilder/publisher/elasticsearch_constants.py). ### 4. Remove stale data diff --git a/example/scripts/sample_data_loader.py b/example/scripts/sample_data_loader.py index 107fad9f4..5be9de1f0 100644 --- a/example/scripts/sample_data_loader.py +++ b/example/scripts/sample_data_loader.py @@ -29,8 +29,6 @@ from elasticsearch import Elasticsearch from pyhocon import ConfigFactory from sqlalchemy.ext.declarative import declarative_base -from amundsen_common.models.index_map import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING -from amundsen_common.models.index_map import USER_INDEX_MAP as USER_ELASTICSEARCH_INDEX_MAPPING from databuilder.extractor.csv_extractor import CsvTableColumnExtractor, CsvExtractor from databuilder.extractor.neo4j_es_last_updated_extractor import Neo4jEsLastUpdatedExtractor @@ -38,6 +36,8 @@ from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader +from databuilder.publisher.elasticsearch_constants import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING, \ + USER_ELASTICSEARCH_INDEX_MAPPING from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask diff --git a/example/scripts/sample_tableau_data_loader.py b/example/scripts/sample_tableau_data_loader.py index 096a246fa..07a093eab 100644 --- a/example/scripts/sample_tableau_data_loader.py +++ b/example/scripts/sample_tableau_data_loader.py @@ -23,12 +23,12 @@ from elasticsearch import Elasticsearch from pyhocon import ConfigFactory from sqlalchemy.ext.declarative import declarative_base -from amundsen_common.models.index_map import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor from databuilder.job.job import DefaultJob from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader +from databuilder.publisher.elasticsearch_constants import DASHBOARD_ELASTICSEARCH_INDEX_MAPPING from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher from databuilder.task.task import DefaultTask diff --git a/requirements.txt b/requirements.txt index fa893b1f2..671a43b84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,10 +38,6 @@ typing==3.6.4 # Upstream url: https://pypi.org/project/elasticsearch/ elasticsearch>=6.2.0,<7.0 -# A common package that holds the models deifnition and schemas that are used -# accross different amundsen repositories. -amundsen-common>=0.5.6,<1.0 - atomicwrites==1.1.5 more-itertools==4.2.0 pluggy>=0.6.0 diff --git a/setup.py b/setup.py index 846c5295d..3285fa368 100644 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ install_requires=requirements, python_requires='>=3.6,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*', extras_require={ + ':python_version=="2.7"': ['typing>=3.6'], # allow typehinting PY2 'all': all_deps, 'kafka': kafka, # To use with Kafka source extractor 'cassandra': cassandra,