From d13d5d107177200a967697b3582f0ce3e341eec4 Mon Sep 17 00:00:00 2001 From: sshuster Date: Fri, 21 Feb 2020 09:36:00 -0800 Subject: [PATCH 1/6] Adding programmatic_descriptions to table search export --- .../extractor/neo4j_search_data_extractor.py | 1 + .../models/table_elasticsearch_document.py | 2 ++ databuilder/publisher/elasticsearch_publisher.py | 4 ++++ tests/unit/extractor/test_neo4j_extractor.py | 1 + .../test_file_system_elasticsearch_json_loader.py | 15 ++++++++++----- .../models/test_table_elasticsearch_document.py | 2 ++ 6 files changed, 20 insertions(+), 5 deletions(-) diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py index 19a1aa86b..63d849d8a 100644 --- a/databuilder/extractor/neo4j_search_data_extractor.py +++ b/databuilder/extractor/neo4j_search_data_extractor.py @@ -35,6 +35,7 @@ class Neo4jSearchDataExtractor(Extractor): WITH db, cluster, schema, table, table_description, tags, badges, total_usage, unique_usage, COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp) + OPTIONAL MATCH (table)-[:DESCRIPTION]->(programmatic_description:Programmatic_Description) RETURN db.name as database, cluster.name AS cluster, schema.name AS schema, table.name AS name, table.key AS key, table_description.description AS description, time_stamp.last_updated_timestamp AS last_updated_timestamp, diff --git a/databuilder/models/table_elasticsearch_document.py b/databuilder/models/table_elasticsearch_document.py index 577256cf3..8b4f33a7a 100644 --- a/databuilder/models/table_elasticsearch_document.py +++ b/databuilder/models/table_elasticsearch_document.py @@ -22,6 +22,7 @@ def __init__(self, tags, # type: List[str], badges=None, # type: Optional[List[str]] display_name=None, # type: Optional[str] + programmatic_descriptions=[] # type: List[str] ): # type: (...) -> None self.database = database @@ -40,3 +41,4 @@ def __init__(self, # todo: will include tag_type once we have better understanding from UI flow. self.tags = tags self.badges = badges + self.programmatic_descriptions = programmatic_descriptions diff --git a/databuilder/publisher/elasticsearch_publisher.py b/databuilder/publisher/elasticsearch_publisher.py index 97c8da029..6133ef96e 100644 --- a/databuilder/publisher/elasticsearch_publisher.py +++ b/databuilder/publisher/elasticsearch_publisher.py @@ -111,6 +111,10 @@ class ElasticsearchPublisher(Publisher): }, "unique_usage": { "type": "long" + }, + "programmatic_descriptions": { + "type": "text", + "analyzer": "simple" } } } diff --git a/tests/unit/extractor/test_neo4j_extractor.py b/tests/unit/extractor/test_neo4j_extractor.py index 7e14f372e..5bd9eb1cd 100644 --- a/tests/unit/extractor/test_neo4j_extractor.py +++ b/tests/unit/extractor/test_neo4j_extractor.py @@ -113,6 +113,7 @@ def test_extraction_with_model_class(self): total_usage=100, unique_usage=5, tags=['hive'], + programmatic_descriptions=['TEST'], badges=['badge1']) extractor.results = [result_dict] diff --git a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py index 7781e1c69..b34cd5814 100644 --- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py +++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py @@ -72,7 +72,8 @@ def test_loading_with_different_object(self): column_descriptions=['test_comment1', 'test_comment2'], total_usage=10, unique_usage=5, - tags=['test_tag1', 'test_tag2']) + tags=['test_tag1', 'test_tag2'], + programmatic_descriptions=['test']) with self.assertRaises(Exception) as context: loader.load(data) # type: ignore @@ -101,7 +102,8 @@ def test_loading_with_single_object(self): total_usage=10, unique_usage=5, tags=['test_tag1', 'test_tag2'], - badges=['badge1']) + badges=['badge1'], + programmatic_descriptions=['test']) loader.load(data) loader.close() @@ -111,7 +113,8 @@ def test_loading_with_single_object(self): '"column_names": ["test_col1", "test_col2"], "name": "test_table", ' '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", ' '"description": "test_description", "unique_usage": 5, "total_usage": 10, ' - '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}') + '"tags": ["test_tag1", "test_tag2"], "programmatic_descriptions": ["test"], ' + '"badges": ["badge1"]}') ] self._check_results_helper(expected=expected) @@ -138,7 +141,8 @@ def test_loading_with_list_of_objects(self): total_usage=10, unique_usage=5, tags=['test_tag1', 'test_tag2'], - badges=['badge1'])] * 5 + badges=['badge1'], + programmatic_descriptions=['test'])] * 5 for d in data: loader.load(d) @@ -150,7 +154,8 @@ def test_loading_with_list_of_objects(self): '"column_names": ["test_col1", "test_col2"], "name": "test_table", ' '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", ' '"description": "test_description", "unique_usage": 5, "total_usage": 10, ' - '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}') + '"tags": ["test_tag1", "test_tag2"], "programmatic_descriptions":["test"], + '"badges": ["badge1"]}') ] * 5 self._check_results_helper(expected=expected) diff --git a/tests/unit/models/test_table_elasticsearch_document.py b/tests/unit/models/test_table_elasticsearch_document.py index 9ddea689f..ad10f584d 100644 --- a/tests/unit/models/test_table_elasticsearch_document.py +++ b/tests/unit/models/test_table_elasticsearch_document.py @@ -23,6 +23,7 @@ def test_to_json(self): total_usage=100, unique_usage=10, tags=['test'], + programmatic_descriptions=['test'], badges=['badge1']) expected_document_dict = {"database": "test_database", @@ -38,6 +39,7 @@ def test_to_json(self): "total_usage": 100, "unique_usage": 10, "tags": ["test"], + "programmatic_descriptions": ['test'] "badges": ["badge1"] } From 2081933caac3f0f95f47f38b7eca0b609b997b0a Mon Sep 17 00:00:00 2001 From: sshuster Date: Fri, 6 Mar 2020 06:04:50 -0800 Subject: [PATCH 2/6] fixing tests from merge --- tests/unit/loader/test_file_system_elasticsearch_json_loader.py | 2 +- tests/unit/models/test_table_elasticsearch_document.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py index b34cd5814..4d63d4aa2 100644 --- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py +++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py @@ -154,7 +154,7 @@ def test_loading_with_list_of_objects(self): '"column_names": ["test_col1", "test_col2"], "name": "test_table", ' '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", ' '"description": "test_description", "unique_usage": 5, "total_usage": 10, ' - '"tags": ["test_tag1", "test_tag2"], "programmatic_descriptions":["test"], + '"tags": ["test_tag1", "test_tag2"], "programmatic_descriptions":["test"], ' '"badges": ["badge1"]}') ] * 5 diff --git a/tests/unit/models/test_table_elasticsearch_document.py b/tests/unit/models/test_table_elasticsearch_document.py index ad10f584d..11192aba4 100644 --- a/tests/unit/models/test_table_elasticsearch_document.py +++ b/tests/unit/models/test_table_elasticsearch_document.py @@ -39,7 +39,7 @@ def test_to_json(self): "total_usage": 100, "unique_usage": 10, "tags": ["test"], - "programmatic_descriptions": ['test'] + "programmatic_descriptions": ['test'], "badges": ["badge1"] } From 495f3bec210cee5b9f5c6a56c8b093123686a30a Mon Sep 17 00:00:00 2001 From: sshuster Date: Tue, 17 Mar 2020 09:19:24 -0700 Subject: [PATCH 3/6] Rebasing from upstream master --- .../extractor/neo4j_search_data_extractor.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py index 63d849d8a..5653d826f 100644 --- a/databuilder/extractor/neo4j_search_data_extractor.py +++ b/databuilder/extractor/neo4j_search_data_extractor.py @@ -23,19 +23,24 @@ class Neo4jSearchDataExtractor(Extractor): <-[:SCHEMA_OF]-(schema:Schema)<-[:TABLE_OF]-(table:Table) {publish_tag_filter} OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description) + OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description) OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default' - WITH db, cluster, schema, table, table_description, COLLECT(DISTINCT tags.key) as tags + WITH db, cluster, schema, table, table_description, + COLLECT(prog_descs.description) as programmatic_descriptions, + COLLECT(DISTINCT tags.key) as tags OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge' - WITH db, cluster, schema, table, table_description, tags, COLLECT(DISTINCT badges.key) as badges + WITH db, cluster, schema, table, table_description, programmatic_descriptions, tags, + COLLECT(DISTINCT badges.key) as badges OPTIONAL MATCH (table)-[read:READ_BY]->(user:User) - WITH db, cluster, schema, table, table_description, tags, badges, SUM(read.read_count) AS total_usage, + WITH db, cluster, schema, table, table_description, programmatic_descriptions, tags, badges, + SUM(read.read_count) AS total_usage, COUNT(DISTINCT user.email) as unique_usage OPTIONAL MATCH (table)-[:COLUMN]->(col:Column) OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description) - WITH db, cluster, schema, table, table_description, tags, badges, total_usage, unique_usage, + WITH db, cluster, schema, table, table_description, + programmatic_descriptions, tags, badges, total_usage, unique_usage, COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp) - OPTIONAL MATCH (table)-[:DESCRIPTION]->(programmatic_description:Programmatic_Description) RETURN db.name as database, cluster.name AS cluster, schema.name AS schema, table.name AS name, table.key AS key, table_description.description AS description, time_stamp.last_updated_timestamp AS last_updated_timestamp, @@ -44,7 +49,8 @@ class Neo4jSearchDataExtractor(Extractor): total_usage, unique_usage, tags, - badges + badges, + programmatic_descriptions ORDER BY table.name; """ ) From e7660f8ca5d1d362fb0aca340042e78d574088b8 Mon Sep 17 00:00:00 2001 From: sshuster Date: Fri, 15 May 2020 11:38:25 -0700 Subject: [PATCH 4/6] fixing merge --- databuilder/extractor/neo4j_search_data_extractor.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py index 3e7292e3a..30007958e 100644 --- a/databuilder/extractor/neo4j_search_data_extractor.py +++ b/databuilder/extractor/neo4j_search_data_extractor.py @@ -30,20 +30,16 @@ class Neo4jSearchDataExtractor(Extractor): COLLECT(prog_descs.description) as programmatic_descriptions, COLLECT(DISTINCT tags.key) as tags OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge' - WITH db, cluster, schema, schema_description, table, table_description, tags, COLLECT(DISTINCT badges.key) AS - badges - WITH db, cluster, schema, table, table_description, programmatic_descriptions, tags, + WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, COLLECT(DISTINCT badges.key) as badges OPTIONAL MATCH (table)-[read:READ_BY]->(user:User) - WITH db, cluster, schema, schema_description, table, table_description, tags, badges, SUM(read.read_count) AS - total_usage, - WITH db, cluster, schema, table, table_description, programmatic_descriptions, tags, badges, + WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, badges, SUM(read.read_count) AS total_usage, COUNT(DISTINCT user.email) as unique_usage OPTIONAL MATCH (table)-[:COLUMN]->(col:Column) OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description) WITH db, cluster, schema, schema_description, table, table_description, tags, badges, total_usage, unique_usage, - programmatic_descriptions, tags, badges, total_usage, unique_usage, + programmatic_descriptions, COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp) RETURN db.name as database, cluster.name AS cluster, schema.name AS schema, From 43da03482367693dafb0e8ae441c5495af834df5 Mon Sep 17 00:00:00 2001 From: sshuster Date: Mon, 1 Jun 2020 10:13:43 -0700 Subject: [PATCH 5/6] fixing the neo4j query to be more optimized --- databuilder/extractor/neo4j_search_data_extractor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py index 30007958e..4872dd068 100644 --- a/databuilder/extractor/neo4j_search_data_extractor.py +++ b/databuilder/extractor/neo4j_search_data_extractor.py @@ -25,9 +25,10 @@ class Neo4jSearchDataExtractor(Extractor): OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description) OPTIONAL MATCH (schema)-[:DESCRIPTION]->(schema_description:Description) OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description) - OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default' WITH db, cluster, schema, schema_description, table, table_description, - COLLECT(prog_descs.description) as programmatic_descriptions, + COLLECT(prog_descs.description) as programmatic_descriptions + OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default' + WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, COLLECT(DISTINCT tags.key) as tags OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge' WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, From 9edcd082831d8372493dbe4e9445868c760a33f6 Mon Sep 17 00:00:00 2001 From: sshuster Date: Wed, 3 Jun 2020 08:03:41 -0700 Subject: [PATCH 6/6] adding programmatic_descriptions to the elasticsearch_constants.py --- databuilder/publisher/elasticsearch_constants.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/databuilder/publisher/elasticsearch_constants.py b/databuilder/publisher/elasticsearch_constants.py index 1e407ef2e..c53a237b6 100644 --- a/databuilder/publisher/elasticsearch_constants.py +++ b/databuilder/publisher/elasticsearch_constants.py @@ -81,6 +81,10 @@ }, "unique_usage": { "type": "long" + }, + "programmatic_descriptions": { + "type": "text", + "analyzer": "simple" } } }