From d13d5d107177200a967697b3582f0ce3e341eec4 Mon Sep 17 00:00:00 2001
From: sshuster <sshuster@edmunds.com>
Date: Fri, 21 Feb 2020 09:36:00 -0800
Subject: [PATCH 1/6] Adding programmatic_descriptions to table search export

---
 .../extractor/neo4j_search_data_extractor.py      |  1 +
 .../models/table_elasticsearch_document.py        |  2 ++
 databuilder/publisher/elasticsearch_publisher.py  |  4 ++++
 tests/unit/extractor/test_neo4j_extractor.py      |  1 +
 .../test_file_system_elasticsearch_json_loader.py | 15 ++++++++++-----
 .../models/test_table_elasticsearch_document.py   |  2 ++
 6 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py
index 19a1aa86b..63d849d8a 100644
--- a/databuilder/extractor/neo4j_search_data_extractor.py
+++ b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -35,6 +35,7 @@ class Neo4jSearchDataExtractor(Extractor):
         WITH db, cluster, schema, table, table_description, tags, badges, total_usage, unique_usage,
         COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions
         OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
+        OPTIONAL MATCH (table)-[:DESCRIPTION]->(programmatic_description:Programmatic_Description)
         RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
         table.name AS name, table.key AS key, table_description.description AS description,
         time_stamp.last_updated_timestamp AS last_updated_timestamp,
diff --git a/databuilder/models/table_elasticsearch_document.py b/databuilder/models/table_elasticsearch_document.py
index 577256cf3..8b4f33a7a 100644
--- a/databuilder/models/table_elasticsearch_document.py
+++ b/databuilder/models/table_elasticsearch_document.py
@@ -22,6 +22,7 @@ def __init__(self,
                  tags,  # type: List[str],
                  badges=None,  # type: Optional[List[str]]
                  display_name=None,  # type: Optional[str]
+                 programmatic_descriptions=None  # type: Optional[List[str]]
                  ):
         # type: (...) -> None
         self.database = database
@@ -40,3 +41,4 @@ def __init__(self,
         # todo: will include tag_type once we have better understanding from UI flow.
         self.tags = tags
         self.badges = badges
+        self.programmatic_descriptions = programmatic_descriptions or []
diff --git a/databuilder/publisher/elasticsearch_publisher.py b/databuilder/publisher/elasticsearch_publisher.py
index 97c8da029..6133ef96e 100644
--- a/databuilder/publisher/elasticsearch_publisher.py
+++ b/databuilder/publisher/elasticsearch_publisher.py
@@ -111,6 +111,10 @@ class ElasticsearchPublisher(Publisher):
                 },
                 "unique_usage": {
                   "type": "long"
+                },
+                "programmatic_descriptions": {
+                   "type": "text",
+                   "analyzer": "simple"
                 }
               }
             }
diff --git a/tests/unit/extractor/test_neo4j_extractor.py b/tests/unit/extractor/test_neo4j_extractor.py
index 7e14f372e..5bd9eb1cd 100644
--- a/tests/unit/extractor/test_neo4j_extractor.py
+++ b/tests/unit/extractor/test_neo4j_extractor.py
@@ -113,6 +113,7 @@ def test_extraction_with_model_class(self):
                                total_usage=100,
                                unique_usage=5,
                                tags=['hive'],
+                               programmatic_descriptions=['TEST'],
                                badges=['badge1'])
 
             extractor.results = [result_dict]
diff --git a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
index 7781e1c69..b34cd5814 100644
--- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
+++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
@@ -72,7 +72,8 @@ def test_loading_with_different_object(self):
                     column_descriptions=['test_comment1', 'test_comment2'],
                     total_usage=10,
                     unique_usage=5,
-                    tags=['test_tag1', 'test_tag2'])
+                    tags=['test_tag1', 'test_tag2'],
+                    programmatic_descriptions=['test'])
 
         with self.assertRaises(Exception) as context:
             loader.load(data)  # type: ignore
@@ -101,7 +102,8 @@ def test_loading_with_single_object(self):
                                total_usage=10,
                                unique_usage=5,
                                tags=['test_tag1', 'test_tag2'],
-                               badges=['badge1'])
+                               badges=['badge1'],
+                               programmatic_descriptions=['test'])
         loader.load(data)
         loader.close()
 
@@ -111,7 +113,8 @@ def test_loading_with_single_object(self):
              '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
              '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
              '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
+             '"tags": ["test_tag1", "test_tag2"], "programmatic_descriptions": ["test"], '
+             '"badges": ["badge1"]}')
         ]
 
         self._check_results_helper(expected=expected)
@@ -138,7 +141,8 @@ def test_loading_with_list_of_objects(self):
                                 total_usage=10,
                                 unique_usage=5,
                                 tags=['test_tag1', 'test_tag2'],
-                                badges=['badge1'])] * 5
+                                badges=['badge1'],
+                                programmatic_descriptions=['test'])] * 5
 
         for d in data:
             loader.load(d)
@@ -150,7 +154,8 @@ def test_loading_with_list_of_objects(self):
              '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
              '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
              '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
+             '"tags": ["test_tag1", "test_tag2"], "programmatic_descriptions":["test"],
+             '"badges": ["badge1"]}')
         ] * 5
 
         self._check_results_helper(expected=expected)
diff --git a/tests/unit/models/test_table_elasticsearch_document.py b/tests/unit/models/test_table_elasticsearch_document.py
index 9ddea689f..ad10f584d 100644
--- a/tests/unit/models/test_table_elasticsearch_document.py
+++ b/tests/unit/models/test_table_elasticsearch_document.py
@@ -23,6 +23,7 @@ def test_to_json(self):
                                    total_usage=100,
                                    unique_usage=10,
                                    tags=['test'],
+                                   programmatic_descriptions=['test'],
                                    badges=['badge1'])
 
         expected_document_dict = {"database": "test_database",
@@ -38,6 +39,7 @@ def test_to_json(self):
                                   "total_usage": 100,
                                   "unique_usage": 10,
                                   "tags": ["test"],
+                                  "programmatic_descriptions": ['test']
                                   "badges": ["badge1"]
                                   }
 

From 2081933caac3f0f95f47f38b7eca0b609b997b0a Mon Sep 17 00:00:00 2001
From: sshuster <sshuster@edmunds.com>
Date: Fri, 6 Mar 2020 06:04:50 -0800
Subject: [PATCH 2/6] Fix syntax errors in tests after merge

---
 tests/unit/loader/test_file_system_elasticsearch_json_loader.py | 2 +-
 tests/unit/models/test_table_elasticsearch_document.py          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
index b34cd5814..4d63d4aa2 100644
--- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
+++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
@@ -154,7 +154,7 @@ def test_loading_with_list_of_objects(self):
              '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
              '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
              '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "programmatic_descriptions":["test"],
+             '"tags": ["test_tag1", "test_tag2"], "programmatic_descriptions":["test"], '
              '"badges": ["badge1"]}')
         ] * 5
 
diff --git a/tests/unit/models/test_table_elasticsearch_document.py b/tests/unit/models/test_table_elasticsearch_document.py
index ad10f584d..11192aba4 100644
--- a/tests/unit/models/test_table_elasticsearch_document.py
+++ b/tests/unit/models/test_table_elasticsearch_document.py
@@ -39,7 +39,7 @@ def test_to_json(self):
                                   "total_usage": 100,
                                   "unique_usage": 10,
                                   "tags": ["test"],
-                                  "programmatic_descriptions": ['test']
+                                  "programmatic_descriptions": ['test'],
                                   "badges": ["badge1"]
                                   }
 

From 495f3bec210cee5b9f5c6a56c8b093123686a30a Mon Sep 17 00:00:00 2001
From: sshuster <sshuster@edmunds.com>
Date: Tue, 17 Mar 2020 09:19:24 -0700
Subject: [PATCH 3/6] Rebasing from upstream master

---
 .../extractor/neo4j_search_data_extractor.py   | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py
index 63d849d8a..5653d826f 100644
--- a/databuilder/extractor/neo4j_search_data_extractor.py
+++ b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -23,19 +23,24 @@ class Neo4jSearchDataExtractor(Extractor):
         <-[:SCHEMA_OF]-(schema:Schema)<-[:TABLE_OF]-(table:Table)
         {publish_tag_filter}
         OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description)
+        OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description)
         OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
-        WITH db, cluster, schema, table, table_description, COLLECT(DISTINCT tags.key) as tags
+        WITH db, cluster, schema, table, table_description,
+        COLLECT(prog_descs.description) as programmatic_descriptions,
+        COLLECT(DISTINCT tags.key) as tags
         OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
-        WITH db, cluster, schema, table, table_description, tags, COLLECT(DISTINCT badges.key) as badges
+        WITH db, cluster, schema, table, table_description, programmatic_descriptions, tags,
+        COLLECT(DISTINCT badges.key) as badges
         OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
-        WITH db, cluster, schema, table, table_description, tags, badges, SUM(read.read_count) AS total_usage,
+        WITH db, cluster, schema, table, table_description, programmatic_descriptions, tags, badges,
+        SUM(read.read_count) AS total_usage,
         COUNT(DISTINCT user.email) as unique_usage
         OPTIONAL MATCH (table)-[:COLUMN]->(col:Column)
         OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description)
-        WITH db, cluster, schema, table, table_description, tags, badges, total_usage, unique_usage,
+        WITH db, cluster, schema, table, table_description,
+        programmatic_descriptions, tags, badges, total_usage, unique_usage,
         COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions
         OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
-        OPTIONAL MATCH (table)-[:DESCRIPTION]->(programmatic_description:Programmatic_Description)
         RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
         table.name AS name, table.key AS key, table_description.description AS description,
         time_stamp.last_updated_timestamp AS last_updated_timestamp,
@@ -44,7 +49,8 @@ class Neo4jSearchDataExtractor(Extractor):
         total_usage,
         unique_usage,
         tags,
-        badges
+        badges,
+        programmatic_descriptions
         ORDER BY table.name;
         """
     )

From e7660f8ca5d1d362fb0aca340042e78d574088b8 Mon Sep 17 00:00:00 2001
From: sshuster <sshuster@edmunds.com>
Date: Fri, 15 May 2020 11:38:25 -0700
Subject: [PATCH 4/6] Remove duplicate WITH clauses left by merge

---
 databuilder/extractor/neo4j_search_data_extractor.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py
index 3e7292e3a..30007958e 100644
--- a/databuilder/extractor/neo4j_search_data_extractor.py
+++ b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -30,20 +30,16 @@ class Neo4jSearchDataExtractor(Extractor):
         COLLECT(prog_descs.description) as programmatic_descriptions,
         COLLECT(DISTINCT tags.key) as tags
         OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
-        WITH db, cluster, schema, schema_description, table, table_description, tags, COLLECT(DISTINCT badges.key) AS
-        badges
-        WITH db, cluster, schema, table, table_description, programmatic_descriptions, tags,
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags,
         COLLECT(DISTINCT badges.key) as badges
         OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
-        WITH db, cluster, schema, schema_description, table, table_description, tags, badges, SUM(read.read_count) AS
-        total_usage,
-        WITH db, cluster, schema, table, table_description, programmatic_descriptions, tags, badges,
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, badges,
         SUM(read.read_count) AS total_usage,
         COUNT(DISTINCT user.email) as unique_usage
         OPTIONAL MATCH (table)-[:COLUMN]->(col:Column)
         OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description)
         WITH db, cluster, schema, schema_description, table, table_description, tags, badges, total_usage, unique_usage,
-        programmatic_descriptions, tags, badges, total_usage, unique_usage,
+        programmatic_descriptions,
         COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions
         OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
         RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,

From 43da03482367693dafb0e8ae441c5495af834df5 Mon Sep 17 00:00:00 2001
From: sshuster <sshuster@edmunds.com>
Date: Mon, 1 Jun 2020 10:13:43 -0700
Subject: [PATCH 5/6] Collect programmatic descriptions before tags in neo4j query

---
 databuilder/extractor/neo4j_search_data_extractor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py
index 30007958e..4872dd068 100644
--- a/databuilder/extractor/neo4j_search_data_extractor.py
+++ b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -25,9 +25,10 @@ class Neo4jSearchDataExtractor(Extractor):
         OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description)
         OPTIONAL MATCH (schema)-[:DESCRIPTION]->(schema_description:Description)
         OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description)
-        OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
         WITH db, cluster, schema, schema_description, table, table_description,
-        COLLECT(prog_descs.description) as programmatic_descriptions,
+        COLLECT(prog_descs.description) as programmatic_descriptions
+        OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions,
         COLLECT(DISTINCT tags.key) as tags
         OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
         WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags,

From 9edcd082831d8372493dbe4e9445868c760a33f6 Mon Sep 17 00:00:00 2001
From: sshuster <sshuster@edmunds.com>
Date: Wed, 3 Jun 2020 08:03:41 -0700
Subject: [PATCH 6/6] adding programmatic_descriptions to the
 elasticsearch_constants.py

---
 databuilder/publisher/elasticsearch_constants.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/databuilder/publisher/elasticsearch_constants.py b/databuilder/publisher/elasticsearch_constants.py
index 1e407ef2e..c53a237b6 100644
--- a/databuilder/publisher/elasticsearch_constants.py
+++ b/databuilder/publisher/elasticsearch_constants.py
@@ -81,6 +81,10 @@
             },
             "unique_usage": {
               "type": "long"
+            },
+            "programmatic_descriptions": {
+              "type": "text",
+              "analyzer": "simple"
             }
           }
         }