fix: Added role_name to default user query in Neo4jSearchDataExtractor (#285)
amundsen-io · Jun 9, 2020 · a20084c · a20084c
1 parent 7aab3aa
commit a20084c
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 29 deletions.
diff --git a/databuilder/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/databuilder/extractor/neo4j_search_data_extractor.py
@@ -71,7 +71,7 @@ class Neo4jSearchDataExtractor(Extractor):
         return user.email as email, user.first_name as first_name, user.last_name as last_name,
         user.full_name as full_name, user.github_username as github_username, user.team_name as team_name,
         user.employee_type as employee_type, manager.email as manager_email,
-        user.slack_id as slack_id, user.is_active as is_active,
+        user.slack_id as slack_id, user.is_active as is_active, user.role_name as role_name,
         REDUCE(sum_r = 0, r in COLLECT(DISTINCT read)| sum_r + r.read_count) AS total_read,
         count(distinct b) as total_own,
         count(distinct c) AS total_follow

diff --git a/databuilder/example/scripts/sample_data_loader.py b/databuilder/example/scripts/sample_data_loader.py
@@ -21,7 +21,6 @@
 import os
 import sqlite3
 import sys
-import textwrap
 import uuid
 from elasticsearch import Elasticsearch
 from pyhocon import ConfigFactory
@@ -166,16 +165,16 @@ def create_last_updated_job():
 def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                    elasticsearch_doc_type_key='table',
                                    model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
-                                   cypher_query=None,
+                                   entity_type='table',
                                    elasticsearch_mapping=None):
     """
     :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                        amundsensearchlibrary/search_service/config.py as an index
     :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
                                        `table_search_index`
     :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
-    :param cypher_query:               Query handed to the `Neo4jSearchDataExtractor` class, if None is given (default)
-                                       it uses the `Table` query baked into the Extractor
+    :param entity_type:                Entity type handed to the `Neo4jSearchDataExtractor` class, used to determine
+                                       Cypher query to extract data from Neo4j. Defaults to `table`.
     :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                        if None is given (default) it uses the `Table` query baked into the Publisher
     """
@@ -192,6 +191,7 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index
     elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
 
     job_config = ConfigFactory.from_dict({
+        'extractor.search_data.entity_type': entity_type,
         'extractor.search_data.extractor.neo4j.graph_url': neo4j_endpoint,
         'extractor.search_data.extractor.neo4j.model_class': model_name,
         'extractor.search_data.extractor.neo4j.neo4j_auth_user': neo4j_user,
@@ -208,9 +208,6 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index
     })
 
     # only optionally add these keys, so need to dynamically `put` them
-    if cypher_query:
-        job_config.put('extractor.search_data.{}'.format(Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY),
-                       cypher_query)
     if elasticsearch_mapping:
         job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
                        elasticsearch_mapping)
@@ -255,29 +252,10 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index
         job_es_table = create_es_publisher_sample_job(
             elasticsearch_index_alias='table_search_index',
             elasticsearch_doc_type_key='table',
+            entity_type='table',
             model_name='databuilder.models.table_elasticsearch_document.TableESDocument')
         job_es_table.launch()
 
-        user_cypher_query = textwrap.dedent(
-            """
-            MATCH (user:User)
-            OPTIONAL MATCH (user)-[read:READ]->(a)
-            OPTIONAL MATCH (user)-[own:OWNER_OF]->(b)
-            OPTIONAL MATCH (user)-[follow:FOLLOWED_BY]->(c)
-            OPTIONAL MATCH (user)-[manage_by:MANAGE_BY]->(manager)
-            with user, a, b, c, read, own, follow, manager
-            where user.full_name is not null
-            return user.email as email, user.first_name as first_name, user.last_name as last_name,
-            user.full_name as full_name, user.github_username as github_username, user.team_name as team_name,
-            user.employee_type as employee_type, manager.email as manager_email, user.slack_id as slack_id,
-            user.role_name as role_name, user.is_active as is_active,
-            REDUCE(sum_r = 0, r in COLLECT(DISTINCT read)| sum_r + r.read_count) AS total_read,
-            count(distinct b) as total_own,
-            count(distinct c) AS total_follow
-            order by user.email
-            """
-        )
-
         user_elasticsearch_mapping = """
                 {
                   "mappings":{
@@ -338,6 +316,6 @@ def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index
             elasticsearch_index_alias='user_search_index',
             elasticsearch_doc_type_key='user',
             model_name='databuilder.models.user_elasticsearch_document.UserESDocument',
-            cypher_query=user_cypher_query,
+            entity_type='user',
             elasticsearch_mapping=user_elasticsearch_mapping)
         job_es_user.launch()