Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

issue-297/Adding programmatic_descriptions to table search export #198

Merged
18 changes: 12 additions & 6 deletions databuilder/extractor/neo4j_search_data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,23 @@ class Neo4jSearchDataExtractor(Extractor):
{publish_tag_filter}
OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description)
OPTIONAL MATCH (schema)-[:DESCRIPTION]->(schema_description:Description)
OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description)
samshuster marked this conversation as resolved.
Show resolved Hide resolved
WITH db, cluster, schema, schema_description, table, table_description,
COLLECT(prog_descs.description) as programmatic_descriptions
OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
WITH db, cluster, schema, schema_description, table, table_description, COLLECT(DISTINCT tags.key) as tags
WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions,
COLLECT(DISTINCT tags.key) as tags
OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
WITH db, cluster, schema, schema_description, table, table_description, tags, COLLECT(DISTINCT badges.key) AS
badges
WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags,
COLLECT(DISTINCT badges.key) as badges
OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
WITH db, cluster, schema, schema_description, table, table_description, tags, badges, SUM(read.read_count) AS
total_usage,
WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, badges,
SUM(read.read_count) AS total_usage,
COUNT(DISTINCT user.email) as unique_usage
OPTIONAL MATCH (table)-[:COLUMN]->(col:Column)
OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description)
WITH db, cluster, schema, schema_description, table, table_description, tags, badges, total_usage, unique_usage,
programmatic_descriptions,
COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions
OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
Expand All @@ -47,7 +52,8 @@ class Neo4jSearchDataExtractor(Extractor):
total_usage,
unique_usage,
tags,
badges
badges,
programmatic_descriptions
ORDER BY table.name;
"""
)
Expand Down
2 changes: 2 additions & 0 deletions databuilder/models/table_elasticsearch_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(self,
badges=None, # type: Optional[List[str]]
display_name=None, # type: Optional[str]
schema_description=None, # type: Optional[str]
programmatic_descriptions=[], # type: List[str]
):
# type: (...) -> None
self.database = database
Expand All @@ -42,3 +43,4 @@ def __init__(self,
self.tags = tags
self.badges = badges
self.schema_description = schema_description
self.programmatic_descriptions = programmatic_descriptions
4 changes: 4 additions & 0 deletions databuilder/publisher/elasticsearch_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
},
"unique_usage": {
"type": "long"
},
"programmatic_descriptions": {
"type": "text",
"analyzer": "simple"
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/extractor/test_neo4j_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ def test_extraction_with_model_class(self):
unique_usage=5,
tags=['hive'],
badges=['badge1'],
schema_description='schema_description')
schema_description='schema_description',
programmatic_descriptions=['TEST'])

extractor.results = [result_dict]
result_obj = extractor.extract()
Expand Down
17 changes: 12 additions & 5 deletions tests/unit/loader/test_file_system_elasticsearch_json_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ def test_loading_with_different_object(self):
column_descriptions=['test_comment1', 'test_comment2'],
total_usage=10,
unique_usage=5,
tags=['test_tag1', 'test_tag2'])
tags=['test_tag1', 'test_tag2'],
programmatic_descriptions=['test'])

with self.assertRaises(Exception) as context:
loader.load(data) # type: ignore
Expand Down Expand Up @@ -102,7 +103,8 @@ def test_loading_with_single_object(self):
unique_usage=5,
tags=['test_tag1', 'test_tag2'],
badges=['badge1'],
schema_description='schema description')
schema_description='schema description',
programmatic_descriptions=['test'])
loader.load(data)
loader.close()

Expand All @@ -112,7 +114,9 @@ def test_loading_with_single_object(self):
'"column_names": ["test_col1", "test_col2"], "name": "test_table", '
'"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
'"description": "test_description", "unique_usage": 5, "total_usage": 10, '
'"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema description"}')
'"tags": ["test_tag1", "test_tag2"], "schema_description": "schema description", '
'"programmatic_descriptions": ["test"], '
'"badges": ["badge1"]}')
]

self._check_results_helper(expected=expected)
Expand Down Expand Up @@ -140,7 +144,8 @@ def test_loading_with_list_of_objects(self):
unique_usage=5,
tags=['test_tag1', 'test_tag2'],
badges=['badge1'],
schema_description='schema_description')] * 5
schema_description='schema_description',
programmatic_descriptions=['test'])] * 5

for d in data:
loader.load(d)
Expand All @@ -152,7 +157,9 @@ def test_loading_with_list_of_objects(self):
'"column_names": ["test_col1", "test_col2"], "name": "test_table", '
'"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
'"description": "test_description", "unique_usage": 5, "total_usage": 10, '
'"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema_description"}')
'"tags": ["test_tag1", "test_tag2"], "schema_description": "schema_description", '
'"programmatic_descriptions":["test"], '
'"badges": ["badge1"]}')
] * 5

self._check_results_helper(expected=expected)
2 changes: 2 additions & 0 deletions tests/unit/models/test_table_elasticsearch_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def test_to_json(self):
total_usage=100,
unique_usage=10,
tags=['test'],
programmatic_descriptions=['test'],
badges=['badge1'],
schema_description='schema description')

Expand All @@ -39,6 +40,7 @@ def test_to_json(self):
"total_usage": 100,
"unique_usage": 10,
"tags": ["test"],
"programmatic_descriptions": ['test'],
"badges": ["badge1"],
'schema_description': 'schema description'
}
Expand Down