generated from databricks-industry-solutions/industry-solutions-blueprints
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathextract_same_as_edges.py
83 lines (72 loc) · 2.68 KB
/
extract_same_as_edges.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # Extract `same_as` edges
# MAGIC
# MAGIC * For now, simulate entity resolution using hard coded same_as edges
# MAGIC * TODO: use zingg to create the same_as edges
# COMMAND ----------
# MAGIC %sql
# MAGIC
# MAGIC DROP TABLE IF EXISTS solacc_cga.same_as;
# MAGIC
# MAGIC CREATE TABLE IF NOT EXISTS {{tgt_db_name}}.same_as (
# MAGIC src string,
# MAGIC time_bkt timestamp,
# MAGIC sub_type string,
# MAGIC sub_id string,
# MAGIC sub_name string,
# MAGIC pred string,
# MAGIC pred_status string,
# MAGIC obj_type string,
# MAGIC obj_id string,
# MAGIC obj_name string
# MAGIC )
# MAGIC USING DELTA;
# COMMAND ----------
# DBTITLE 1,Add simple resolutions where the entities share the same email address
# MAGIC %sql
# MAGIC
# MAGIC TRUNCATE TABLE {{tgt_db_name}}.same_as;
# MAGIC
# MAGIC INSERT OVERWRITE {{tgt_db_name}}.same_as
# MAGIC (
# MAGIC WITH entity_mapping AS
# MAGIC (
# MAGIC SELECT DISTINCT okta.sub_type AS src_type, okta.sub_id AS src_id, aad.sub_type AS tgt_type, aad.sub_id AS tgt_id, aad.sub_name AS name
# MAGIC FROM {{tgt_db_name}}.okta_edges_gold_day AS okta join {{tgt_db_name}}.aad_edges_gold_day AS aad on okta.sub_name = aad.sub_name
# MAGIC )
# MAGIC SELECT 'okta' AS src, NULL AS time_bkt, src_type AS sub_type, src_id AS sub_id, name AS sub_name, 'same_as' AS pred, NULL AS pred_status, tgt_type AS obj_type, tgt_id AS obj_id, name AS obj_name
# MAGIC FROM entity_mapping
# MAGIC UNION
# MAGIC SELECT 'aad' AS src, NULL AS time_bkt, tgt_type AS sub_type, tgt_id AS sub_id, name AS sub_name, 'same_as' AS pred, NULL AS pred_status, src_type AS obj_type, src_id AS obj_id, name AS obj_name
# MAGIC FROM entity_mapping
# MAGIC );
# COMMAND ----------
# DBTITLE 1,Add a non-trivial identity resolutions
# MAGIC %sql
# MAGIC
# MAGIC
# MAGIC INSERT INTO {{tgt_db_name}}.same_as
# MAGIC (
# MAGIC SELECT DISTINCT 'aad' AS src, NULL AS time_bkt, src.sub_type, src.sub_id, src.sub_name, 'same_as' AS pred, NULL AS pred_status, tgt.sub_type, tgt.sub_id, tgt.sub_name
# MAGIC FROM
# MAGIC (
# MAGIC SELECT DISTINCT sub_type, sub_id, sub_name
# MAGIC FROM {{tgt_db_name}}.aad_edges_gold_day
# MAGIC WHERE sub_name = 'maria.cole@chang-fisher.com'
# MAGIC OR sub_name = 'maria.cook@summers.info'
# MAGIC ) AS src,
# MAGIC (
# MAGIC SELECT DISTINCT sub_type, sub_id, sub_name
# MAGIC FROM {{tgt_db_name}}.aad_edges_gold_day
# MAGIC WHERE sub_name = 'maria.cole@chang-fisher.com'
# MAGIC OR sub_name = 'maria.cook@summers.info'
# MAGIC ) AS tgt
# MAGIC WHERE src.sub_id != tgt.sub_id
# MAGIC )
# COMMAND ----------
# MAGIC %sql
# MAGIC
# MAGIC SELECT * FROM {{tgt_db_name}}.same_as
# COMMAND ----------