-
Notifications
You must be signed in to change notification settings - Fork 1
/
joco_entity_demos.sql.tmpl
150 lines (145 loc) · 5.16 KB
/
joco_entity_demos.sql.tmpl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
-- ASSUMPTIONS:
-- 1. only one matchdatetime for all experiments included
-- JoCO-specific entity_demos table creation
DROP TABLE IF EXISTS {{schema}}.entity_demos;
CREATE TABLE {{schema}}.entity_demos
AS
WITH ref_models_pre AS (
-- choose a random model for each train_end_time to pull out the cohort via
-- the predictions table
SELECT DISTINCT ON (train_end_time)
model_id, train_end_time
FROM tmp_bias_models
ORDER BY train_end_time, RANDOM()
)
, ref_models AS (
SELECT rmp.model_id, rmp.train_end_time,
((mg.model_config->'matchdatetime')::VARCHAR)::TIMESTAMP AS matchdatetime
FROM ref_models_pre rmp
JOIN model_metadata.models m USING(model_id)
JOIN model_metadata.model_groups mg USING(model_group_id)
)
, entity_lkup AS (
SELECT DISTINCT
p.entity_id AS old_entity_id,
l1.matchdatetime AS old_matchdatetime,
l2.entity_id AS curr_entity_id,
p.as_of_date
FROM test_results.predictions p
JOIN ref_models rm USING(model_id)
JOIN entities.entity_source_match_lookup l1 ON p.entity_id = l1.entity_id AND l1.matchdatetime = rm.matchdatetime
JOIN entities.entity_source_match_lookup l2 ON l1.source = l2.source AND l1.sourceid = l2.sourceid
WHERE l2.matchdatetime = (SELECT MAX(matchdatetime) FROM entities.entity_source_match_lookup)
)
, hispanic_flags as (
-- unlike other features, take max() for hispanic flag since data tends to be more potentially
-- prone to false negatives depending on how it is collected
SELECT p.old_entity_id, p.as_of_date,
MAX(CASE WHEN TRIM(ethnic_101) = 'Y' THEN 1 ELSE 0 END) AS hispanic_flag
FROM entity_lkup p
JOIN records.jims_booking_records b ON p.curr_entity_id = b.entity_id
JOIN raw.jocojimsinmatedata r
ON b.booking_no = encode(r.hash_booking_no_0, 'hex')
-- hispanic ethnicity data only seems available starting in 2007
WHERE b.knowledge_date >= '2007-01-01'::DATE
AND b.knowledge_date <= p.as_of_date
GROUP BY 1,2
)
, event_demo_info as (
select p.old_entity_id, p.as_of_date,
e.entity_info ->> 'race' AS race,
e.entity_info ->> 'sex' as sex,
extract(year FROM age(p.as_of_date, (e.entity_info ->> 'date_of_birth')::date)) AS age,
e.knowledge_date
from semantic.events e
join entity_lkup p
on e.entity_id = p.curr_entity_id and knowledge_date <= p.as_of_date
)
, event_demo_aggs as (
select old_entity_id,
as_of_date,
race,
sex,
case
when age < 18 then 'a under 18'
when age between 18 and 30 then 'b 18 to 30'
when age between 31 and 45 then 'c 31 to 45'
when age between 46 and 59 then 'd 46 to 59'
when age >= 60 then 'e 60 plus'
else NULL
end AS age_bucket,
COUNT(*) AS count_by_demo,
MAX(knowledge_date) AS max_knowledge_date
from event_demo_info
group by grouping sets ((old_entity_id, as_of_date, race), (old_entity_id, as_of_date, sex), (old_entity_id, as_of_date, age_bucket))
)
, race_count as (select distinct on (old_entity_id, as_of_date)
old_entity_id,
as_of_date,
race,
count_by_demo,
max_knowledge_date
FROM event_demo_aggs
WHERE race IS NOT NULL
order by
old_entity_id,
as_of_date,
count_by_demo desc,
max_knowledge_date desc,
Random()
)
, age_count as (select distinct on (old_entity_id, as_of_date)
old_entity_id,
as_of_date,
age_bucket,
count_by_demo,
max_knowledge_date
FROM event_demo_aggs
WHERE age_bucket IS NOT NULL
order by
old_entity_id,
as_of_date,
count_by_demo desc,
max_knowledge_date desc,
Random()
)
, sex_count as (select distinct on (old_entity_id, as_of_date)
old_entity_id,
as_of_date,
sex,
count_by_demo,
max_knowledge_date
FROM event_demo_aggs
WHERE sex IS NOT NULL
order by
old_entity_id,
as_of_date,
count_by_demo desc,
max_knowledge_date desc,
Random()
)
select p.old_entity_id AS entity_id,
p.old_matchdatetime AS matchdatetime,
p.as_of_date,
-- layer hispanic ethnicity flag over race (in practice, only a handful of non-white race in JIMS
-- data is marked hispanic, so we let it take precedence)
CASE WHEN hispanic_flag = 1 THEN 'H' ELSE rc.race END AS race,
CASE WHEN hispanic_flag = 1 THEN 'H' WHEN rc.race IN ('B', 'H', 'W') THEN rc.race ELSE 'W' END AS race_3way,
sex,
age_bucket
from (SELECT DISTINCT old_entity_id, old_matchdatetime, as_of_date FROM entity_lkup) p
left join hispanic_flags
using(old_entity_id, as_of_date)
left join race_count rc
using(old_entity_id, as_of_date)
left join age_count
using(old_entity_id, as_of_date)
left join sex_count
using(old_entity_id, as_of_date)
-- TODO: Far better would be to alter the bias preprocessing downstream to
-- not calculate metrics for the extremely small groups. This is a temporary
-- patch to avoid div/0 errors, but it affects all demographic analyses
-- rather than just age
where age_bucket != 'a under 18'
;
ALTER TABLE {{schema}}.entity_demos ADD PRIMARY KEY (entity_id, as_of_date, matchdatetime);