Skip to content

Commit

Permalink
Cohort def optimizations (#142)
Browse files Browse the repository at this point in the history
* Make first occurrence of event deterministic.

Added order by _date, _id to criteria queries to use the event ID as the tiebreaker.

Fixes #139.

* Cohort Definition Query Optimization

Optimized #primary_events and #qualified_events into single query, eliminating one temp table creation.
Partitioning on person_id for event_ids allowing MPP architectures to leverage hashing on person_id.

Fixes #141.

* Added proper partitioning on person and event for gain counts.

* Switched from qualified_events to included_events for end date selection.
  • Loading branch information
chrisknoll authored Nov 19, 2016
1 parent c7f9bb7 commit 5c0916d
Show file tree
Hide file tree
Showing 17 changed files with 42 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public class CohortExpressionQueryBuilder implements IGetCriteriaSqlDispatcher,
private final static String PROCEDURE_OCCURRENCE_TEMPLATE = ResourceHelper.GetResourceAsString("/resources/cohortdefinition/sql/procedureOccurrence.sql");
private final static String SPECIMEN_TEMPLATE = ResourceHelper.GetResourceAsString("/resources/cohortdefinition/sql/specimen.sql");
private final static String VISIT_OCCURRENCE_TEMPLATE = ResourceHelper.GetResourceAsString("/resources/cohortdefinition/sql/visitOccurrence.sql");
private final static String PRIMARY_CRITERIA_EVENTS_TABLE = "#primary_events";
private final static String PRIMARY_CRITERIA_EVENTS_TABLE = "primary_events";
private final static String INCLUSION_RULE_QUERY_TEMPLATE = ResourceHelper.GetResourceAsString("/resources/cohortdefinition/sql/inclusionrule.sql");

private final static String DEMOGRAPHIC_CRITERIA_QUERY_TEMPLATE = ResourceHelper.GetResourceAsString("/resources/cohortdefinition/sql/demographicCriteria.sql");
Expand Down Expand Up @@ -217,7 +217,7 @@ public String getPrimaryEventsQuery(PrimaryCriteria primaryCriteria) {
criteriaQueries.add(c.accept(this));
}

query = StringUtils.replace(query,"@criteriaQueries", StringUtils.join(criteriaQueries, "\nUNION\n"));
query = StringUtils.replace(query,"@criteriaQueries", StringUtils.join(criteriaQueries, "\nUNION ALL\n"));

ArrayList<String> primaryEventsFilters = new ArrayList<>();
primaryEventsFilters.add(String.format(
Expand Down Expand Up @@ -253,7 +253,7 @@ public String buildExpressionQuery(CohortExpression expression, BuildExpressionQ
CriteriaGroup acGroup = expression.additionalCriteria;
String acGroupQuery = this.getCriteriaGroupQuery(acGroup, this.PRIMARY_CRITERIA_EVENTS_TABLE);//acGroup.accept(this);
acGroupQuery = StringUtils.replace(acGroupQuery,"@indexId", "" + 0);
additionalCriteriaQuery = "\nJOIN (\n" + acGroupQuery + ") AC on AC.event_id = pe.event_id\n";
additionalCriteriaQuery = "\nJOIN (\n" + acGroupQuery + ") AC on AC.person_id = pe.person_id and AC.event_id = pe.event_id\n";
}
resultSql = StringUtils.replace(resultSql, "@additionalCriteriaQuery", additionalCriteriaQuery);

Expand Down Expand Up @@ -356,15 +356,15 @@ public String getCriteriaGroupQuery(CriteriaGroup group, String eventTable) {

query = StringUtils.replace(query, "@eventTable", eventTable);
query = StringUtils.replace(query, "@intersectClause", intersectClause);
query = StringUtils.replace(query, "@criteriaQueries", StringUtils.join(additionalCriteriaQueries, "\nUNION\n"));
query = StringUtils.replace(query, "@criteriaQueries", StringUtils.join(additionalCriteriaQueries, "\nUNION ALL\n"));

return query;
}

private String getInclusionRuleQuery(CriteriaGroup inclusionRule)
{
String resultSql = INCLUSION_RULE_QUERY_TEMPLATE;
String additionalCriteriaQuery = "\nJOIN (\n" + getCriteriaGroupQuery(inclusionRule, "#qualified_events") + ") AC on AC.event_id = pe.event_id";
String additionalCriteriaQuery = "\nJOIN (\n" + getCriteriaGroupQuery(inclusionRule, "#qualified_events") + ") AC on AC.person_id = pe.person_id AND AC.event_id = pe.event_id";
additionalCriteriaQuery = StringUtils.replace(additionalCriteriaQuery,"@indexId", "" + 0);
resultSql = StringUtils.replace(resultSql, "@additionalCriteriaQuery", additionalCriteriaQuery);
return resultSql;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
SELECT @indexId as index_id, p.event_id
SELECT @indexId as index_id, p.person_id, p.event_id
FROM @eventTable P
LEFT JOIN
(
@criteriaQuery
) A on A.person_id = P.person_id and @windowCriteria
GROUP BY p.event_id
GROUP BY p.person_id, p.event_id
@occurrenceCriteria

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.condition_era_start_date as start_date, C.condition_era_end_date as end_date, C.CONDITION_CONCEPT_ID as TARGET_CONCEPT_ID
from
(
select ce.*, ROW_NUMBER() over (PARTITION BY ce.person_id ORDER BY ce.condition_era_start_date) as ordinal
select ce.*, ROW_NUMBER() over (PARTITION BY ce.person_id ORDER BY ce.condition_era_start_date, ce.condition_era_id) as ordinal
FROM @cdm_database_schema.CONDITION_ERA ce
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.condition_start_date as start_date, COALESCE(C.condition_end_date, DATEADD(day,1,C.condition_start_date)) as end_date, C.CONDITION_CONCEPT_ID as TARGET_CONCEPT_ID
from
(
select co.*, ROW_NUMBER() over (PARTITION BY co.person_id ORDER BY co.condition_start_date) as ordinal
select co.*, ROW_NUMBER() over (PARTITION BY co.person_id ORDER BY co.condition_start_date, co.condition_occurrence_id) as ordinal
FROM @cdm_database_schema.CONDITION_OCCURRENCE co
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.device_exposure_start_date as start_date, C.device_exposure_end_date as end_date, C.device_concept_id as TARGET_CONCEPT_ID
from
(
select de.*, ROW_NUMBER() over (PARTITION BY de.person_id ORDER BY de.device_exposure_start_date) as ordinal
select de.*, ROW_NUMBER() over (PARTITION BY de.person_id ORDER BY de.device_exposure_start_date, de.device_exposure_id) as ordinal
FROM @cdm_database_schema.DEVICE_EXPOSURE de
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.dose_era_start_date as start_date, C.dose_era_end_date as end_date, C.drug_concept_id as TARGET_CONCEPT_ID
from
(
select de.*, ROW_NUMBER() over (PARTITION BY de.person_id ORDER BY de.dose_era_start_date) as ordinal
select de.*, ROW_NUMBER() over (PARTITION BY de.person_id ORDER BY de.dose_era_start_date, de.dose_era_id) as ordinal
FROM @cdm_database_schema.DOSE_ERA de
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.drug_era_start_date as start_date, C.drug_era_end_date as end_date, C.drug_concept_id as TARGET_CONCEPT_ID
from
(
select de.*, ROW_NUMBER() over (PARTITION BY de.person_id ORDER BY de.drug_era_start_date) as ordinal
select de.*, ROW_NUMBER() over (PARTITION BY de.person_id ORDER BY de.drug_era_start_date, de.drug_era_id) as ordinal
FROM @cdm_database_schema.DRUG_ERA de
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.drug_exposure_start_date as start_date, COALESCE(C.drug_exposure_end_date, DATEADD(day, 1, C.drug_exposure_start_date)) as end_date, C.drug_concept_id as TARGET_CONCEPT_ID
from
(
select de.*, ROW_NUMBER() over (PARTITION BY de.person_id ORDER BY de.drug_exposure_start_date) as ordinal
select de.*, ROW_NUMBER() over (PARTITION BY de.person_id ORDER BY de.drug_exposure_start_date, de.drug_exposure_id) as ordinal
FROM @cdm_database_schema.DRUG_EXPOSURE de
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
@codesetQuery
@primaryEventsQuery

with primary_events (event_id, person_id, start_date, end_date, op_start_date, op_end_date) as
(
@primaryEventsQuery
)
SELECT event_id, person_id, start_date, end_date, op_start_date, op_end_date
INTO #qualified_events
FROM
(
select pe.event_id, pe.person_id, pe.start_date, pe.end_date, pe.op_start_date, pe.op_end_date, row_number() over (partition by pe.person_id order by pe.start_date @QualifiedEventSort) as ordinal
FROM #primary_events pe
FROM primary_events pe
@additionalCriteriaQuery
) QE
@QualifiedLimitFilter
Expand All @@ -16,6 +19,7 @@ FROM
create table #inclusionRuleCohorts
(
inclusion_rule_id bigint,
person_id bigint,
event_id bigint
)
;
Expand All @@ -28,7 +32,7 @@ with cteIncludedEvents(event_id, person_id, start_date, end_date, op_start_date,
(
select Q.event_id, Q.person_id, Q.start_date, Q.end_date, Q.op_start_date, Q.op_end_date, SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) as inclusion_rule_mask
from #qualified_events Q
LEFT JOIN #inclusionRuleCohorts I on I.event_id = Q.event_id
LEFT JOIN #inclusionRuleCohorts I on I.person_id = Q.person_id and I.event_id = Q.event_id
GROUP BY Q.event_id, Q.person_id, Q.start_date, Q.end_date, Q.op_start_date, Q.op_end_date
) MG -- matching groups
{@ruleTotal != 0}?{
Expand All @@ -54,9 +58,9 @@ DELETE FROM @target_database_schema.@target_cohort_table where cohort_definition
INSERT INTO @target_database_schema.@target_cohort_table (cohort_definition_id, subject_id, cohort_start_date, cohort_end_date)
select @target_cohort_id as cohort_definition_id, F.person_id, F.start_date, F.end_date
FROM (
select Q.person_id, Q.start_date, E.end_date, row_number() over (partition by Q.event_id order by E.end_date) as ordinal
from #qualified_events Q
join #cohort_ends E on Q.event_id = E.event_id and Q.person_id = E.person_id and E.end_date >= Q.start_date
select I.person_id, I.start_date, E.end_date, row_number() over (partition by I.person_id, I.event_id order by E.end_date) as ordinal
from #included_events I
join #cohort_ends E on I.event_id = E.event_id and I.person_id = E.person_id and E.end_date >= I.start_date
) F
WHERE F.ordinal = 1
;
Expand All @@ -69,10 +73,10 @@ insert into @results_database_schema.cohort_inclusion_result (cohort_definition_
select @target_cohort_id as cohort_definition_id, inclusion_rule_mask, count(*) as person_count
from
(
select Q.event_id, SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) as inclusion_rule_mask
select Q.person_id, Q.event_id, SUM(coalesce(POWER(cast(2 as bigint), I.inclusion_rule_id), 0)) as inclusion_rule_mask
from #qualified_events Q
LEFT JOIN #inclusionRuleCohorts I on q.event_id = i.event_id
GROUP BY Q.event_id
LEFT JOIN #inclusionRuleCohorts I on q.person_id = i.person_id and q.event_id = i.event_id
GROUP BY Q.person_id, Q.event_id
) MG -- matching groups
group by inclusion_rule_mask
;
Expand All @@ -86,7 +90,7 @@ left join
(
select i.inclusion_rule_id, count(i.event_id) as person_count
from #qualified_events Q
JOIN #inclusionRuleCohorts i on Q.event_id = i.event_id
JOIN #inclusionRuleCohorts i on Q.person_id = I.person_id and Q.event_id = i.event_id
group by i.inclusion_rule_id
) T on ir.rule_sequence = T.inclusion_rule_id
CROSS JOIN (select count(*) as total_rules from @results_database_schema.cohort_inclusion where cohort_definition_id = @target_cohort_id) RuleTotal
Expand Down Expand Up @@ -121,8 +125,5 @@ DROP TABLE #qualified_events;
TRUNCATE TABLE #included_events;
DROP TABLE #included_events;

TRUNCATE TABLE #primary_events;
DROP TABLE #primary_events;

TRUNCATE TABLE #Codesets;
DROP TABLE #Codesets;
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
select @indexId as index_id, event_id
select @indexId as index_id, person_id, event_id
FROM
(
select E.event_id
select E.person_id, E.event_id
FROM @eventTable E
LEFT JOIN
(
@criteriaQueries
) CQ on E.event_id = CQ.event_id
GROUP BY E.event_id
) CQ on E.person_id = CQ.person_id and E.event_id = CQ.event_id
GROUP BY E.person_id, E.event_id
@intersectClause
) G
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
INSERT INTO #inclusionRuleCohorts (inclusion_rule_id, event_id)
select @inclusion_rule_id as inclusion_rule_id, event_id
INSERT INTO #inclusionRuleCohorts (inclusion_rule_id, person_id, event_id)
select @inclusion_rule_id as inclusion_rule_id, person_id, event_id
FROM
(
select pe.event_id
select pe.person_id, pe.event_id
FROM #qualified_events pe
@additionalCriteriaQuery
) Results
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.measurement_date as start_date, DATEADD(d,1,C.measurement_date) as END_DATE, C.measurement_concept_id as TARGET_CONCEPT_ID
from
(
select m.*, ROW_NUMBER() over (PARTITION BY m.person_id ORDER BY m.measurement_date) as ordinal
select m.*, ROW_NUMBER() over (PARTITION BY m.person_id ORDER BY m.measurement_date, m.measurement_id) as ordinal
FROM @cdm_database_schema.MEASUREMENT m
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.observation_date as start_date, DATEADD(d,1,C.observation_date) as END_DATE, C.observation_concept_id as TARGET_CONCEPT_ID
from
(
select o.*, ROW_NUMBER() over (PARTITION BY o.person_id ORDER BY o.observation_date) as ordinal
select o.*, ROW_NUMBER() over (PARTITION BY o.person_id ORDER BY o.observation_date, o.observation_id) as ordinal
FROM @cdm_database_schema.OBSERVATION o
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
select row_number() over (order by P.person_id, P.start_date) as event_id, P.person_id, P.start_date, P.end_date, OP.observation_period_start_date as op_start_date, OP.observation_period_end_date as op_end_date
INTO #primary_events
select row_number() over (PARTITION BY P.person_id order by P.start_date) as event_id, P.person_id, P.start_date, P.end_date, OP.observation_period_start_date as op_start_date, OP.observation_period_end_date as op_end_date
FROM
(
select P.person_id, P.start_date, P.end_date, ROW_NUMBER() OVER (PARTITION BY person_id ORDER BY start_date @EventSort) ordinal
Expand All @@ -8,6 +7,6 @@ FROM
@criteriaQueries
) P
) P
JOIN @cdm_database_schema.observation_period OP on P.person_id = OP.person_id and P.start_date between OP.observation_period_start_date and op.observation_period_end_date
JOIN @cdm_database_schema.observation_period OP on P.person_id = OP.person_id and P.start_date >= OP.observation_period_start_date and P.start_date <= op.observation_period_end_date
WHERE @primaryEventsFilter
;

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.procedure_date as start_date, DATEADD(d,1,C.procedure_date) as END_DATE, C.procedure_concept_id as TARGET_CONCEPT_ID
from
(
select po.*, ROW_NUMBER() over (PARTITION BY po.person_id ORDER BY po.procedure_date) as ordinal
select po.*, ROW_NUMBER() over (PARTITION BY po.person_id ORDER BY po.procedure_date, po.procedure_occurrence_id) as ordinal
FROM @cdm_database_schema.PROCEDURE_OCCURRENCE po
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.specimen_date as start_date, DATEADD(d,1,C.specimen_date) as end_date, C.specimen_concept_id as TARGET_CONCEPT_ID
from
(
select s.*, ROW_NUMBER() over (PARTITION BY s.person_id ORDER BY s.specimen_date) as ordinal
select s.*, ROW_NUMBER() over (PARTITION BY s.person_id ORDER BY s.specimen_date, s.specimen_id) as ordinal
FROM @cdm_database_schema.SPECIMEN s
@codesetClause
) C
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
select C.person_id, C.visit_start_date as start_date, C.visit_end_date as end_date, C.visit_concept_id as TARGET_CONCEPT_ID
from
(
select vo.*, ROW_NUMBER() over (PARTITION BY vo.person_id ORDER BY vo.visit_start_date) as ordinal
select vo.*, ROW_NUMBER() over (PARTITION BY vo.person_id ORDER BY vo.visit_start_date, vo.visit_occurrence_id) as ordinal
FROM @cdm_database_schema.VISIT_OCCURRENCE vo
@codesetClause
) C
Expand Down

0 comments on commit 5c0916d

Please sign in to comment.