From 668f6e4dc186ab81a3831107c8e5f8f63132a1f3 Mon Sep 17 00:00:00 2001 From: eileen Date: Wed, 27 Apr 2016 17:20:43 +1200 Subject: [PATCH] CRM-18842 Dedupe query: remove OR join in favour of more performant UNION Unions are much faster than OR joins. This change took the length of the query to get the dedupes on a large database from 'as long as it took for the server to fall over' to less than one second on a small group of contacts This query is only affecting one path - ie Individuals - at the moment as I can only extend that as fast as I can write tests. --- CRM/Dedupe/BAO/QueryBuilder.php | 24 +++++++++++++++++++ .../BAO/QueryBuilder/IndividualSupervised.php | 6 ++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/CRM/Dedupe/BAO/QueryBuilder.php b/CRM/Dedupe/BAO/QueryBuilder.php index 484ce391c64d..3daff16951f3 100644 --- a/CRM/Dedupe/BAO/QueryBuilder.php +++ b/CRM/Dedupe/BAO/QueryBuilder.php @@ -22,4 +22,28 @@ public static function internalFilters($rg, $strID1 = 'contact1.id', $strID2 = ' } } + /** + * If a contact list is specified then adjust the query to ensure one contact is in that list. + * + * Doing an OR join here will lead to a server-killing unindexed query. However, a union will + * perform better. + * + * @param array $contactList + * @param string $query + * @param string $strID1 + * @param string $strID2 + * + * @return string + */ + protected static function filterQueryByContactList(array $contactList, $query, $strID1 = 'contact1.id', $strID2 = 'contact2.id') { + if (empty($contactList)) { + return $query . " AND ($strID1 < $strID2)"; + } + $contactIDs = implode(',', $contactList); + return "$query AND $strID1 IN ($contactIDs) AND $strID1 > $strID2 + UNION $query AND $strID1 > $strID2 AND $strID2 IN ($contactIDs) AND $strID1 NOT IN ($contactIDs) + "; + + } + } diff --git a/CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php b/CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php index 806f023e6c48..e150f44fd06a 100644 --- a/CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php +++ b/CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php @@ -53,7 +53,7 @@ public static function record($rg) { * @return array */ public static function internal($rg) { - $query = " + $query = self::filterQueryByContactList($rg->contactIds, " SELECT contact1.id as id1, contact2.id as id2, {$rg->threshold} as weight FROM civicrm_contact as contact1 JOIN civicrm_email as email1 ON email1.contact_id=contact1.id @@ -63,8 +63,8 @@ public static function internal($rg) { JOIN civicrm_email as email2 ON email2.contact_id=contact2.id AND email1.email=email2.email - WHERE contact1.contact_type = 'Individual' - AND " . self::internalFilters($rg); + WHERE contact1.contact_type = 'Individual'"); + return array( "civicrm_contact.{$rg->name}.{$rg->threshold}" => $query, );