Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dedupe fixes CRM-18442, CRM-18539, CRM-18480 #8251

Merged
merged 3 commits into from
May 19, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CRM/Contact/Page/AJAX.php
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ public static function getDedupes() {
$contactType = CRM_Core_DAO::getFieldValue('CRM_Dedupe_DAO_RuleGroup', $rgid, 'contact_type');
}

$cacheKeyString = "merge {$contactType}_{$rgid}_{$gid}";
$cacheKeyString = CRM_Dedupe_Merger::getMergeCacheKeyString($rgid, $gid);
$searchRows = array();
$selectorElements = array('is_selected', 'is_selected_input', 'src_image', 'src', 'src_email', 'src_street', 'src_postcode', 'dst_image', 'dst', 'dst_email', 'dst_street', 'dst_postcode', 'conflicts', 'weight', 'actions');

Expand Down
15 changes: 12 additions & 3 deletions CRM/Core/BAO/PrevNextCache.php
Original file line number Diff line number Diff line change
Expand Up @@ -336,15 +336,19 @@ public static function getCount($cacheKey, $join = NULL, $where = NULL, $op = "=
}

/**
* Repopulate the cache of merge prospects.
*
* @param int $rgid
* @param int $gid
* @param NULL $cacheKeyString
* @param array $criteria
* Additional criteria to filter by.
*
* @return bool
*/
public static function refillCache($rgid = NULL, $gid = NULL, $cacheKeyString = NULL) {
public static function refillCache($rgid = NULL, $gid = NULL, $cacheKeyString = NULL, $criteria = array()) {
if (!$cacheKeyString && $rgid) {
$cacheKeyString = CRM_Dedupe_Merger::getMergeCacheKeyString($rgid, $gid);
$cacheKeyString = CRM_Dedupe_Merger::getMergeCacheKeyString($rgid, $gid, $criteria);
}

if (!$cacheKeyString) {
Expand All @@ -364,7 +368,12 @@ public static function refillCache($rgid = NULL, $gid = NULL, $cacheKeyString =
$foundDupes = CRM_Dedupe_Finder::dupesInGroup($rgid, $gid);
}
elseif ($rgid) {
$foundDupes = CRM_Dedupe_Finder::dupes($rgid);
$contactIDs = array();
if (!empty($criteria)) {
$contacts = civicrm_api3('Contact', 'get', array_merge(array('options' => array('limit' => 0), 'return' => 'id'), $criteria['contact']));
$contactIDs = array_keys($contacts['values']);
}
$foundDupes = CRM_Dedupe_Finder::dupes($rgid, $contactIDs);
}

if (!empty($foundDupes)) {
Expand Down
24 changes: 24 additions & 0 deletions CRM/Dedupe/BAO/QueryBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,28 @@ public static function internalFilters($rg, $strID1 = 'contact1.id', $strID2 = '
}
}

/**
* If a contact list is specified then adjust the query to ensure one contact is in that list.
*
* Doing an OR join here will lead to a server-killing unindexed query. However, a union will
* perform better.
*
* @param array $contactList
* @param string $query
* @param string $strID1
* @param string $strID2
*
* @return string
*/
protected static function filterQueryByContactList(array $contactList, $query, $strID1 = 'contact1.id', $strID2 = 'contact2.id') {
if (empty($contactList)) {
return $query . " AND ($strID1 < $strID2)";
}
$contactIDs = implode(',', $contactList);
return "$query AND $strID1 IN ($contactIDs) AND $strID1 > $strID2
UNION $query AND $strID1 > $strID2 AND $strID2 IN ($contactIDs) AND $strID1 NOT IN ($contactIDs)
";

}

}
6 changes: 3 additions & 3 deletions CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public static function record($rg) {
* @return array
*/
public static function internal($rg) {
$query = "
$query = self::filterQueryByContactList($rg->contactIds, "
SELECT contact1.id as id1, contact2.id as id2, {$rg->threshold} as weight
FROM civicrm_contact as contact1
JOIN civicrm_email as email1 ON email1.contact_id=contact1.id
Expand All @@ -63,8 +63,8 @@ public static function internal($rg) {
JOIN civicrm_email as email2 ON
email2.contact_id=contact2.id AND
email1.email=email2.email
WHERE contact1.contact_type = 'Individual'
AND " . self::internalFilters($rg);
WHERE contact1.contact_type = 'Individual'");

return array(
"civicrm_contact.{$rg->name}.{$rg->threshold}" => $query,
);
Expand Down
33 changes: 24 additions & 9 deletions CRM/Dedupe/Merger.php
Original file line number Diff line number Diff line change
Expand Up @@ -590,15 +590,18 @@ public static function retrieveFields($main, $other) {
* @param int $batchLimit number of merges to carry out in one batch.
* @param int $isSelected if records with is_selected column needs to be processed.
*
* @param array $criteria
* Criteria to use in the filter.
*
* @return array|bool
*/
public static function batchMerge($rgid, $gid = NULL, $mode = 'safe', $autoFlip = TRUE, $batchLimit = 1, $isSelected = 2) {
public static function batchMerge($rgid, $gid = NULL, $mode = 'safe', $autoFlip = TRUE, $batchLimit = 1, $isSelected = 2, $criteria = array()) {
$redirectForPerformance = ($batchLimit > 1) ? TRUE : FALSE;
$reloadCacheIfEmpty = (!$redirectForPerformance && $isSelected == 2);
$dupePairs = self::getDuplicatePairs($rgid, $gid, $reloadCacheIfEmpty, $batchLimit, $isSelected, '', ($mode == 'aggressive'));
$dupePairs = self::getDuplicatePairs($rgid, $gid, $reloadCacheIfEmpty, $batchLimit, $isSelected, '', ($mode == 'aggressive'), $criteria);

$cacheParams = array(
'cache_key_string' => self::getMergeCacheKeyString($rgid, $gid),
'cache_key_string' => self::getMergeCacheKeyString($rgid, $gid, $criteria),
// @todo stop passing these parameters in & instead calculate them in the merge function based
// on the 'real' params like $isRespectExclusions $batchLimit and $isSelected.
'join' => self::getJoinOnDedupeTable(),
Expand Down Expand Up @@ -730,9 +733,14 @@ public static function merge($dupePairs = array(), $cacheParams = array(), $mode
// doNotResetCache flag
$config = CRM_Core_Config::singleton();
$config->doNotResetCache = 1;
$deletedContacts = array();

while (!empty($dupePairs)) {
foreach ($dupePairs as $dupes) {
foreach ($dupePairs as $index => $dupes) {
if (in_array($dupes['dstID'], $deletedContacts) || in_array($dupes['srcID'], $deletedContacts)) {
unset($dupePairs[$index]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be useful to add something into the message that's displayed to say that x contacts were not merged, because their duplicate had already been deleted.

ie:
4 contacts merged
2 pairs skipped due to contact deletion

Although that's not very descriptive. Or is this something for a follow-up?

continue;
}
CRM_Utils_Hook::merge('flip', $dupes, $dupes['dstID'], $dupes['srcID']);
$mainId = $dupes['dstID'];
$otherId = $dupes['srcID'];
Expand Down Expand Up @@ -763,6 +771,7 @@ public static function merge($dupePairs = array(), $cacheParams = array(), $mode
if (!CRM_Dedupe_Merger::skipMerge($mainId, $otherId, $migrationInfo, $mode, $conflicts)) {
CRM_Dedupe_Merger::moveAllBelongings($mainId, $otherId, $migrationInfo);
$resultStats['merged'][] = array('main_id' => $mainId, 'other_id' => $otherId);
$deletedContacts[] = $otherId;
}
else {
$resultStats['skipped'][] = array('main_id' => $mainId, 'other_id' => $otherId);
Expand Down Expand Up @@ -1020,7 +1029,7 @@ public static function getRowsElementsAndInfo($mainId, $otherId) {

// CRM-18480: Cancel the process if the contact is already deleted
if (isset($result['values'][$cid]['contact_is_deleted']) && !empty($result['values'][$cid]['contact_is_deleted'])) {
CRM_Core_Error::fatal(ts('Cannot merge because the \'%1\' contact (ID %2) has been deleted.', array(1 => $moniker, 2 => $cid)));
throw new CRM_Core_Exception(ts('Cannot merge because the \'%1\' contact (ID %2) has been deleted.', array(1 => $moniker, 2 => $cid)));
}

$$moniker = $result['values'][$cid];
Expand Down Expand Up @@ -1978,20 +1987,22 @@ public static function createMergeActivities($mainId, $otherId) {
* @param bool $isSelected
* @param array $orderByClause
* @param bool $includeConflicts
* @param array $criteria
* Additional criteria to narrow down the merge group.
*
* @return array
* Array of matches meeting the criteria.
*/
public static function getDuplicatePairs($rule_group_id, $group_id, $reloadCacheIfEmpty, $batchLimit, $isSelected, $orderByClause = '', $includeConflicts = TRUE) {
public static function getDuplicatePairs($rule_group_id, $group_id, $reloadCacheIfEmpty, $batchLimit, $isSelected, $orderByClause = '', $includeConflicts = TRUE, $criteria = array()) {
$where = self::getWhereString($batchLimit, $isSelected);
$cacheKeyString = self::getMergeCacheKeyString($rule_group_id, $group_id, $includeConflicts);
$cacheKeyString = self::getMergeCacheKeyString($rule_group_id, $group_id, $criteria);
$join = self::getJoinOnDedupeTable();
$dupePairs = CRM_Core_BAO_PrevNextCache::retrieve($cacheKeyString, $join, $where, 0, 0, array(), $orderByClause, $includeConflicts);
if (empty($dupePairs) && $reloadCacheIfEmpty) {
// If we haven't found any dupes, probably cache is empty.
// Try filling cache and give another try. We don't need to specify include conflicts here are there will not be any
// until we have done some processing.
CRM_Core_BAO_PrevNextCache::refillCache($rule_group_id, $group_id, $cacheKeyString);
CRM_Core_BAO_PrevNextCache::refillCache($rule_group_id, $group_id, $cacheKeyString, $criteria);
$dupePairs = CRM_Core_BAO_PrevNextCache::retrieve($cacheKeyString, $join, $where, 0, 0, array(), $orderByClause, $includeConflicts);
return $dupePairs;
}
Expand All @@ -2003,14 +2014,18 @@ public static function getDuplicatePairs($rule_group_id, $group_id, $reloadCache
*
* @param int $rule_group_id
* @param int $group_id
* @param array $criteria
* Additional criteria to narrow down the merge group.
* Currently we are only supporting the key 'contact' within it.
*
* @return string
*/
public static function getMergeCacheKeyString($rule_group_id, $group_id) {
public static function getMergeCacheKeyString($rule_group_id, $group_id, $criteria = array()) {
$contactType = CRM_Dedupe_BAO_RuleGroup::getContactTypeForRuleGroup($rule_group_id);
$cacheKeyString = "merge {$contactType}";
$cacheKeyString .= $rule_group_id ? "_{$rule_group_id}" : '_0';
$cacheKeyString .= $group_id ? "_{$group_id}" : '_0';
$cacheKeyString .= !empty($criteria) ? serialize($criteria) : '_0';
return $cacheKeyString;
}

Expand Down
2 changes: 1 addition & 1 deletion api/v3/Job.php
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ function civicrm_api3_job_process_batch_merge($params) {
$mode = CRM_Utils_Array::value('mode', $params, 'safe');
$autoFlip = CRM_Utils_Array::value('auto_flip', $params, TRUE);

$result = CRM_Dedupe_Merger::batchMerge($rule_group_id, $gid, $mode, $autoFlip);
$result = CRM_Dedupe_Merger::batchMerge($rule_group_id, $gid, $mode, $autoFlip, 1, 2, CRM_Utils_Array::value('criteria', $params, array()));

return civicrm_api3_create_success($result, $params);
}
Expand Down
4 changes: 2 additions & 2 deletions tests/phpunit/CRM/Dedupe/MergerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ public function testBatchMergeSelectedDuplicates() {

// Retrieve pairs from prev next cache table
$select = array('pn.is_selected' => 'is_selected');
$cacheKeyString = "merge Individual_{$dao->id}_{$this->_groupId}";
$cacheKeyString = "merge Individual_{$dao->id}_{$this->_groupId}_0";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why doesn't this use getMergeCacheKeyString ? Because it's in a test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I think it should - I was in test-debug mode at that time

$pnDupePairs = CRM_Core_BAO_PrevNextCache::retrieve($cacheKeyString, NULL, NULL, 0, 0, $select);

$this->assertEquals(count($foundDupes), count($pnDupePairs), 'Check number of dupe pairs in prev next cache.');
Expand Down Expand Up @@ -226,7 +226,7 @@ public function testBatchMergeAllDuplicates() {

// Retrieve pairs from prev next cache table
$select = array('pn.is_selected' => 'is_selected');
$cacheKeyString = "merge Individual_{$dao->id}_{$this->_groupId}";
$cacheKeyString = "merge Individual_{$dao->id}_{$this->_groupId}_0";
$pnDupePairs = CRM_Core_BAO_PrevNextCache::retrieve($cacheKeyString, NULL, NULL, 0, 0, $select);

$this->assertEquals(count($foundDupes), count($pnDupePairs), 'Check number of dupe pairs in prev next cache.');
Expand Down
33 changes: 33 additions & 0 deletions tests/phpunit/api/v3/JobTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,39 @@ public function testBatchMergeEmailHandling() {
), 4);
}

/**
* Test the batch merge by id range.
*
* We have 2 sets of 5 matches & set the merge only to merge the lower set.
*/
public function testBatchMergeIDRange() {
for ($x = 0; $x <= 4; $x++) {
$id = $this->individualCreate(array('email' => 'batman@gotham.met'));
}
for ($x = 0; $x <= 4; $x++) {
$this->individualCreate(array('email' => 'robin@gotham.met'));
}
$result = $this->callAPISuccess('Job', 'process_batch_merge', array('criteria' => array('contact' => array('id' => array('<' => $id)))));
$this->assertEquals(4, count($result['values']['merged']));
$this->callAPISuccessGetCount('Contact', array('email' => 'batman@gotham.met'), 1);
$this->callAPISuccessGetCount('Contact', array('email' => 'robin@gotham.met'), 5);
$contacts = $this->callAPISuccess('Contact', 'get', array('is_deleted' => 0));
$deletedContacts = $this->callAPISuccess('Contact', 'get', array('is_deleted' => 0));
$this->callAPISuccessGetCount('Email', array(
'email' => 'batman@gotham.met',
'contact_id' => array('IN' => array_keys($contacts['values'])),
), 1);
$this->callAPISuccessGetCount('Email', array(
'email' => 'batman@gotham.met',
'contact_id' => array('IN' => array_keys($deletedContacts['values'])),
), 1);
$this->callAPISuccessGetCount('Email', array(
'email' => 'robin@gotham.met',
'contact_id' => array('IN' => array_keys($contacts['values'])),
), 5);

}

/**
* Get data for batch merge.
*/
Expand Down