Skip to content

Commit

Permalink
Merge pull request #25527 from eileenmcnaughton/finder_build
Browse files Browse the repository at this point in the history
Fix dedupe finder performance issue on looking up table size
  • Loading branch information
eileenmcnaughton authored Mar 10, 2023
2 parents d7acaca + bafcc24 commit fd60462
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 22 deletions.
43 changes: 43 additions & 0 deletions CRM/Core/BAO/SchemaHandler.php
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,49 @@ public static function getInUseCollation(): string {
return \Civi::$statics[__CLASS__][__FUNCTION__];
}

/**
 * Get estimated number of rows in the given tables.
 *
 * Note that this query is less precise than SELECT COUNT(*) - especially on
 * larger tables but performs significantly better.
 * See https://dba.stackexchange.com/questions/184685/why-is-count-slow-when-explain-knows-the-answer
 *
 * Counts are cached in \Civi::$statics, so the database is only queried for
 * tables that have not been looked up before. Tables for which the query
 * returns no row (e.g. they do not exist in the current schema) are omitted
 * from the result and are not cached.
 *
 * @param array $tables
 *   e.g ['civicrm_contact', 'civicrm_activity']
 *   NOTE: the names are interpolated into the SQL unescaped, so they must
 *   come from trusted code and never from user input.
 *
 * @return array
 *   e.g ['civicrm_contact' => 200000, 'civicrm_activity' => 100000]
 */
public static function getRowCountForTables(array $tables): array {
  $cachedResults = \Civi::$statics[__CLASS__][__FUNCTION__] ?? [];
  // Compile list of tables not already cached.
  $tablesToCheck = array_keys(array_diff_key(array_flip($tables), $cachedResults));
  // Skip the round trip entirely when every requested table is already cached;
  // otherwise a fully-cached call would still run a pointless IN("") query.
  if ($tablesToCheck !== []) {
    $result = CRM_Core_DAO::executeQuery('
      SELECT TABLE_ROWS as row_count, TABLE_NAME as table_name FROM information_schema.TABLES WHERE
      TABLE_NAME IN("' . implode('","', $tablesToCheck) . '")
      AND TABLE_SCHEMA = DATABASE()'
    );
    while ($result->fetch()) {
      $cachedResults[$result->table_name] = (int) $result->row_count;
    }
    \Civi::$statics[__CLASS__][__FUNCTION__] = $cachedResults;
  }
  // Only hand back the tables the caller asked for, not the whole cache.
  return array_intersect_key($cachedResults, array_fill_keys($tables, TRUE));
}

/**
 * Get the estimated row count for a single table.
 *
 * Convenience wrapper around the multi-table lookup.
 *
 * @see self::getRowCountForTables
 *
 * @param string $tableName
 *
 * @return int
 *   The approximate number of rows in the table, or 0 when the lookup
 *   returns no entry for it (e.g. the table does not exist).
 */
public static function getRowCountForTable(string $tableName): int {
  $rowCounts = self::getRowCountForTables([$tableName]);
  return $rowCounts[$tableName] ?? 0;
}

/**
* Does the database support utf8mb4.
*
Expand Down
4 changes: 3 additions & 1 deletion CRM/Core/DAO.php
Original file line number Diff line number Diff line change
Expand Up @@ -1081,9 +1081,11 @@ public static function objectExists($value, $daoName, $daoID, $fieldName = 'name
}

/**
* Scans all the tables using a slow query and table name.
* Gets the names of all the tables in the schema.
*
* @return array
*
* @throws \CRM_Core_Exception
*/
public static function getTableNames(): array {
$dao = CRM_Core_DAO::executeQuery(
Expand Down
43 changes: 23 additions & 20 deletions CRM/Dedupe/BAO/DedupeRuleGroup.php
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ public function fillTable() {
$query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
$dao = CRM_Core_DAO::executeQuery($query);

// FIXME: we need to be more acurate with affected rows, especially for insert vs duplicate insert.
// FIXME: we need to be more accurate with affected rows, especially for insert vs duplicate insert.
// And that will help optimize further.
$affectedRows = $dao->affectedRows();

Expand Down Expand Up @@ -337,28 +337,31 @@ public static function isQuerySetInclusive($tableQueries, $threshold, $exclWeigh
}

/**
* sort queries by number of records for the table associated with them.
* @param $tableQueries
* Sort queries by number of records for the table associated with them.
*
* @param array $tableQueries
*/
public static function orderByTableCount(&$tableQueries) {
static $tableCount = [];

$tempArray = [];
foreach ($tableQueries as $key => $query) {
$table = explode(".", $key);
$table = $table[0];
if (!array_key_exists($table, $tableCount)) {
$query = "SELECT COUNT(*) FROM {$table}";
$tableCount[$table] = CRM_Core_DAO::singleValueQuery($query);
}
$tempArray[$key] = $tableCount[$table];
}
public static function orderByTableCount(array &$tableQueries): void {
  // Keys are "table.field" strings; reorder the array in place so that entries
  // for tables with the fewest estimated rows come first.
  // An array callable is used because the 'self::method' string form is
  // deprecated as of PHP 8.2.
  uksort($tableQueries, [self::class, 'isTableBigger']);
}

asort($tempArray);
foreach ($tempArray as $key => $count) {
$tempArray[$key] = $tableQueries[$key];
/**
 * Compare two "table.field" keys by the estimated row count of their tables.
 *
 * Used as the uksort() comparator when ordering dedupe queries by table size.
 *
 * @param string $a
 *   e.g civicrm_contact.first_name
 * @param string $b
 *   e.g civicrm_address.street_address
 *
 * @return int
 *   -1, 0 or 1 when the table in $a has fewer, the same number of, or more
 *   estimated rows than the table in $b.
 */
private static function isTableBigger(string $a, string $b): int {
  // Only the part before the first dot (the table name) matters here.
  $tableA = explode('.', $a)[0];
  $tableB = explode('.', $b)[0];
  if ($tableA === $tableB) {
    // Same table on both sides: equal by definition, no lookup needed.
    return 0;
  }
  return CRM_Core_BAO_SchemaHandler::getRowCountForTable($tableA) <=> CRM_Core_BAO_SchemaHandler::getRowCountForTable($tableB);
}

/**
Expand Down
19 changes: 19 additions & 0 deletions tests/phpunit/CRM/Core/BAO/SchemaHandlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,25 @@ public function columnTests(): array {
return $columns;
}

/**
 * Test fetching estimated row counts for one table and for several tables.
 *
 * A table name that does not exist is expected to be left out of the
 * multi-table result.
 *
 * @throws \CRM_Core_Exception
 */
public function testGetRowCountForTable(): void {
  // The counts come from table statistics, which are only approximate.
  // Refresh them with ANALYZE TABLE first, in the hope that this makes the
  // values stable enough to assert on.
  CRM_Core_DAO::singleValueQuery('ANALYZE TABLE civicrm_domain');
  CRM_Core_DAO::singleValueQuery('ANALYZE TABLE civicrm_worldregion');
  CRM_Core_DAO::singleValueQuery('ANALYZE TABLE civicrm_acl');

  $requestedTables = ['civicrm_domain', 'civicrm_acl', 'random_name', 'civicrm_worldregion'];
  $expectedCounts = [
    'civicrm_worldregion' => 6,
    'civicrm_acl' => 0,
    'civicrm_domain' => 2,
  ];
  $actualCounts = CRM_Core_BAO_SchemaHandler::getRowCountForTables($requestedTables);
  $this->assertEquals($expectedCounts, $actualCounts);

  $domainCount = CRM_Core_BAO_SchemaHandler::getRowCountForTable('civicrm_domain');
  $this->assertEquals(2, $domainCount);
}

/**
* @param string $tableName
* @param string $columnName
Expand Down
2 changes: 1 addition & 1 deletion tests/phpunit/CRM/Dedupe/DedupeFinderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public function tearDown(): void {
/**
* Test the unsupervised dedupe rule against a group.
*
* @throws \Exception
* @throws \CRM_Core_Exception
*/
public function testUnsupervisedDupes(): void {
// make dupe checks based on following contact sets:
Expand Down

0 comments on commit fd60462

Please sign in to comment.