From d422e33079844d69ca91c0d098f1a0484a5081c6 Mon Sep 17 00:00:00 2001 From: Rocco Tripaldi Date: Mon, 5 Aug 2019 11:01:31 -0400 Subject: [PATCH] Jetpack Sync: New endpoint to retrieve an ID range. (#12671) * Jetpack Sync: Adds an endpoint to retrieve the minimum and maximum ID from the database table corresponding to the given object type. * Let's get a range of min / max ids based on batch size * improve data typing * removing permission check for now * Basing it off sync modules * Sync: The replica store already had some min/max logic. So I pulled that into its own method that can be used by individual modules * We can infer from the table name. Thanks @tyxla * Update packages/sync/src/Replicastore.php Co-Authored-By: Marin Atanasov <8436925+tyxla@users.noreply.github.com> * Fixing documentation * Extra whitespace * improve comments * This endpoint should be a sync endpoint * Adjust poopy syntax * fix some wonky syntax * White space fixes * Update json-endpoints/jetpack/class.jetpack-json-api-sync-endpoint.php Co-Authored-By: Marin Atanasov <8436925+tyxla@users.noreply.github.com> * Update packages/sync/src/Replicastore.php Co-Authored-By: Marin Atanasov <8436925+tyxla@users.noreply.github.com> --- .../class.jetpack-json-api-sync-endpoint.php | 33 ++++++++ .../jetpack/json-api-jetpack-endpoints.php | 20 +++++ packages/sync/src/Replicastore.php | 66 +++++++++++---- packages/sync/src/modules/Comments.php | 26 +++++- packages/sync/src/modules/Module.php | 82 +++++++++++++++++++ packages/sync/src/modules/Posts.php | 29 ++++++- .../sync/src/modules/Term_Relationships.php | 22 +++++ packages/sync/src/modules/Terms.php | 26 +++++- packages/sync/src/modules/Users.php | 15 +++- 9 files changed, 294 insertions(+), 25 deletions(-) diff --git a/json-endpoints/jetpack/class.jetpack-json-api-sync-endpoint.php b/json-endpoints/jetpack/class.jetpack-json-api-sync-endpoint.php index a34b060e3d118..b9ebf48499594 100644 --- 
a/json-endpoints/jetpack/class.jetpack-json-api-sync-endpoint.php +++ b/json-endpoints/jetpack/class.jetpack-json-api-sync-endpoint.php @@ -324,3 +324,36 @@ protected function result() { ); } } + +class Jetpack_JSON_API_Sync_Object_Id_Range extends Jetpack_JSON_API_Sync_Endpoint { + protected function result() { + $args = $this->query_args(); + + $module_name = $args['sync_module']; + $batch_size = $args['batch_size']; + + if ( ! $this->is_valid_sync_module( $module_name ) ) { + return new WP_Error( 'invalid_module', 'This sync module cannot be used to calculate a range.', 400 ); + } + + $module = Modules::get_module( $module_name ); + + return array( + 'ranges' => $module->get_min_max_object_ids_for_batches( $batch_size ), + ); + } + + protected function is_valid_sync_module( $module_name ) { + return in_array( + $module_name, + array( + 'comments', + 'posts', + 'terms', + 'term_relationships', + 'users', + ), + true + ); + } +} diff --git a/json-endpoints/jetpack/json-api-jetpack-endpoints.php b/json-endpoints/jetpack/json-api-jetpack-endpoints.php index 36d5a67cacf7c..d428e612be125 100644 --- a/json-endpoints/jetpack/json-api-jetpack-endpoints.php +++ b/json-endpoints/jetpack/json-api-jetpack-endpoints.php @@ -647,6 +647,26 @@ 'example_request' => 'https://public-api.wordpress.com/rest/v1.1/sites/example.wordpress.org/sync/unlock' ) ); +// GET /sites/%s/sync/object-id-range +new Jetpack_JSON_API_Sync_Object_Id_Range( array( + 'description' => 'Gets minimum and maximum object ids for each batch of given batch size.', + 'method' => 'GET', + 'path' => '/sites/%s/sync/object-id-range', + 'group' => '__do_not_document', + 'stat' => 'sync-object-id-range', + 'path_labels' => array( + '$site' => '(int|string) The site ID, The site domain' + ), + 'query_parameters' => array( + 'batch_size' => '(int=1000) The amount of objects per batch.', + 'sync_module' => '(string=posts) The sync module used to enumerate the ranges.', + ), + 'response_format' => array( + 'ranges' => 
'(array) An array of range objects with min and max properties for each batch.', + ), + 'example_request' => 'https://public-api.wordpress.com/rest/v1.1/sites/example.wordpress.org/sync/object-id-range?batch_size=100&sync_module=comments' +) ); + // POST /sites/%s/sync/checkout new Jetpack_JSON_API_Sync_Checkout_Endpoint( array( 'description' => 'Locks the queue and returns items and the buffer ID.', diff --git a/packages/sync/src/Replicastore.php b/packages/sync/src/Replicastore.php index 77ab60a76539c..ea34ca0a5b3b5 100644 --- a/packages/sync/src/Replicastore.php +++ b/packages/sync/src/Replicastore.php @@ -1202,6 +1202,41 @@ public function get_checksum_columns_for_object_type( $object_type ) { } } + /** + * Grabs the minimum and maximum object ids for the given parameters. + * + * @access public + * + * @param string $id_field The id column in the table to query. + * @param string $object_table The table to query. + * @param string $where A sql where clause without 'WHERE'. + * @param int $bucket_size The maximum amount of objects to include in the query. + * For `term_relationships` table, the bucket size will refer to the amount + * of distinct object ids. This will likely include more database rows than + * the bucket size implies. + * + * @return object An object with min and max properties. + */ + public function get_min_max_object_id( $id_field, $object_table, $where, $bucket_size ) { + global $wpdb; + + // The term relationship table's unique key is a combination of 2 columns. `DISTINCT` helps us get a more accurate query. + $distinct_sql = ( $wpdb->term_relationships === $object_table ) ? 'DISTINCT' : ''; + $where_sql = $where ? "WHERE $where" : ''; + + // Since MIN() and MAX() do not work with LIMIT, we'll need to adjust the dataset we query if a limit is present. + // With a limit present, we'll look at a dataset consisting of object_ids that meet the constructs of the $where clause. + // Without a limit, we can use the actual table as a dataset. 
+ $from = $bucket_size ? + "( SELECT $distinct_sql $id_field FROM $object_table $where_sql ORDER BY $id_field ASC LIMIT $bucket_size ) as ids" : + "$object_table $where_sql ORDER BY $id_field ASC"; + + return $wpdb->get_row( + // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared + "SELECT MIN($id_field) as min, MAX($id_field) as max FROM $from" + ); + } + /** * Retrieve the checksum histogram for a specific object type. * @@ -1278,7 +1313,8 @@ public function checksum_histogram( $object_type, $buckets, $start_id = null, $e $previous_max_id = 0; $histogram = array(); - $where = '1=1'; + // This is used for the min / max query, while $where_sql is used for the checksum query. + $where = $where_sql; if ( $start_id ) { $where .= " AND $id_field >= " . intval( $start_id ); @@ -1288,39 +1324,35 @@ public function checksum_histogram( $object_type, $buckets, $start_id = null, $e $where .= " AND $id_field <= " . intval( $end_id ); } - $distinct = ''; - if ( 'term_relationships' === $object_type ) { - $distinct = 'DISTINCT'; - } - do { - list( $first_id, $last_id ) = $wpdb->get_row( - // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared - "SELECT MIN($id_field) as min_id, MAX($id_field) as max_id FROM ( SELECT $distinct $id_field FROM $object_table WHERE $where AND $id_field > $previous_max_id ORDER BY $id_field ASC LIMIT $bucket_size ) as ids", - ARRAY_N + $result = $this->get_min_max_object_id( + $id_field, + $object_table, + $where . " AND $id_field > $previous_max_id", + $bucket_size ); - if ( null === $first_id || null === $last_id ) { + if ( null === $result->min || null === $result->max ) { // Nothing to checksum here... break; } // Get the checksum value. 
- $value = $this->table_checksum( $object_table, $columns, $id_field, $where_sql, $first_id, $last_id, $strip_non_ascii, $salt ); + $value = $this->table_checksum( $object_table, $columns, $id_field, $where_sql, $result->min, $result->max, $strip_non_ascii, $salt ); if ( is_wp_error( $value ) ) { return $value; } - if ( null === $first_id || null === $last_id ) { + if ( null === $result->min || null === $result->max ) { break; - } elseif ( $first_id === $last_id ) { - $histogram[ $first_id ] = $value; + } elseif ( $result->min === $result->max ) { + $histogram[ $result->min ] = $value; } else { - $histogram[ "{$first_id}-{$last_id}" ] = $value; + $histogram[ "{$result->min}-{$result->max}" ] = $value; } - $previous_max_id = $last_id; + $previous_max_id = $result->max; } while ( true ); return $histogram; diff --git a/packages/sync/src/modules/Comments.php b/packages/sync/src/modules/Comments.php index 1233ad9c6d604..0af3c5d9949da 100644 --- a/packages/sync/src/modules/Comments.php +++ b/packages/sync/src/modules/Comments.php @@ -24,6 +24,28 @@ public function name() { return 'comments'; } + /** + * The id field in the database. + * + * @access public + * + * @return string + */ + public function id_field() { + return 'comment_ID'; + } + + /** + * The table in the database. + * + * @access public + * + * @return string + */ + public function table_name() { + return 'comments'; + } + /** * Retrieve a comment by its ID. * @@ -223,12 +245,12 @@ public function estimate_full_sync_actions( $config ) { /** * Retrieve the WHERE SQL clause based on the module config. * - * @access private + * @access public * * @param array $config Full sync configuration for this sync module. * @return string WHERE SQL clause, or `null` if no comments are specified in the module config. */ - private function get_where_sql( $config ) { + public function get_where_sql( $config ) { if ( is_array( $config ) ) { return 'comment_ID IN (' . implode( ',', array_map( 'intval', $config ) ) . 
')'; } diff --git a/packages/sync/src/modules/Module.php b/packages/sync/src/modules/Module.php index 7ba5255d05ef7..d2142b8ab95f6 100644 --- a/packages/sync/src/modules/Module.php +++ b/packages/sync/src/modules/Module.php @@ -8,6 +8,7 @@ namespace Automattic\Jetpack\Sync\Modules; use Automattic\Jetpack\Sync\Listener; +use Automattic\Jetpack\Sync\Replicastore; /** * Basic methods implemented by Jetpack Sync extensions. @@ -33,6 +34,28 @@ abstract class Module { */ abstract public function name(); + /** + * The id field in the database. + * + * @access public + * + * @return string + */ + public function id_field() { + return 'ID'; + } + + /** + * The table in the database. + * + * @access public + * + * @return string|bool + */ + public function table_name() { + return false; + } + // phpcs:disable VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable /** @@ -373,4 +396,63 @@ public function get_objects_by_id( $object_type, $ids ) { return $objects; } + + /** + * Gets a list of minimum and maximum object ids for each batch based on the given batch size. + * + * @access public + * + * @param int $batch_size The batch size for objects. + * @param string|bool $where_sql The sql where clause minus 'WHERE', or false if no where clause is needed. + * + * @return array|bool An array of min and max ids for each batch. FALSE if no table can be found. + */ + public function get_min_max_object_ids_for_batches( $batch_size, $where_sql = false ) { + global $wpdb; + + if ( ! $this->table_name() ) { + return false; + } + + $results = array(); + $table = $wpdb->{$this->table_name()}; + $current_max = 0; + $current_min = 1; + $id_field = $this->id_field(); + $replicastore = new Replicastore(); + + $total = $replicastore->get_min_max_object_id( + $id_field, + $table, + $where_sql, + false + ); + + while ( $total->max > $current_max ) { + $where = $where_sql ? + $where_sql . 
" AND $id_field > $current_max" : + "$id_field > $current_max"; + $result = $replicastore->get_min_max_object_id( + $id_field, + $table, + $where, + $batch_size + ); + if ( empty( $result->min ) && empty( $result->max ) ) { + // Our query produced no min and max. We can assume the min from the previous query, + // and the total max we found in the initial query. + $current_max = (int) $total->max; + $result = (object) array( + 'min' => $current_min, + 'max' => $current_max, + ); + } else { + $current_min = (int) $result->min; + $current_max = (int) $result->max; + } + $results[] = $result; + } + + return $results; + } } diff --git a/packages/sync/src/modules/Posts.php b/packages/sync/src/modules/Posts.php index 8892d9b6e8aec..516a630e212bd 100644 --- a/packages/sync/src/modules/Posts.php +++ b/packages/sync/src/modules/Posts.php @@ -74,6 +74,17 @@ public function name() { return 'posts'; } + /** + * The table in the database. + * + * @access public + * + * @return string + */ + public function table_name() { + return 'posts'; + } + /** * Retrieve a post by its ID. * @@ -217,12 +228,12 @@ public function estimate_full_sync_actions( $config ) { /** * Retrieve the WHERE SQL clause based on the module config. * - * @access private + * @access public * * @param array $config Full sync configuration for this sync module. * @return string WHERE SQL clause, or `null` if no comments are specified in the module config. */ - private function get_where_sql( $config ) { + public function get_where_sql( $config ) { $where_sql = Settings::get_blacklisted_post_types_sql(); // Config is a list of post IDs to sync. @@ -642,4 +653,18 @@ public function expand_post_ids( $args ) { $previous_interval_end, ); } + + /** + * Gets a list of minimum and maximum object ids for each batch based on the given batch size. + * + * @access public + * + * @param int $batch_size The batch size for objects. 
+ * @param string|bool $where_sql The sql where clause minus 'WHERE', or false if no where clause is needed. + * + * @return array|bool An array of min and max ids for each batch. FALSE if no table can be found. + */ + public function get_min_max_object_ids_for_batches( $batch_size, $where_sql = false ) { + return parent::get_min_max_object_ids_for_batches( $batch_size, $this->get_where_sql( false ) ); + } } diff --git a/packages/sync/src/modules/Term_Relationships.php b/packages/sync/src/modules/Term_Relationships.php index 6aa7ba8a3dcc5..7599a82b6f681 100644 --- a/packages/sync/src/modules/Term_Relationships.php +++ b/packages/sync/src/modules/Term_Relationships.php @@ -26,6 +26,28 @@ public function name() { return 'term_relationships'; } + /** + * The id field in the database. + * + * @access public + * + * @return string + */ + public function id_field() { + return 'object_id'; + } + + /** + * The table in the database. + * + * @access public + * + * @return string + */ + public function table_name() { + return 'term_relationships'; + } + /** * Initialize term relationships action listeners for full sync. * diff --git a/packages/sync/src/modules/Terms.php b/packages/sync/src/modules/Terms.php index a5502a1d78d87..36afc5d76b521 100644 --- a/packages/sync/src/modules/Terms.php +++ b/packages/sync/src/modules/Terms.php @@ -34,6 +34,28 @@ public function name() { return 'terms'; } + /** + * The id field in the database. + * + * @access public + * + * @return string + */ + public function id_field() { + return 'term_id'; + } + + /** + * The table in the database. + * + * @access public + * + * @return string + */ + public function table_name() { + return 'terms'; + } + /** * Allows WordPress.com servers to retrieve term-related objects via the sync API. * @@ -133,12 +155,12 @@ public function enqueue_full_sync_actions( $config, $max_items_to_enqueue, $stat /** * Retrieve the WHERE SQL clause based on the module config. 
* - * @access private + * @access public * * @param array $config Full sync configuration for this sync module. * @return string WHERE SQL clause, or `null` if no comments are specified in the module config. */ - private function get_where_sql( $config ) { + public function get_where_sql( $config ) { $where_sql = Settings::get_blacklisted_taxonomies_sql(); if ( is_array( $config ) ) { diff --git a/packages/sync/src/modules/Users.php b/packages/sync/src/modules/Users.php index be46841459034..21974a5bf77f9 100644 --- a/packages/sync/src/modules/Users.php +++ b/packages/sync/src/modules/Users.php @@ -41,6 +41,17 @@ public function name() { return 'users'; } + /** + * The table in the database. + * + * @access public + * + * @return string + */ + public function table_name() { + return 'users'; + } + /** * Retrieve a user by its ID. * This is here to support the backfill API. @@ -635,12 +646,12 @@ public function estimate_full_sync_actions( $config ) { /** * Retrieve the WHERE SQL clause based on the module config. * - * @access private + * @access public * * @param array $config Full sync configuration for this sync module. * @return string WHERE SQL clause, or `null` if no comments are specified in the module config. */ - private function get_where_sql( $config ) { + public function get_where_sql( $config ) { global $wpdb; $query = "meta_key = '{$wpdb->prefix}capabilities'";