Skip to content

Commit

Permalink
Jetpack Sync: New endpoint to retrieve an ID range. (#12671)
Browse files Browse the repository at this point in the history
* Jetpack Sync:

Adds an endpoint to retrieve the minimum and maximum ID from the database table corresponding to the given object type.

* Let's get a range of min / max ids based on batch size

* improve data typing

* removing permission check for now

* Basing it off sync modules

* Sync: The replica store already had some min/max logic, so I pulled that into its own method that can be used by individual modules

* We can infer  from the table name. Thanks @tyxla

* Update packages/sync/src/Replicastore.php

Co-Authored-By: Marin Atanasov <8436925+tyxla@users.noreply.github.com>

* Fixing documentation

* Extra whitespace

* improve comments

* This endpoint should be a sync endpoint

* Adjust poopy syntax

* fix some wonky syntax

* White space fixes

* Update json-endpoints/jetpack/class.jetpack-json-api-sync-endpoint.php

Co-Authored-By: Marin Atanasov <8436925+tyxla@users.noreply.github.com>

* Update packages/sync/src/Replicastore.php

Co-Authored-By: Marin Atanasov <8436925+tyxla@users.noreply.github.com>
  • Loading branch information
roccotripaldi and tyxla authored Aug 5, 2019
1 parent 237b51b commit d422e33
Show file tree
Hide file tree
Showing 9 changed files with 294 additions and 25 deletions.
33 changes: 33 additions & 0 deletions json-endpoints/jetpack/class.jetpack-json-api-sync-endpoint.php
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,36 @@ protected function result() {
);
}
}

/**
 * Sync endpoint that reports the minimum and maximum object ids for each
 * batch of a sync module's objects.
 */
class Jetpack_JSON_API_Sync_Object_Id_Range extends Jetpack_JSON_API_Sync_Endpoint {
	/**
	 * Builds the endpoint response.
	 *
	 * Reads `sync_module` and `batch_size` from the query arguments, rejects
	 * module names outside the allow-list, and asks the module for its ranges.
	 *
	 * @return array|WP_Error Array with a single `ranges` key holding whatever the
	 *                        module's get_min_max_object_ids_for_batches() returns,
	 *                        or a WP_Error (status 400) when the module cannot be used.
	 */
	protected function result() {
		$args = $this->query_args();

		$module_name = $args['sync_module'];
		$batch_size  = $args['batch_size'];

		if ( ! $this->is_valid_sync_module( $module_name ) ) {
			return new WP_Error( 'invalid_module', 'This sync module cannot be used to calculate a range.', 400 );
		}

		$module = Modules::get_module( $module_name );

		// Being on the allow-list does not prove the module is actually loaded on this site.
		// NOTE(review): Modules::get_module() is defined elsewhere; presumably it returns a
		// non-object (e.g. false) when no module matches — confirm. Without this guard the
		// method call below would be a fatal error in that case.
		if ( ! is_object( $module ) ) {
			return new WP_Error( 'invalid_module', 'This sync module cannot be used to calculate a range.', 400 );
		}

		return array(
			'ranges' => $module->get_min_max_object_ids_for_batches( $batch_size ),
		);
	}

	/**
	 * Whether the given sync module name may be used to calculate id ranges.
	 *
	 * The comparison is strict, so only the exact strings listed below pass.
	 *
	 * @param mixed $module_name Module name taken from the request.
	 *
	 * @return bool True when the name is on the allow-list.
	 */
	protected function is_valid_sync_module( $module_name ) {
		return in_array(
			$module_name,
			array(
				'comments',
				'posts',
				'terms',
				'term_relationships',
				'users',
			),
			true
		);
	}
}
20 changes: 20 additions & 0 deletions json-endpoints/jetpack/json-api-jetpack-endpoints.php
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,26 @@
'example_request' => 'https://public-api.wordpress.com/rest/v1.1/sites/example.wordpress.org/sync/unlock'
) );

// GET /sites/%s/sync/object-id-range
// Registers the endpoint served by Jetpack_JSON_API_Sync_Object_Id_Range.
// It takes two query parameters, `batch_size` and `sync_module`, and responds with a `ranges` array.
// NOTE(review): the `(int=1000)` / `(string=posts)` notation presumably declares the defaults
// applied when a parameter is omitted — confirm against the endpoint base class.
new Jetpack_JSON_API_Sync_Object_Id_Range( array(
'description' => 'Gets minimum and maximum object ids for each batch of given batch size.',
'method' => 'GET',
'path' => '/sites/%s/sync/object-id-range',
'group' => '__do_not_document',
'stat' => 'sync-object-id-range',
'path_labels' => array(
'$site' => '(int|string) The site ID, The site domain'
),
'query_parameters' => array(
'batch_size' => '(int=1000) The amount of objects per batch.',
'sync_module' => '(string=posts) The sync module used to enumerate the ranges.',
),
'response_format' => array(
'ranges' => '(array) An array of range objects with min and max properties for each batch.',
),
'example_request' => 'https://public-api.wordpress.com/rest/v1.1/sites/example.wordpress.org/sync/object-id-range?batch_size=100&sync_module=comments'
) );

// POST /sites/%s/sync/checkout
new Jetpack_JSON_API_Sync_Checkout_Endpoint( array(
'description' => 'Locks the queue and returns items and the buffer ID.',
Expand Down
66 changes: 49 additions & 17 deletions packages/sync/src/Replicastore.php
Original file line number Diff line number Diff line change
Expand Up @@ -1202,6 +1202,41 @@ public function get_checksum_columns_for_object_type( $object_type ) {
}
}

/**
 * Grabs the minimum and maximum object ids for the given parameters.
 *
 * `$id_field`, `$object_table` and `$where` are interpolated into the query
 * verbatim, so they must be built by trusted code and never taken from request input.
 *
 * @access public
 *
 * @param string      $id_field     The id column in the table to query.
 * @param string      $object_table The table to query.
 * @param string|bool $where        A sql where clause without 'WHERE'. A falsy value means no filtering.
 * @param int|bool    $bucket_size  The maximum amount of objects to include in the query.
 *                                  A value that is not a positive integer means no limit.
 *                                  For `term_relationships` table, the bucket size will refer to the amount
 *                                  of distinct object ids. This will likely include more database rows than
 *                                  the bucket size implies.
 *
 * @return object|null An object with `min` and `max` properties (both null when no row matches),
 *                     or null when the query itself fails (see wpdb::get_row()).
 */
public function get_min_max_object_id( $id_field, $object_table, $where, $bucket_size ) {
	global $wpdb;

	// The term relationship table's unique key is a combination of 2 columns. `DISTINCT` helps us get a more accurate query.
	$distinct_sql = ( $wpdb->term_relationships === $object_table ) ? 'DISTINCT' : '';
	$where_sql    = $where ? "WHERE $where" : '';

	// The limit ends up inside the SQL string, so force it to an integer first.
	$limit = (int) $bucket_size;

	if ( $limit > 0 ) {
		// Since MIN() and MAX() do not work with LIMIT, we look at a derived dataset made of the
		// first $limit ids (in ascending order) that satisfy the $where clause.
		$from = "( SELECT $distinct_sql $id_field FROM $object_table $where_sql ORDER BY $id_field ASC LIMIT $limit ) as ids";
	} else {
		// Without a limit, we can use the actual table as a dataset. The aggregate query
		// returns a single row, so no ORDER BY is needed here.
		$from = "$object_table $where_sql";
	}

	return $wpdb->get_row(
		// phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared
		"SELECT MIN($id_field) as min, MAX($id_field) as max FROM $from"
	);
}

/**
* Retrieve the checksum histogram for a specific object type.
*
Expand Down Expand Up @@ -1278,7 +1313,8 @@ public function checksum_histogram( $object_type, $buckets, $start_id = null, $e
$previous_max_id = 0;
$histogram = array();

$where = '1=1';
// This is used for the min / max query, while $where_sql is used for the checksum query.
$where = $where_sql;

if ( $start_id ) {
$where .= " AND $id_field >= " . intval( $start_id );
Expand All @@ -1288,39 +1324,35 @@ public function checksum_histogram( $object_type, $buckets, $start_id = null, $e
$where .= " AND $id_field <= " . intval( $end_id );
}

$distinct = '';
if ( 'term_relationships' === $object_type ) {
$distinct = 'DISTINCT';
}

do {
list( $first_id, $last_id ) = $wpdb->get_row(
// phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared
"SELECT MIN($id_field) as min_id, MAX($id_field) as max_id FROM ( SELECT $distinct $id_field FROM $object_table WHERE $where AND $id_field > $previous_max_id ORDER BY $id_field ASC LIMIT $bucket_size ) as ids",
ARRAY_N
$result = $this->get_min_max_object_id(
$id_field,
$object_table,
$where . " AND $id_field > $previous_max_id",
$bucket_size
);

if ( null === $first_id || null === $last_id ) {
if ( null === $result->min || null === $result->max ) {
// Nothing to checksum here...
break;
}

// Get the checksum value.
$value = $this->table_checksum( $object_table, $columns, $id_field, $where_sql, $first_id, $last_id, $strip_non_ascii, $salt );
$value = $this->table_checksum( $object_table, $columns, $id_field, $where_sql, $result->min, $result->max, $strip_non_ascii, $salt );

if ( is_wp_error( $value ) ) {
return $value;
}

if ( null === $first_id || null === $last_id ) {
if ( null === $result->min || null === $result->max ) {
break;
} elseif ( $first_id === $last_id ) {
$histogram[ $first_id ] = $value;
} elseif ( $result->min === $result->max ) {
$histogram[ $result->min ] = $value;
} else {
$histogram[ "{$first_id}-{$last_id}" ] = $value;
$histogram[ "{$result->min}-{$result->max}" ] = $value;
}

$previous_max_id = $last_id;
$previous_max_id = $result->max;
} while ( true );

return $histogram;
Expand Down
26 changes: 24 additions & 2 deletions packages/sync/src/modules/Comments.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,28 @@ public function name() {
return 'comments';
}

/**
* The id field in the database.
*
* For comments this is the `comment_ID` column. The base Module class passes
* this value as the id column when it calculates min/max id ranges.
*
* @access public
*
* @return string
*/
public function id_field() {
return 'comment_ID';
}

/**
* The table in the database.
*
* The returned value is the name of the `$wpdb` property that holds the real
* table name (the base Module class reads `$wpdb->{table_name()}`), not the
* literal table name itself.
*
* @access public
*
* @return string
*/
public function table_name() {
return 'comments';
}

/**
* Retrieve a comment by its ID.
*
Expand Down Expand Up @@ -223,12 +245,12 @@ public function estimate_full_sync_actions( $config ) {
/**
* Retrieve the WHERE SQL clause based on the module config.
*
* @access private
* @access public
*
* @param array $config Full sync configuration for this sync module.
* @return string WHERE SQL clause, or `null` if no comments are specified in the module config.
*/
private function get_where_sql( $config ) {
public function get_where_sql( $config ) {
if ( is_array( $config ) ) {
return 'comment_ID IN (' . implode( ',', array_map( 'intval', $config ) ) . ')';
}
Expand Down
82 changes: 82 additions & 0 deletions packages/sync/src/modules/Module.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
namespace Automattic\Jetpack\Sync\Modules;

use Automattic\Jetpack\Sync\Listener;
use Automattic\Jetpack\Sync\Replicastore;

/**
* Basic methods implemented by Jetpack Sync extensions.
Expand All @@ -33,6 +34,28 @@ abstract class Module {
*/
abstract public function name();

/**
* The id field in the database.
*
* Defaults to `ID`; modules whose table uses a different id column override this.
* Used as the id column by get_min_max_object_ids_for_batches().
*
* @access public
*
* @return string
*/
public function id_field() {
return 'ID';
}

/**
* The table in the database.
*
* Modules backed by a table override this to return the name of the `$wpdb`
* property holding that table's name. The default `false` means the module has
* no table, in which case get_min_max_object_ids_for_batches() returns false.
*
* @access public
*
* @return string|bool
*/
public function table_name() {
return false;
}

// phpcs:disable VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable

/**
Expand Down Expand Up @@ -373,4 +396,63 @@ public function get_objects_by_id( $object_type, $ids ) {

return $objects;
}

/**
 * Gets a list of minimum and maximum object ids for each batch based on the given batch size.
 *
 * The overall min/max is queried once, then the table is walked in ascending id
 * order, one batch per query, until the overall maximum id has been reached.
 *
 * @access public
 *
 * @param int         $batch_size The batch size for objects.
 * @param string|bool $where_sql  The sql where clause minus 'WHERE', or false if no where clause is needed.
 *
 * @return array|bool An array of objects with `min` and `max` properties, one per batch.
 *                    Empty when there is nothing to enumerate. FALSE if no table can be found.
 */
public function get_min_max_object_ids_for_batches( $batch_size, $where_sql = false ) {
	global $wpdb;

	if ( ! $this->table_name() ) {
		return false;
	}

	$results      = array();
	$table        = $wpdb->{$this->table_name()};
	$current_max  = 0;
	$current_min  = 1;
	$id_field     = $this->id_field();
	$replicastore = new Replicastore();

	// Passing false as the bucket size asks for the min/max of the whole (filtered) table.
	$total = $replicastore->get_min_max_object_id(
		$id_field,
		$table,
		$where_sql,
		false
	);

	// wpdb::get_row() gives null when the query fails; reading ->max on null would raise
	// a warning, so treat that the same as "no rows to enumerate".
	if ( ! is_object( $total ) ) {
		return $results;
	}

	while ( $total->max > $current_max ) {
		// Only look at ids above the batch we have already recorded.
		$where  = $where_sql ?
			$where_sql . " AND $id_field > $current_max" :
			"$id_field > $current_max";
		$result = $replicastore->get_min_max_object_id(
			$id_field,
			$table,
			$where,
			$batch_size
		);
		if ( empty( $result->min ) && empty( $result->max ) ) {
			// Our query produced no min and max. We can assume the min from the previous query,
			// and the total max we found in the initial query.
			// Setting $current_max to the total max also guarantees the loop terminates.
			$current_max = (int) $total->max;
			$result      = (object) array(
				'min' => $current_min,
				'max' => $current_max,
			);
		} else {
			$current_min = (int) $result->min;
			$current_max = (int) $result->max;
		}
		$results[] = $result;
	}

	return $results;
}
}
29 changes: 27 additions & 2 deletions packages/sync/src/modules/Posts.php
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,17 @@ public function name() {
return 'posts';
}

/**
* The table in the database.
*
* The returned value is the name of the `$wpdb` property that holds the real
* table name (the base Module class reads `$wpdb->{table_name()}`), not the
* literal table name itself.
*
* @access public
*
* @return string
*/
public function table_name() {
return 'posts';
}

/**
* Retrieve a post by its ID.
*
Expand Down Expand Up @@ -217,12 +228,12 @@ public function estimate_full_sync_actions( $config ) {
/**
* Retrieve the WHERE SQL clause based on the module config.
*
* @access private
* @access public
*
* @param array $config Full sync configuration for this sync module.
* @return string WHERE SQL clause, or `null` if no comments are specified in the module config.
*/
private function get_where_sql( $config ) {
public function get_where_sql( $config ) {
$where_sql = Settings::get_blacklisted_post_types_sql();

// Config is a list of post IDs to sync.
Expand Down Expand Up @@ -642,4 +653,18 @@ public function expand_post_ids( $args ) {
$previous_interval_end,
);
}

/**
 * Gets a list of minimum and maximum object ids for each batch based on the given batch size.
 *
 * The module's own WHERE clause (from get_where_sql()) is always applied. When the
 * caller also supplies `$where_sql`, both conditions must hold.
 *
 * @access public
 *
 * @param int         $batch_size The batch size for objects.
 * @param string|bool $where_sql  The sql where clause minus 'WHERE', or false if no where clause is needed.
 *
 * @return array|bool An array of min and max ids for each batch. FALSE if no table can be found.
 */
public function get_min_max_object_ids_for_batches( $batch_size, $where_sql = false ) {
	$module_where = $this->get_where_sql( false );

	if ( $where_sql ) {
		// Honour the caller's clause instead of silently dropping it. Each side is
		// parenthesised so that an OR inside either clause cannot change the meaning.
		$module_where = $module_where ?
			"( $module_where ) AND ( $where_sql )" :
			$where_sql;
	}

	return parent::get_min_max_object_ids_for_batches( $batch_size, $module_where );
}
}
22 changes: 22 additions & 0 deletions packages/sync/src/modules/Term_Relationships.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,28 @@ public function name() {
return 'term_relationships';
}

/**
* The id field in the database.
*
* For term relationships this is the `object_id` column. The base Module class
* passes this value as the id column when it calculates min/max id ranges.
*
* @access public
*
* @return string
*/
public function id_field() {
return 'object_id';
}

/**
* The table in the database.
*
* The returned value is the name of the `$wpdb` property that holds the real
* table name (the base Module class reads `$wpdb->{table_name()}`), not the
* literal table name itself.
*
* @access public
*
* @return string
*/
public function table_name() {
return 'term_relationships';
}

/**
* Initialize term relationships action listeners for full sync.
*
Expand Down
Loading

0 comments on commit d422e33

Please sign in to comment.