From d0bee8f118f0f438391c4d4b78599077a6ae4811 Mon Sep 17 00:00:00 2001 From: Dennis Lawler <4824647+drawlerr@users.noreply.github.com> Date: Tue, 18 Feb 2020 11:02:54 -0800 Subject: [PATCH] Use zeroes instead of whitespaces as padding bytes (#899) --- docs/migrate.rst | 10 +++++++ esrally/track/params.py | 2 +- tests/track/params_test.py | 54 ++++++++++++++++++-------------------- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/docs/migrate.rst b/docs/migrate.rst index eeffdf93a..7e8de6fc3 100644 --- a/docs/migrate.rst +++ b/docs/migrate.rst @@ -1,6 +1,16 @@ Migration Guide =============== +Migrating to Rally 1.4.1 +------------------------ + +Document IDs are now padded with 0 instead of spaces +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When Rally 1.4.1 generates document IDs, it will pad them with '0' instead of ' ' - '0000000000' instead of '         0', etc. +Elasticsearch has optimizations for numeric IDs, so observed performance in Elasticsearch should improve slightly. 
+ + Migrating to Rally 1.4.0 ------------------------ diff --git a/esrally/track/params.py b/esrally/track/params.py index 4d9f7a118..08618630e 100644 --- a/esrally/track/params.py +++ b/esrally/track/params.py @@ -622,7 +622,7 @@ def build_conflicting_ids(conflicts, docs_to_index, offset, shuffle=random.shuff all_ids = [0] * docs_to_index for i in range(docs_to_index): # always consider the offset as each client will index its own range and we don't want uncontrolled conflicts across clients - all_ids[i] = "%10d" % (offset + i) + all_ids[i] = "%010d" % (offset + i) if conflicts == IndexIdConflict.RandomConflicts: shuffle(all_ids) return all_ids diff --git a/tests/track/params_test.py b/tests/track/params_test.py index 7ad57600d..f182c30b0 100644 --- a/tests/track/params_test.py +++ b/tests/track/params_test.py @@ -88,34 +88,34 @@ def test_no_id_conflicts(self): def test_sequential_conflicts(self): self.assertEqual( [ - " 0", - " 1", - " 2", - " 3", - " 4", - " 5", - " 6", - " 7", - " 8", - " 9", - " 10", + '0000000000', + '0000000001', + '0000000002', + '0000000003', + '0000000004', + '0000000005', + '0000000006', + '0000000007', + '0000000008', + '0000000009', + '0000000010' ], params.build_conflicting_ids(params.IndexIdConflict.SequentialConflicts, 11, 0) ) self.assertEqual( [ - " 5", - " 6", - " 7", - " 8", - " 9", - " 10", - " 11", - " 12", - " 13", - " 14", - " 15", + '0000000005', + '0000000006', + '0000000007', + '0000000008', + '0000000009', + '0000000010', + '0000000011', + '0000000012', + '0000000013', + '0000000014', + '0000000015' ], params.build_conflicting_ids(params.IndexIdConflict.SequentialConflicts, 11, 5) ) @@ -125,18 +125,14 @@ def test_random_conflicts(self): self.assertEqual( [ - " 2", - " 1", - " 0" + '0000000002', '0000000001', '0000000000' ], params.build_conflicting_ids(params.IndexIdConflict.RandomConflicts, 3, 0, shuffle=predictable_shuffle) ) self.assertEqual( [ - " 7", - " 6", - " 5" + '0000000007', '0000000006', '0000000005' ], 
params.build_conflicting_ids(params.IndexIdConflict.RandomConflicts, 3, 5, shuffle=predictable_shuffle) ) @@ -670,7 +666,7 @@ def number_of_bulks(corpora, partition_index, total_partitions, bulk_size): def test_build_conflicting_ids(self): self.assertIsNone(params.build_conflicting_ids(params.IndexIdConflict.NoConflicts, 3, 0)) - self.assertEqual([" 0", " 1", " 2"], + self.assertEqual(["0000000000", "0000000001", "0000000002"], params.build_conflicting_ids(params.IndexIdConflict.SequentialConflicts, 3, 0)) # we cannot tell anything specific about the contents... self.assertEqual(3, len(params.build_conflicting_ids(params.IndexIdConflict.RandomConflicts, 3, 0)))