Skip to content

Commit

Permalink
Temporarily simplify magic header check in *_data_manager.cc
Browse files Browse the repository at this point in the history
Historically we have passed the entire magic number of the embedded data
file to each *_data_manager.cc file so that it can compare the values
at run time. However the problem here is that propagating non-ASCII byte
data from the build system to *.cc file is not that simple task. This is
going to be more problematic when we start using Bazel on Windows, where
the backslash character has another special path separator meaning.

As a temporary solution to unblock Bazel migration (google#948), this commit
relaxes the verification by only propagating the magic number byte
length instead of the entire byte array. Note that each data file still
has a checksum and the data integrity remains to be verifiable . The only
thing we can no longer verify is some accidental mistakes at the build
rules level, which is supposed to be unlikely to happen.

PiperOrigin-RevId: 646540433
  • Loading branch information
yukawa authored and hiroyuki-komatsu committed Jun 25, 2024
1 parent 6891660 commit 3f1570b
Show file tree
Hide file tree
Showing 12 changed files with 75 additions and 23 deletions.
10 changes: 10 additions & 0 deletions src/data_manager/data_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,16 @@ DataManager::Status DataManager::InitFromArray(absl::string_view array,
return InitFromReader(reader);
}

DataManager::Status DataManager::InitFromArray(absl::string_view array,
size_t magic_length) {
DataSetReader reader;
if (!reader.Init(array, magic_length)) {
LOG(ERROR) << "Binary data of size " << array.size() << " is broken";
return DataManager::Status::DATA_BROKEN;
}
return InitFromReader(reader);
}

DataManager::Status DataManager::InitFromReader(const DataSetReader &reader) {
const Status status = InitUserPosManagerDataFromReader(
reader, &pos_matcher_data_, &user_pos_token_array_data_,
Expand Down
2 changes: 1 addition & 1 deletion src/data_manager/data_manager.gypi
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
'<@(additional_dendencies)',
],
'defines': [
'MOZC_DATASET_MAGIC_NUMBER="<(magic_number)"',
'MOZC_DATASET_MAGIC_NUMBER_LENGTH=<(magic_number_length)',
],
},
{
Expand Down
2 changes: 2 additions & 0 deletions src/data_manager/data_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,10 @@ class DataManager : public DataManagerInterface {
// Parses |array| and extracts byte blocks of data set. The |array| must
// outlive this instance. The second version specifies a custom magic number
// to expect (e.g., mock data set has a different magic number).
// The third version specifies the length of the magic number in bytes.
Status InitFromArray(absl::string_view array);
Status InitFromArray(absl::string_view array, absl::string_view magic);
Status InitFromArray(absl::string_view array, size_t magic_length);

// The same as above InitFromArray() but the data is loaded using mmap, which
// is owned in this instance.
Expand Down
10 changes: 7 additions & 3 deletions src/data_manager/dataset_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,12 @@ bool DataSetReader::Init(absl::string_view memblock, absl::string_view magic) {
return false;
}

return Init(memblock, magic.size());
}

bool DataSetReader::Init(absl::string_view memblock, size_t magic_length) {
// Check minimum required data size.
if (memblock.size() < magic.size() + kFooterSize) {
if (memblock.size() < magic_length + kFooterSize) {
LOG(ERROR) << "Broken: data is too small";
return false;
}
Expand Down Expand Up @@ -99,7 +103,7 @@ bool DataSetReader::Init(absl::string_view memblock, absl::string_view magic) {

// Note: This subtraction doesn't cause underflow by the above check.
const uint64_t content_and_metadata_size =
memblock.size() - magic.size() - kFooterSize;
memblock.size() - magic_length - kFooterSize;
if (metadata_size == 0 || content_and_metadata_size < metadata_size) {
LOG(ERROR) << "Broken: metadata size is broken or metadata is broken";
return false;
Expand All @@ -119,7 +123,7 @@ bool DataSetReader::Init(absl::string_view memblock, absl::string_view magic) {
}

// Construct a mapping from name to data chunk.
uint64_t prev_chunk_end = magic.size();
uint64_t prev_chunk_end = magic_length;
for (int i = 0; i < metadata.entries_size(); ++i) {
const auto& e = metadata.entries(i);
if (e.offset() < prev_chunk_end || e.offset() >= metadata_offset) {
Expand Down
3 changes: 3 additions & 0 deletions src/data_manager/dataset_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ class DataSetReader {
// method doesn't verify checksum for performance. One can separately call
// VerifyChecksum().
bool Init(absl::string_view memblock, absl::string_view magic);
// A variant of Init() that takes the length of the magic number rather than
// the magic number itself.
bool Init(absl::string_view memblock, size_t magic_length);

// Gets the byte data corresponding to |name|. If the data for |name| doesn't
// exist, returns false.
Expand Down
18 changes: 17 additions & 1 deletion src/data_manager/mozc_data.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def mozc_dataset(
outs = [outs[0]],
cmd = (
"$(location //data_manager:dataset_writer_main) " +
"--magic='" + magic + "' --output=$@ " + arguments
"--magic='" + _byte_array_to_cstring(magic.value) + "' --output=$@ " + arguments
),
tools = ["//data_manager:dataset_writer_main"],
)
Expand Down Expand Up @@ -710,3 +710,19 @@ def mozc_dataset(
),
tools = ["//rewriter:gen_usage_rewriter_dictionary_main"],
)

def _byte_array_to_cstring(byte_array):
result = ""
for byte in byte_array:
if byte < 0 or byte > 255:
fail("Invalid byte: " + byte)
high = byte // 16
low = byte % 16
result += ("\\x%X%X" % (high, low))
return result

def magic_number(byte_array):
return struct(
value = byte_array,
length = len(byte_array),
)
13 changes: 11 additions & 2 deletions src/data_manager/oss/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,22 @@ load(
)
load(
"//data_manager:mozc_data.bzl",
"magic_number",
"mozc_dataset",
)

package(default_visibility = ["//visibility:public"])

# "\xEFMOZC\r\n"
MOZC_DATASET_MAGIC_NUMBER = "\\xEF\\x4D\\x4F\\x5A\\x43\\x0D\\x0A"
MOZC_DATASET_MAGIC_NUMBER = magic_number([
0xEF, # \xEF
0x4D, # M
0x4F, # O
0x5A, # Z
0x43, # C
0x0D, # <CR>
0x0A, # <LF>
])

mozc_dataset(
name = "mozc_dataset_for_oss",
Expand Down Expand Up @@ -116,7 +125,7 @@ mozc_cc_library(
],
hdrs = ["oss_data_manager.h"],
local_defines = [
"MOZC_DATASET_MAGIC_NUMBER='\"" + MOZC_DATASET_MAGIC_NUMBER + "\"'",
"MOZC_DATASET_MAGIC_NUMBER_LENGTH=%d" % MOZC_DATASET_MAGIC_NUMBER.length,
],
deps = [
"//base:embedded_file",
Expand Down
14 changes: 7 additions & 7 deletions src/data_manager/oss/oss_data_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

#include "data_manager/oss/oss_data_manager.h"

#include <iterator>
#include <cstddef>

#include "absl/log/check.h"
#include "absl/strings/string_view.h"
Expand All @@ -42,17 +42,17 @@ namespace {
// kOssMozcDataSet is embedded.
#include "data_manager/oss/mozc_data.inc"

#ifndef MOZC_DATASET_MAGIC_NUMBER
#error "MOZC_DATASET_MAGIC_NUMBER is not defined by build system"
#endif // MOZC_DATASET_MAGIC_NUMBER
#ifndef MOZC_DATASET_MAGIC_NUMBER_LENGTH
#error "MOZC_DATASET_MAGIC_NUMBER_LENGTH is not defined by build system"
#endif // MOZC_DATASET_MAGIC_NUMBER_LENGTH

constexpr char kMagicNumber[] = MOZC_DATASET_MAGIC_NUMBER;
constexpr size_t kMagicNumberLength = MOZC_DATASET_MAGIC_NUMBER_LENGTH;

} // namespace

OssDataManager::OssDataManager() {
const absl::string_view magic(kMagicNumber, std::size(kMagicNumber) - 1);
CHECK_EQ(Status::OK, InitFromArray(LoadEmbeddedFile(kOssMozcDataSet), magic))
CHECK_EQ(Status::OK, InitFromArray(LoadEmbeddedFile(kOssMozcDataSet),
kMagicNumberLength))
<< "Embedded mozc_imy.h for OSS is broken";
}

Expand Down
1 change: 1 addition & 0 deletions src/data_manager/oss/oss_data_manager.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
],
# Hex-escaped string of "\xEFMOZC\r\n"
'magic_number': "\\xEF\\x4D\\x4F\\x5A\\x43\\x0D\\x0A",
'magic_number_length': '7',
'mozc_data_varname': 'kOssMozcDataSet',
'out_mozc_data': 'mozc.data',
'out_mozc_data_header': 'mozc_data.inc',
Expand Down
10 changes: 8 additions & 2 deletions src/data_manager/testing/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,19 @@ load(
)
load(
"//data_manager:mozc_data.bzl",
"magic_number",
"mozc_dataset",
)

package(default_visibility = ["//visibility:public"])

# MOCK
MOZC_DATASET_MAGIC_NUMBER = "\\x4D\\x4F\\x43\\x4B"
MOZC_DATASET_MAGIC_NUMBER = magic_number([
0x4D, # M
0x4F, # O
0x43, # C
0x4B, # K
])

mozc_cc_library(
name = "mock_data_manager",
Expand All @@ -51,7 +57,7 @@ mozc_cc_library(
],
hdrs = ["mock_data_manager.h"],
local_defines = [
"MOZC_DATASET_MAGIC_NUMBER='\"" + MOZC_DATASET_MAGIC_NUMBER + "\"'",
"MOZC_DATASET_MAGIC_NUMBER_LENGTH=%d" % MOZC_DATASET_MAGIC_NUMBER.length,
],
deps = [
"//base:embedded_file",
Expand Down
14 changes: 7 additions & 7 deletions src/data_manager/testing/mock_data_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

#include "data_manager/testing/mock_data_manager.h"

#include <iterator>
#include <cstddef>

#include "absl/log/check.h"
#include "absl/strings/string_view.h"
Expand All @@ -42,17 +42,17 @@ namespace {
// EmbeddedFile kMockMozcDataSet is defined in this header file.
#include "data_manager/testing/mock_mozc_data.inc"

#ifndef MOZC_DATASET_MAGIC_NUMBER
#error "MOZC_DATASET_MAGIC_NUMBER is not defined by build system"
#endif // MOZC_DATASET_MAGIC_NUMBER
#ifndef MOZC_DATASET_MAGIC_NUMBER_LENGTH
#error "MOZC_DATASET_MAGIC_NUMBER_LENGTH is not defined by build system"
#endif // MOZC_DATASET_MAGIC_NUMBER_LENGTH

constexpr char kMagicNumber[] = MOZC_DATASET_MAGIC_NUMBER;
constexpr size_t kMagicNumberLength = MOZC_DATASET_MAGIC_NUMBER_LENGTH;

} // namespace

MockDataManager::MockDataManager() {
const absl::string_view magic(kMagicNumber, std::size(kMagicNumber) - 1);
CHECK_EQ(Status::OK, InitFromArray(LoadEmbeddedFile(kMockMozcDataSet), magic))
CHECK_EQ(Status::OK, InitFromArray(LoadEmbeddedFile(kMockMozcDataSet),
kMagicNumberLength))
<< "Embedded mock_mozc_data.h is broken";
}

Expand Down
1 change: 1 addition & 0 deletions src/data_manager/testing/mock_data_manager.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
'<(mozc_oss_src_dir)/data/dictionary_manual/domain.txt',
],
'magic_number': '\\x4D\\x4F\\x43\\x4B', # MOCK
'magic_number_length': '4',
'mozc_data_varname': 'kMockMozcDataSet',
'out_mozc_data': 'mock_mozc.data',
'out_mozc_data_header': 'mock_mozc_data.inc',
Expand Down

0 comments on commit 3f1570b

Please sign in to comment.