Skip to content

Commit

Permalink
perf(dict): improve dict compile performance (#663)
Browse files Browse the repository at this point in the history
* refactor(utilities.cc): speed up processing of Checksum

* feat,refactor(dict): tweak to speed up EncodePhrase

- Replace map with unordered_map for related data structs
- Replace boost::lexical_cast with std::stod

This may save 8 % ~ 10 % of the build time of the dictionaries.

* feat(algo): add custom string algorithms

* refactor(algo,dict): apply custom string algorithms to speed up some operations

* refactor(strings.cc/.h): remove unnecessary `RIME_API` modifiers
  • Loading branch information
WhiredPlanck authored Jun 23, 2023
1 parent 6772e2d commit bd3c7c7
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 17 deletions.
9 changes: 4 additions & 5 deletions src/rime/algo/encoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,20 @@
#include <utf8.h>
#include <rime/config.h>
#include <rime/algo/encoder.h>
#include <rime/algo/strings.h>

namespace rime {

static const int kEncoderDfsLimit = 32;
static const int kMaxPhraseLength = 32;

string RawCode::ToString() const {
return boost::join(*this, " ");
return strings::join(*this, " ");
}

void RawCode::FromString(const string &code_str) {
boost::split(*dynamic_cast<vector<string> *>(this),
code_str,
boost::algorithm::is_space(),
boost::algorithm::token_compress_on);
*dynamic_cast<vector<string> *>(this) =
strings::split(code_str, " ");
}

TableEncoder::TableEncoder(PhraseCollector* collector)
Expand Down
37 changes: 37 additions & 0 deletions src/rime/algo/strings.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include <rime/algo/strings.h>

namespace rime {
namespace strings {

// Splits `str` into tokens. Every character contained in `delim` is treated
// as a separator (the delimiter string is a character set, not a substring).
//
// With SplitBehavior::SkipEmpty, consecutive separators count as one and
// leading/trailing separators are ignored, so no empty token is produced and
// an empty or all-separator input yields an empty vector.
// With SplitBehavior::KeepEmpty, each separator ends a token, so adjacent,
// leading or trailing separators yield empty strings and an empty input
// yields a single empty string.
vector<string> split(const string& str, const string& delim, SplitBehavior behavior) {
  const bool skip_empty = (behavior == SplitBehavior::SkipEmpty);
  vector<string> tokens;

  // token_begin: index of the first character of the current token.
  // token_end: index of the separator that terminates it, or npos.
  size_t token_begin = skip_empty ? str.find_first_not_of(delim, 0) : 0;
  size_t token_end = str.find_first_of(delim, token_begin);

  while (token_end != std::string::npos || token_begin != std::string::npos) {
    // When token_end is npos the length wraps around to a huge value and
    // substr() clamps it to the end of the string.
    tokens.emplace_back(str.substr(token_begin, token_end - token_begin));
    if (skip_empty) {
      // Jump over the whole run of separators; npos here ends the loop.
      token_begin = str.find_first_not_of(delim, token_end);
    } else if (token_end == std::string::npos) {
      // The last token has just been stored.
      break;
    } else {
      token_begin = token_end + 1;
    }
    token_end = str.find_first_of(delim, token_begin);
  }
  return tokens;
}

// Convenience overload: splits `str` on any character of `delim`,
// discarding empty tokens (SplitBehavior::SkipEmpty).
vector<string> split(const string& str, const string& delim) {
  constexpr SplitBehavior kDefaultBehavior = SplitBehavior::SkipEmpty;
  return split(str, delim, kDefaultBehavior);
}

} // namespace strings
} // namespace rime

43 changes: 43 additions & 0 deletions src/rime/algo/strings.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#ifndef RIME_STRINGS_H_
#define RIME_STRINGS_H_

#include <rime/common.h>
#include <initializer_list>

namespace rime {
namespace strings {

// Selects how split() handles empty tokens.
//   KeepEmpty: every delimiter character terminates a token, so adjacent,
//              leading or trailing delimiters produce empty strings.
//   SkipEmpty: a run of delimiter characters acts as a single separator and
//              empty tokens are never produced.
enum class SplitBehavior { KeepEmpty, SkipEmpty };

// Splits `str` at every character that occurs in `delim` (the delimiter
// string is used as a set of characters) and returns the tokens in order.
// `behavior` decides whether empty tokens are kept or dropped.
vector<string> split(const string& str, const string& delim, SplitBehavior behavior);

// Same as the overload above, using SplitBehavior::SkipEmpty.
vector<string> split(const string& str, const string& delim);

// Concatenates the elements of the range [start, end), inserting `delim`
// between consecutive elements. Returns an empty string for an empty range.
// Both the elements and `delim` must be appendable to a string with `+=`.
template <typename Iter, typename T>
string join(Iter start, Iter end, T &&delim) {
  string joined;
  bool is_first = true;
  for (; start != end; ++start) {
    // The delimiter goes before every element except the first one.
    if (!is_first) {
      joined += delim;
    }
    joined += *start;
    is_first = false;
  }
  return joined;
}

// Joins all elements of `container` (anything accepted by std::begin /
// std::end) with `delim` between consecutive elements.
template <typename C, typename T>
inline string join(C &&container, T &&delim) {
  auto first = std::begin(container);
  auto last = std::end(container);
  return join(first, last, delim);
}

// Joins the elements of a braced list, e.g. join({"a", "b"}, " "),
// with `delim` between consecutive elements.
template <typename C, typename T>
inline string join(std::initializer_list<C> &&container, T &&delim) {
  auto first = std::begin(container);
  auto last = std::end(container);
  return join(first, last, delim);
}

} // namespace strings
} // namespace rime

#endif // RIME_STRINGS_H_
6 changes: 4 additions & 2 deletions src/rime/algo/utilities.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// 2013-01-30 GONG Chen <chen.sst@gmail.com>
//
#include <fstream>
#include <sstream>
#include <boost/algorithm/string.hpp>
#include <rime/algo/utilities.h>

Expand Down Expand Up @@ -35,8 +36,9 @@ ChecksumComputer::ChecksumComputer(uint32_t initial_remainder)

void ChecksumComputer::ProcessFile(const string& file_name) {
std::ifstream fin(file_name.c_str());
string file_content((std::istreambuf_iterator<char>(fin)),
std::istreambuf_iterator<char>());
std::stringstream buffer;
buffer << fin.rdbuf();
const auto& file_content(buffer.str());
crc_.process_bytes(file_content.data(), file_content.length());
}

Expand Down
15 changes: 7 additions & 8 deletions src/rime/dict/entry_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <fstream>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <rime/algo/strings.h>
#include <rime/dict/dict_settings.h>
#include <rime/dict/entry_collector.h>
#include <rime/dict/preset_vocabulary.h>
Expand Down Expand Up @@ -86,9 +87,7 @@ void EntryCollector::Collect(const string& dict_file) {
continue;
}
// read a dict entry
vector<string> row;
boost::algorithm::split(row, line,
boost::algorithm::is_any_of("\t"));
auto row = strings::split(line, "\t");
int num_columns = static_cast<int>(row.size());
if (num_columns <= text_column || row[text_column].empty()) {
LOG(WARNING) << "Missing entry text at #" << num_entries << ".";
Expand Down Expand Up @@ -165,7 +164,7 @@ void EntryCollector::CreateEntry(const string &word,
if (scaled) {
double percentage = 100.0;
try {
percentage = boost::lexical_cast<double>(
percentage = std::stod(
weight_str.substr(0, weight_str.length() - 1));
}
catch (...) {
Expand All @@ -176,7 +175,7 @@ void EntryCollector::CreateEntry(const string &word,
}
else if (!weight_str.empty()) { // absolute weight
try {
e.weight = boost::lexical_cast<double>(weight_str);
e.weight = std::stod(weight_str);
}
catch (...) {
LOG(WARNING) << "invalid entry definition at #" << num_entries << ".";
Expand Down Expand Up @@ -212,16 +211,16 @@ void EntryCollector::CreateEntry(const string &word,

bool EntryCollector::TranslateWord(const string& word,
vector<string>* result) {
ReverseLookupTable::const_iterator s = stems.find(word);
const auto& s = stems.find(word);
if (s != stems.end()) {
for (const string& stem : s->second) {
result->push_back(stem);
}
return true;
}
WordMap::const_iterator w = words.find(word);
const auto& w = words.find(word);
if (w != words.end()) {
for (const auto& v : w->second) {
for (const auto& v : w->second) {
const double kMinimalWeight = 0.05; // 5%
double min_weight = total_weight[word] * kMinimalWeight;
if (v.second < min_weight)
Expand Down
2 changes: 1 addition & 1 deletion src/rime/dict/entry_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ struct RawDictEntry {
// code -> weight
using WeightMap = map<string, double>;
// word -> { code -> weight }
using WordMap = map<string, WeightMap>;
using WordMap = hash_map<string, WeightMap>;
// [ (word, weight), ... ]
using EncodeQueue = std::queue<pair<string, string>>;

Expand Down
2 changes: 1 addition & 1 deletion src/rime/dict/vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ class Vocabulary : public map<int, VocabularyPage> {
};

// word -> { code, ... }
using ReverseLookupTable = map<string, set<string>>;
using ReverseLookupTable = hash_map<string, set<string>>;

} // namespace rime

Expand Down

0 comments on commit bd3c7c7

Please sign in to comment.