perf(dict): improve dict compile performance (#663)

* refactor(utilities.cc): speed up processing of Checksum
* feat,refactor(dict): tweak to speed up EncodePhrase
  - Replace map with unordered_map for related data structs
  - Replace boost::lexical_cast with std::stox
  This may save 8%–10% of the build time of the dictionaries.
* feat(algo): add custom string algorithms
* refactor(algo,dict): apply custom string algorithms to speed up some operations
* refactor(strings.cc/.h): remove unnecessary `RIME_API` modifiers
rime · Jun 23, 2023 · bd3c7c7 · bd3c7c7
1 parent 6772e2d
commit bd3c7c7
Show file tree

Hide file tree

Showing 7 changed files with 97 additions and 17 deletions.
diff --git a/src/rime/algo/encoder.cc b/src/rime/algo/encoder.cc
@@ -8,21 +8,20 @@
 #include <utf8.h>
 #include <rime/config.h>
 #include <rime/algo/encoder.h>
+#include <rime/algo/strings.h>
 
 namespace rime {
 
 static const int kEncoderDfsLimit = 32;
 static const int kMaxPhraseLength = 32;
 
 string RawCode::ToString() const {
-  return boost::join(*this, " ");
+  return strings::join(*this, " ");  // concatenate this object's elements, separated by single spaces
 }
 
 void RawCode::FromString(const string &code_str) {
-  boost::split(*dynamic_cast<vector<string> *>(this),
-               code_str,
-               boost::algorithm::is_space(),
-               boost::algorithm::token_compress_on);
+  *dynamic_cast<vector<string> *>(this) =  // replace the vector<string> contents of this object with the parsed tokens
+    strings::split(code_str, " ");  // NOTE(review): splits on ' ' only and drops empty tokens; the removed call matched any is_space() character
 }
 
 TableEncoder::TableEncoder(PhraseCollector* collector)

diff --git a/src/rime/algo/strings.cc b/src/rime/algo/strings.cc
@@ -0,0 +1,37 @@
+#include <rime/algo/strings.h>
+
+namespace rime {
+namespace strings {
+
+vector<string> split(const string& str, const string& delim, SplitBehavior behavior) {  // Splits str at every character contained in delim (delim is a character set, not a multi-character separator).
+  vector<string> strings;  // tokens collected so far
+  size_t lastPos, pos;  // lastPos: start of the current token; pos: index of the delimiter that ends it
+  if (behavior == SplitBehavior::SkipEmpty) {
+    lastPos = str.find_first_not_of(delim, 0);  // skip leading delimiters; npos when str holds no token at all
+  } else {
+    lastPos = 0;  // KeepEmpty: a leading delimiter produces an empty first token
+  }
+  pos = str.find_first_of(delim, lastPos);
+
+  while (std::string::npos != pos || std::string::npos != lastPos) {  // stop once neither a delimiter nor a token start remains
+    strings.emplace_back(str.substr(lastPos, pos - lastPos));  // with pos == npos the count is huge and substr takes the rest of str
+    if (behavior == SplitBehavior::SkipEmpty) {
+        lastPos = str.find_first_not_of(delim, pos);  // jump over the whole run of delimiters (npos ends the loop)
+    } else {
+        if (pos == std::string::npos) {  // the final token has just been emitted
+            break;
+        }
+        lastPos = pos + 1;  // step over exactly one delimiter so adjacent delimiters yield empty tokens
+    }
+        pos = str.find_first_of(delim, lastPos);
+    }
+    return strings;
+};  // NOTE(review): the ';' after the function body is redundant
+
+vector<string> split(const string& str, const string& delim) {  // Convenience overload of split() with a fixed empty-token policy.
+    return split(str, delim, SplitBehavior::SkipEmpty);  // empty tokens are dropped; callers that need positional fields must pass KeepEmpty to the 3-argument overload
+};  // NOTE(review): the ';' after the function body is redundant
+
+} // namespace strings
+} // namespace rime
+
diff --git a/src/rime/algo/strings.h b/src/rime/algo/strings.h
@@ -0,0 +1,43 @@
+#ifndef RIME_STRINGS_H_
+#define RIME_STRINGS_H_
+
+#include <rime/common.h>
+#include <initializer_list>
+
+namespace rime {
+namespace strings {
+
+enum class SplitBehavior { KeepEmpty, SkipEmpty };  // selects whether split() keeps or drops empty tokens
+
+vector<string> split(const string& str, const string& delim, SplitBehavior behavior);  // split str at the characters of delim with an explicit empty-token policy
+
+vector<string> split(const string& str, const string& delim);  // convenience overload; its definition in strings.cc forwards with SkipEmpty
+
+template <typename Iter, typename T>  // Iter: iterator over the pieces; T: type of the separator
+string join(Iter start, Iter end, T &&delim) {  // Concatenates the elements of [start, end), inserting delim between consecutive elements.
+    string result;
+    if (start != end) {  // the first element is appended without a leading separator
+        result += (*start);
+        start++;
+    }
+    for (; start != end; start++) {  // every remaining element is preceded by delim
+        result += (delim);
+        result += (*start);
+    }
+    return result;  // empty string when the range is empty
+}
+
+template <typename C, typename T>  // C: any type usable with std::begin/std::end
+inline string join(C &&container, T &&delim) {  // Joins all elements of container, separated by delim.
+    return join(std::begin(container), std::end(container), delim);  // forwards to the iterator-range overload
+}
+
+template <typename C, typename T>  // C: element type of the initializer list
+inline string join(std::initializer_list<C> &&container, T &&delim) {  // Joins the elements of an initializer list, separated by delim.
+    return join(std::begin(container), std::end(container), delim);  // forwards to the iterator-range overload
+}
+
+} // namespace strings
+} // namespace rime
+
+#endif // RIME_STRINGS_H_
diff --git a/src/rime/algo/utilities.cc b/src/rime/algo/utilities.cc
@@ -5,6 +5,7 @@
 // 2013-01-30 GONG Chen <chen.sst@gmail.com>
 //
 #include <fstream>
+#include <sstream>
 #include <boost/algorithm/string.hpp>
 #include <rime/algo/utilities.h>
 
@@ -35,8 +36,9 @@ ChecksumComputer::ChecksumComputer(uint32_t initial_remainder)
 
 void ChecksumComputer::ProcessFile(const string& file_name) {
   std::ifstream fin(file_name.c_str());
-  string file_content((std::istreambuf_iterator<char>(fin)),
-                           std::istreambuf_iterator<char>());
+  std::stringstream buffer;  // in-memory buffer that receives the whole file
+  buffer << fin.rdbuf();  // copy the file's stream buffer in one operation instead of iterating character by character
+  const auto& file_content(buffer.str());  // str() returns a temporary string; binding it to a const reference keeps it alive for this scope
   crc_.process_bytes(file_content.data(), file_content.length());
 }
 

diff --git a/src/rime/dict/entry_collector.cc b/src/rime/dict/entry_collector.cc
@@ -7,6 +7,7 @@
 #include <fstream>
 #include <boost/algorithm/string.hpp>
 #include <boost/lexical_cast.hpp>
+#include <rime/algo/strings.h>
 #include <rime/dict/dict_settings.h>
 #include <rime/dict/entry_collector.h>
 #include <rime/dict/preset_vocabulary.h>
@@ -86,9 +87,7 @@ void EntryCollector::Collect(const string& dict_file) {
       continue;
     }
     // read a dict entry
-    vector<string> row;
-    boost::algorithm::split(row, line,
-                            boost::algorithm::is_any_of("\t"));
+    auto row = strings::split(line, "\t");
     int num_columns = static_cast<int>(row.size());
     if (num_columns <= text_column || row[text_column].empty()) {
       LOG(WARNING) << "Missing entry text at #" << num_entries << ".";
@@ -165,7 +164,7 @@ void EntryCollector::CreateEntry(const string &word,
   if (scaled) {
     double percentage = 100.0;
     try {
-      percentage = boost::lexical_cast<double>(
+      percentage = std::stod(
           weight_str.substr(0, weight_str.length() - 1));
     }
     catch (...) {
@@ -176,7 +175,7 @@ void EntryCollector::CreateEntry(const string &word,
   }
   else if (!weight_str.empty()) {  // absolute weight
     try {
-      e.weight = boost::lexical_cast<double>(weight_str);
+      e.weight = std::stod(weight_str);
     }
     catch (...) {
       LOG(WARNING) << "invalid entry definition at #" << num_entries << ".";
@@ -212,16 +211,16 @@ void EntryCollector::CreateEntry(const string &word,
 
 bool EntryCollector::TranslateWord(const string& word,
                                    vector<string>* result) {
-  ReverseLookupTable::const_iterator s = stems.find(word);
+  const auto& s = stems.find(word);
   if (s != stems.end()) {
     for (const string& stem : s->second) {
       result->push_back(stem);
     }
     return true;
   }
-  WordMap::const_iterator w = words.find(word);
+  const auto& w = words.find(word);
   if (w != words.end()) {
-    for (const auto& v : w->second) {
+    for (const auto& v : w->second) {  
       const double kMinimalWeight = 0.05;  // 5%
       double min_weight = total_weight[word] * kMinimalWeight;
       if (v.second < min_weight)

diff --git a/src/rime/dict/entry_collector.h b/src/rime/dict/entry_collector.h
@@ -24,7 +24,7 @@ struct RawDictEntry {
 // code -> weight
 using WeightMap = map<string, double>;
 // word -> { code -> weight }
-using WordMap = map<string, WeightMap>;
+using WordMap = hash_map<string, WeightMap>;
 // [ (word, weight), ... ]
 using EncodeQueue = std::queue<pair<string, string>>;
 

diff --git a/src/rime/dict/vocabulary.h b/src/rime/dict/vocabulary.h
@@ -91,7 +91,7 @@ class Vocabulary : public map<int, VocabularyPage> {
 };
 
 // word -> { code, ... }
-using ReverseLookupTable = map<string, set<string>>;
+using ReverseLookupTable = hash_map<string, set<string>>;
 
 }  // namespace rime