Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: spelling correction #228

Merged
merged 25 commits into from
Dec 14, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ else()
endif()

if(BUILD_SHARED_LIBS)
add_library(rime ${rime_src})
add_library(rime ${rime_src} rime/algo/corrector.h rime/algo/corrector.cc)
lotem marked this conversation as resolved.
Show resolved Hide resolved
target_link_libraries(rime ${rime_deps})
set_target_properties(rime PROPERTIES DEFINE_SYMBOL "RIME_EXPORTS")
set_target_properties(rime PROPERTIES VERSION ${rime_version} SOVERSION ${rime_soversion})
Expand Down
2 changes: 2 additions & 0 deletions src/rime/algo/algebra.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class Script : public map<string, vector<Spelling>> {
const SpellingProperties& sp,
const vector<Spelling>& v);
void Dump(const string& file_name) const;


};

class Projection {
Expand Down
34 changes: 34 additions & 0 deletions src/rime/algo/corrector.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//
// Copyright RIME Developers
// Distributed under the BSD License
//
// Created by nameoverflow on 2018/11/14.
//

#include "corrector.h"

using namespace rime;

// Recursively records in `result` the strings obtained from `deleted` by
// erasing one character at a time (at most `ed` erasures deep), each one
// mapped to a kCorrection spelling of `origin`. Defined further down.
void DFSCollect(const string &origin, const string &deleted, size_t ed, Script &result);

// Builds a Script that maps every string reachable from a syllabary entry by
// deleting at most `edit_distance` characters back to that entry.
Script CorrectionCollector::Collect(size_t edit_distance) {
  Script corrections;
  // Each entry is both the correction target and the starting string.
  for (auto &syllable : syllabary_)
    DFSCollect(syllable, syllable, edit_distance, corrections);
  return corrections;
}

void DFSCollect(const string &origin, const string &deleted, size_t ed, Script &result) {
if (ed <= 0) return;
for (size_t i = 0; i < deleted.size(); i++) {
string temp = deleted;
temp.erase(i, 1);
Spelling spelling(origin);
spelling.properties.type = kCorrection;
result[temp].push_back(spelling);
lotem marked this conversation as resolved.
Show resolved Hide resolved
DFSCollect(origin, temp, ed - 1, result);
}
}
37 changes: 37 additions & 0 deletions src/rime/algo/corrector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//
// Copyright RIME Developers
// Distributed under the BSD License
//
// Created by nameoverflow on 2018/11/14.
//

#ifndef RIME_CORRECTOR_H
#define RIME_CORRECTOR_H

#include <rime/common.h>
#include <rime/dict/vocabulary.h>
#include <rime/dict/prism.h>
#include "spelling.h"
#include "algebra.h"

namespace rime {

// Produces correction spellings for the entries of a syllabary.
//
// The syllabary is held by reference, not copied: the object passed to the
// constructor must outlive the collector.
class CorrectionCollector {
 public:
  explicit CorrectionCollector(const Syllabary& syllabary)
      : syllabary_(syllabary) {}

  // Returns a Script of correction spellings generated from the syllabary,
  // using `edit_distance` as the limit handed to the generator.
  Script Collect(size_t edit_distance);

 private:
  const Syllabary& syllabary_;  // borrowed; see lifetime note above
};

// A Prism subclass reserved for correction data. It currently declares no
// members of its own, so it behaves exactly like Prism.
class Corrector : public Prism {
public:

};


} // namespace rime

#endif //RIME_CORRECTOR_H
2 changes: 1 addition & 1 deletion src/rime/algo/spelling.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace rime {

enum SpellingType { kNormalSpelling, kFuzzySpelling,
kAbbreviation, kCompletion, kAmbiguousSpelling,
kInvalidSpelling };
kCorrection, kInvalidSpelling };

struct SpellingProperties {
SpellingType type = kNormalSpelling;
Expand Down
3 changes: 2 additions & 1 deletion src/rime/algo/syllabifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ using VertexQueue = std::priority_queue<Vertex,

int Syllabifier::BuildSyllableGraph(const string &input,
Prism &prism,
SyllableGraph *graph) {
SyllableGraph *graph,
optional<Prism&> corretion) {
if (input.empty())
return 0;

Expand Down
3 changes: 2 additions & 1 deletion src/rime/algo/syllabifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ class Syllabifier {

RIME_API int BuildSyllableGraph(const string &input,
Prism &prism,
SyllableGraph *graph);
SyllableGraph *graph,
optional<Prism&> corretion);

protected:
void CheckOverlappedSpellings(SyllableGraph *graph,
Expand Down
2 changes: 2 additions & 0 deletions src/rime/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include <boost/optional>
#define BOOST_BIND_NO_PLACEHOLDERS
#ifdef BOOST_SIGNALS2
#include <boost/signals2/connection.hpp>
Expand Down Expand Up @@ -47,6 +48,7 @@ using std::pair;
using std::set;
using std::string;
using std::vector;
using boost::optional;

template <class Key, class T>
using hash_map = std::unordered_map<Key, T>;
Expand Down
30 changes: 28 additions & 2 deletions src/rime/dict/dict_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <rime/resource.h>
#include <rime/service.h>
#include <rime/algo/algebra.h>
#include <rime/algo/corrector.h>
#include <rime/algo/utilities.h>
#include <rime/dict/dictionary.h>
#include <rime/dict/dict_compiler.h>
Expand Down Expand Up @@ -212,7 +213,7 @@ bool DictCompiler::BuildPrism(const string &schema_file,
Syllabary syllabary;
if (!table_->Load() || !table_->GetSyllabary(&syllabary) || syllabary.empty())
return false;
// apply spelling algebra
// apply spelling algebra and prepare corrections (if enabled)
Script script;
if (!schema_file.empty()) {
Config config;
Expand All @@ -230,6 +231,30 @@ bool DictCompiler::BuildPrism(const string &schema_file,
script.clear();
}
}

// build corrector
int correction_level = 0;
if (config.GetInt("speller/correction_level", &correction_level) &&
correction_level > 0) {

Syllabary correct_syllabary;
if (!script.empty()) {
for (auto &v : script) {
correct_syllabary.insert(v.first);
}
} else {
correct_syllabary = syllabary;
}

CorrectionCollector collector(correct_syllabary);
auto correction_script = collector.Collect((size_t)correction_level);
correction_->Remove();
if (!correction_->Build(syllabary, &correction_script,
dict_file_checksum, schema_file_checksum) ||
!correction_->Save()) {
return false;
}
}
}
if ((options_ & kDump) && !script.empty()) {
boost::filesystem::path path(prism_->file_name());
Expand All @@ -239,12 +264,13 @@ bool DictCompiler::BuildPrism(const string &schema_file,
// build .prism.bin
{
prism_->Remove();
if (!prism_->Build(syllabary, script.empty() ? NULL : &script,
if (!prism_->Build(syllabary, script.empty() ? nullptr : &script,
dict_file_checksum, schema_file_checksum) ||
!prism_->Save()) {
return false;
}
}

return true;
}

Expand Down
1 change: 1 addition & 0 deletions src/rime/dict/dict_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class DictCompiler {

string dict_name_;
an<Prism> prism_;
an<Prism> correction_;
an<Table> table_;
int options_ = 0;
string prefix_;
Expand Down
2 changes: 1 addition & 1 deletion src/rime/dict/prism.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class Prism : public MappedFile {

RIME_API bool Load();
RIME_API bool Save();
RIME_API bool Build(const Syllabary& syllabary,
RIME_API virtual bool Build(const Syllabary& syllabary,
const Script* script = NULL,
uint32_t dict_file_checksum = 0,
uint32_t schema_file_checksum = 0);
Expand Down
54 changes: 54 additions & 0 deletions thirdparty/include/darts.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,12 @@ class DoubleArrayImpl {
std::size_t max_num_results, std::size_t length = 0,
std::size_t node_pos = 0) const;

// Performs commonPrefixSearch() for every possible one-element-deleted `key`.
lotem marked this conversation as resolved.
Show resolved Hide resolved
// key:             the key to walk; treated as NUL-terminated when `length`
//                  is 0, otherwise exactly `length` elements are read.
// results:         output array; only the first `max_num_results` matches
//                  are stored.
// max_num_results: capacity of `results`.
// node_pos:        index of the node the search starts from.
// Returns the number of matches found, which may exceed `max_num_results`.
template <class U>
inline std::size_t commonPrefixDeletedSearch(const key_type *key, U *results,
std::size_t max_num_results, std::size_t length = 0,
std::size_t node_pos = 0) const;

// In Darts-clone, a dictionary is a deterministic finite-state automaton
// (DFA) and traverse() tests transitions on the DFA. The initial state is
// `node_pos' and traverse() chooses transitions labeled key[key_pos],
Expand Down Expand Up @@ -484,6 +490,54 @@ inline std::size_t DoubleArrayImpl<A, B, T, C>::commonPrefixSearch(
return num_results;
}

// Walks `key` from `node_pos` and reports every prefix of it that is stored
// in the dictionary. When `length` is 0 the key is read up to its NUL
// terminator; otherwise exactly `length` elements are read. At most
// `max_num_results` matches are written to `results`, but the return value
// counts all matches found.
//
// NOTE(review): as written, the key is traversed unmodified; no element is
// skipped or deleted during the walk.
template <typename A, typename B, typename T, typename C>
template <typename U>
inline std::size_t DoubleArrayImpl<A, B, T, C>::commonPrefixDeletedSearch(
    const key_type *key, U *results, std::size_t max_num_results,
    std::size_t length, std::size_t node_pos) const {
  // Decide once which termination rule applies to the walk.
  const bool bounded = (length != 0);
  std::size_t found = 0;

  node_pos ^= array_[node_pos].offset();
  for (std::size_t pos = 0;
       bounded ? (pos < length) : (key[pos] != '\0');
       ++pos) {
    const uchar_type label = static_cast<uchar_type>(key[pos]);
    node_pos ^= label;
    unit_type unit = array_[node_pos];
    if (unit.label() != label) {
      break;  // no transition for this element: the walk ends here
    }

    node_pos ^= unit.offset();
    if (!unit.has_leaf()) {
      continue;
    }
    // Store the match only while there is room, but always count it.
    if (found < max_num_results) {
      set_result(&results[found], static_cast<value_type>(
          array_[node_pos].value()), pos + 1);
    }
    ++found;
  }

  return found;
}

template <typename A, typename B, typename T, typename C>
inline typename DoubleArrayImpl<A, B, T, C>::value_type
DoubleArrayImpl<A, B, T, C>::traverse(const key_type *key,
Expand Down