From af18b1d17d561e6c5421e992509334d8b6e2cef4 Mon Sep 17 00:00:00 2001 From: Jerzy Kozera Date: Fri, 7 Oct 2016 17:04:49 +0200 Subject: [PATCH] Better sorting of search results (#613, #100) Uses an O(m+n) algorithm based on https://github.com/bevacqua/fuzzysearch - should be faster than the one initially proposed in PR #281. --- src/app/app.pro | 3 + src/libs/registry/docset.cpp | 203 +++++++++++++++++++++++++++++-- src/libs/registry/searchresult.h | 5 +- 3 files changed, 201 insertions(+), 10 deletions(-) diff --git a/src/app/app.pro b/src/app/app.pro index 7dfd39a05..55b45a15d 100644 --- a/src/app/app.pro +++ b/src/app/app.pro @@ -17,11 +17,14 @@ unix:!macx { target.path = $$PREFIX/bin INSTALLS += target + LIBS += -lsqlite3 } win32 { TARGET = zeal RC_ICONS = resources/zeal.ico + INCLUDEPATH += $$[QT_INSTALL_PREFIX]/src/3rdparty/sqlite + SOURCES += $$[QT_INSTALL_PREFIX]/src/3rdparty/sqlite/sqlite3.c } macx { diff --git a/src/libs/registry/docset.cpp b/src/libs/registry/docset.cpp index c6e7395da..26c8cd21b 100644 --- a/src/libs/registry/docset.cpp +++ b/src/libs/registry/docset.cpp @@ -28,16 +28,49 @@ #include +#include #include #include #include #include #include #include +#include #include #include #include +#include + +#include +#include + +extern "C" { +struct sqlite3; +struct sqlite3_context; +struct sqlite3_value; +int sqlite3_value_int(sqlite3_value*); +const unsigned char *sqlite3_value_text(sqlite3_value*); +void sqlite3_result_int(sqlite3_context*, int); +#define SQLITE_UTF8 1 /* IMP: R-37514-35566 */ +int sqlite3_create_function( + sqlite3 *db, + const char *zFunctionName, + int nArg, + int eTextRep, + void *pApp, + void (*xFunc)(sqlite3_context*,int,sqlite3_value**), + void (*xStep)(sqlite3_context*,int,sqlite3_value**), + void (*xFinal)(sqlite3_context*) +); +} + +static void scoreFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +); + using namespace Zeal::Registry; namespace { @@ -255,12 +288,12 @@ QList Docset::search(const 
QString &query, const CancellationToken QString queryStr; if (m_type == Docset::Type::Dash) { - queryStr = QStringLiteral("SELECT name, type, path " + queryStr = QStringLiteral("SELECT name, type, path, zealScore('%1', name) as score " " FROM searchIndex " - "WHERE (name LIKE '%%1%' ESCAPE '\\') " - "ORDER BY name COLLATE NOCASE").arg(sanitizedQuery); + "WHERE score > 0 " + "ORDER BY score DESC").arg(sanitizedQuery); } else { - queryStr = QStringLiteral("SELECT ztokenname, ztypename, zpath, zanchor " + queryStr = QStringLiteral("SELECT ztokenname, ztypename, zpath, zanchor, zealScore('%1', ztokenname) as score " " FROM ztoken " "LEFT JOIN ztokenmetainformation " " ON ztoken.zmetainformation = ztokenmetainformation.z_pk " @@ -268,8 +301,8 @@ QList Docset::search(const QString &query, const CancellationToken " ON ztokenmetainformation.zfile = zfilepath.z_pk " "LEFT JOIN ztokentype " " ON ztoken.ztokentype = ztokentype.z_pk " - "WHERE (ztokenname LIKE '%%1%' ESCAPE '\\') " - "ORDER BY ztokenname COLLATE NOCASE").arg(sanitizedQuery); + "WHERE score > 0 " + "ORDER BY score DESC").arg(sanitizedQuery); } // Limit for very short queries. 
@@ -284,7 +317,13 @@ QList<SearchResult> Docset::search(const QString &query, const CancellationToken
         results.append({sqlQuery.value(0).toString(),
                         parseSymbolType(sqlQuery.value(1).toString()),
                         const_cast<Docset *>(this),
-                        createPageUrl(sqlQuery.value(2).toString(), sqlQuery.value(3).toString())});
+                        // Dash rows are (name, type, path, score): there is no anchor
+                        // column, so the score is at index 3 instead of 4.
+                        createPageUrl(sqlQuery.value(2).toString(),
+                                      m_type == Docset::Type::Dash
+                                          ? QString() : sqlQuery.value(3).toString()),
+                        sqlQuery.value(m_type == Docset::Type::Dash ? 3 : 4).toInt()
+                        });
     }
 
     return results;
@@ -323,7 +362,8 @@ QList<SearchResult> Docset::relatedLinks(const QUrl &url) const
         results.append({sqlQuery.value(0).toString(),
                         parseSymbolType(sqlQuery.value(1).toString()),
                         const_cast<Docset *>(this),
-                        createPageUrl(sqlQuery.value(2).toString(), sqlQuery.value(3).toString())});
+                        createPageUrl(sqlQuery.value(2).toString(), sqlQuery.value(3).toString()),
+                        0});
     }
 
     if (results.size() == 1)
@@ -334,7 +374,19 @@ QList<SearchResult> Docset::relatedLinks(const QUrl &url) const
 
 QSqlDatabase Docset::database() const
 {
-    return QSqlDatabase::database(m_name, true);
+    QSqlDatabase db = QSqlDatabase::database(m_name, true);
+
+    // Register the zealScore() SQL function on the native SQLite connection.
+    // QSqlDriver::handle() only holds a usable sqlite3* when the driver is
+    // SQLite and the connection is open, so verify that before dereferencing.
+    QVariant nativeHandle = db.driver()->handle();
+    if (nativeHandle.isValid() && qstrcmp(nativeHandle.typeName(), "sqlite3*") == 0) {
+        sqlite3 *handle = *static_cast<sqlite3 **>(nativeHandle.data());
+        if (handle)
+            sqlite3_create_function(handle, "zealScore", 2, SQLITE_UTF8, 0, scoreFunc, 0, 0);
+    }
+
+    return db;
 }
 
 void Docset::loadMetadata()
@@ -619,3 +671,189 @@ QString Docset::parseSymbolType(const QString &str)
 
     return aliases.value(str, str);
 }
+
+// ported from DevDocs' searcher.coffee:
+// (https://github.com/Thibaut/devdocs/blob/50f583246d5fbd92be7b71a50bfa56cf4e239c14/assets/javascripts/app/searcher.coffee#L91)
+//
+// Greedy left-to-right subsequence match of needle within haystack.
+// The caller must set *start to -1 beforehand. On success *start is the index
+// of the first matched character and *len spans from it through the last
+// matched character; on failure *start is -1. needle_len, when non-null,
+// receives the needle length (written on success only).
+static void match_fuzzy(
+    const char* needle, const char* haystack,
+    int* start, int* len, int* needle_len
+) {
+    int i = 0, j = 0;
+    *len = 0; // keep the output defined even for an empty needle
+    for (; needle[i] != 0; ++i) {
+        // Skip haystack characters until the current needle character.
+        while (haystack[j] != 0 && haystack[j] != needle[i]) ++j;
+        if (haystack[j] == 0) {
+            *start = -1; // end of haystack, char not found
+            return;
+        }
+        if (*start == -1) *start = j; // first matched char
+        ++j; // consume the matched char
+        *len = j - *start;
+    }
+    if (needle_len)
+        *needle_len = i;
+}
+
+// Returns the larger of a and b.
+static int max(int a, int b) {
+    if (a > b) return a;
+    else return b;
+}
+
+// Scores a contiguous occurrence of the query in value, where match_index and
+// match_len locate the occurrence and '.' is the separator character.
+// Returns 0 to reject the match, otherwise a score of at least 1.
+static int score_exact(int match_index, int match_len, const char* value) {
+    int score = 100, value_len = strlen((const char*)value);
+    // Remove one point for each unmatched character.
+    score -= (value_len - match_len);
+    if (match_index > 0) {
+        if (value[match_index - 1] == '.') {
+            // If the character preceding the query is a dot, assign the same
+            // score as if the query was found at the beginning of the string,
+            // minus one.
+            score += (match_index - 1);
+        } else if (match_len == 1) {
+            // Don't match a single-character query unless it's found at the
+            // beginning of the string or is preceded by a dot.
+            return 0;
+        } else {
+            // (1) Remove one point for each unmatched character up to
+            //     the nearest preceding dot or the beginning of the
+            //     string.
+            // (2) Remove one point for each unmatched character
+            //     following the query.
+            int i = match_index - 2;
+            while (i >= 0 && value[i] != '.') --i;
+            score -= (match_index - i) + // (1)
+                     (value_len - match_len - match_index); // (2)
+        }
+        // Remove one point for each dot preceding the query, except for the
+        // one immediately before the query.
+        int separators = 0,
+            i = match_index - 2;
+        while (i >= 0) {
+            if (value[i] == '.') {
+                separators += 1;
+            }
+            i--;
+        }
+        score -= separators;
+    }
+
+    // Remove five points for each dot following the query.
+    int separators = 0;
+    int i = value_len - match_len - match_index - 1;
+    while (i >= 0){
+        if (value[match_index + match_len + i] == '.') {
+            separators += 1;
+        }
+        i--;
+    }
+    score -= separators * 5;
+
+    return max(1, score);
+}
+
+// Scores a non-contiguous (fuzzy) match spanning match_len characters from
+// match_index: best when it starts the string or follows a '.', next best
+// when it runs to the end of the string, lowest otherwise.
+static int score_fuzzy(int match_index, int match_len, const char* value) {
+    if (match_index == 0 || value[match_index-1] == '.') {
+        return max(66, 100 - match_len);
+    } else {
+        // value[match_index + match_len] is the character right after the match;
+        // a terminator there means the match reaches the end of the string.
+        if (value[match_index + match_len] == 0) {
+            return max(33, 67 - match_len);
+        } else {
+            return max(1, 34 - match_len);
+        }
+    }
+}
+
+// Replaces every occurrence of from with to, modifying str in place.
+static void replace_all(std::string& str, const std::string& from, const std::string& to) {
+    if (from.empty()) return; // an empty pattern would never advance
+    size_t start_pos = 0;
+    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
+        str.replace(start_pos, from.length(), to);
+        start_pos += to.length();
+    }
+}
+
+// Rewrites language-specific separators to '.', the only separator the
+// scoring functions understand.
+static void normalize_separators(std::string& str) {
+    replace_all(str, "::", "."); // C++
+    replace_all(str, "/", "."); // Go
+    replace_all(str, "_", ".");
+}
+
+// SQLite scalar function zealScore(query, name): yields 0 when name does not
+// match query, otherwise a positive relevance score (higher is better).
+static void scoreFunc(
+    sqlite3_context *context,
+    int argc,
+    sqlite3_value **argv
+) {
+    // sqlite3_value_text() returns NULL for SQL NULL values, and std::string
+    // must not be constructed from a null pointer; treat that as "no match".
+    const char* raw_needle = argc == 2 ? (const char*)sqlite3_value_text(argv[0]) : 0;
+    const char* raw_haystack = argc == 2 ? (const char*)sqlite3_value_text(argv[1]) : 0;
+    if (!raw_needle || !raw_haystack) {
+        sqlite3_result_int(context, 0);
+        return;
+    }
+
+    // Normalise both sides so a query such as "std::string" still lines up
+    // with the normalised name.
+    std::string needle_str(raw_needle);
+    std::string haystack_str(raw_haystack);
+    normalize_separators(needle_str);
+    normalize_separators(haystack_str);
+    const char* needle = needle_str.c_str();
+    const char* haystack = haystack_str.c_str();
+
+    int match1 = -1, match1_len, needle_len;
+    match_fuzzy(needle, haystack, &match1, &match1_len, &needle_len);
+    if (match1 == -1) {
+        sqlite3_result_int(context, 0);
+        return;
+    }
+
+    if (needle_len == match1_len) { // exact match
+        sqlite3_result_int(context, score_exact(
+            match1, match1_len, haystack
+        ));
+        return;
+    }
+
+    // Fuzzy match: also try the segment after the last '.' and keep the
+    // better of the two scores.
+    int best = score_fuzzy(match1, match1_len, haystack);
+    int last_index_of_dot = -1, i;
+    for (i = 0; haystack[i] != 0; ++i) {
+        if (haystack[i] == '.') last_index_of_dot = i;
+    }
+    if (last_index_of_dot != -1) {
+        int match2 = -1, match2_len;
+        match_fuzzy(
+            needle, haystack + last_index_of_dot + 1, &match2, &match2_len,
+            0
+        );
+        if (match2 != -1) {
+            best = max(best,
+                score_fuzzy(match2, match2_len, haystack + last_index_of_dot + 1)
+            );
+        }
+    }
+    sqlite3_result_int(context, best);
+}
diff --git a/src/libs/registry/searchresult.h b/src/libs/registry/searchresult.h
index 5ece8073b..899ac2cb6 100644
--- a/src/libs/registry/searchresult.h
+++ b/src/libs/registry/searchresult.h
@@ -41,9 +41,13 @@ struct SearchResult
 
     QUrl url;
 
+    // Relevance of this result for the query; higher sorts first.
+    int score;
+
     inline bool operator<(const SearchResult &other) const
     {
-        return QString::compare(name, other.name, Qt::CaseInsensitive) < 0;
+        // Best score first; ties fall back to case-insensitive name order.
+        if (score == other.score) return QString::compare(name, other.name, Qt::CaseInsensitive) < 0;
+        return score > other.score;
     }
 };
 