From 19ea6670fe5f10f77b098f45728dd8667582e217 Mon Sep 17 00:00:00 2001 From: Jerzy Kozera Date: Fri, 7 Oct 2016 17:04:49 +0200 Subject: [PATCH] Better sorting of search results (#613, #100) Uses an O(m+n) algorithm based on https://github.com/bevacqua/fuzzysearch - should be faster than the one initially proposed in PR #281. --- src/libs/registry/docset.cpp | 196 +++++++++++++++++++++++++++++-- src/libs/registry/searchresult.h | 5 +- src/libs/util/sqlitedatabase.cpp | 35 +++++- src/libs/util/sqlitedatabase.h | 3 + 4 files changed, 226 insertions(+), 13 deletions(-) diff --git a/src/libs/registry/docset.cpp b/src/libs/registry/docset.cpp index 281e4ee25..f84ea7fee 100644 --- a/src/libs/registry/docset.cpp +++ b/src/libs/registry/docset.cpp @@ -36,6 +36,8 @@ #include #include +static int scoreFunc(const char*, const char*); + using namespace Zeal::Registry; namespace { @@ -121,6 +123,7 @@ Docset::Docset(const QString &path) : return; } + m_db->createScoreFunc("zealScore", scoreFunc); m_type = m_db->tables().contains(QStringLiteral("searchIndex")) ? Type::Dash : Type::ZDash; createIndex(); @@ -252,12 +255,12 @@ QList Docset::search(const QString &query, const CancellationToken QString queryStr; if (m_type == Docset::Type::Dash) { - queryStr = QStringLiteral("SELECT name, type, path " + queryStr = QStringLiteral("SELECT name, type, path, '', zealScore('%1', name) as score " " FROM searchIndex " - "WHERE (name LIKE '%%1%' ESCAPE '\\') " - "ORDER BY name COLLATE NOCASE").arg(sanitizedQuery); + "WHERE score > 0 " + "ORDER BY score DESC").arg(sanitizedQuery); } else { - queryStr = QStringLiteral("SELECT ztokenname, ztypename, zpath, zanchor " + queryStr = QStringLiteral("SELECT ztokenname, ztypename, zpath, zanchor, zealScore('%1', ztokenname) as score " " FROM ztoken " "LEFT JOIN ztokenmetainformation " " ON ztoken.zmetainformation = ztokenmetainformation.z_pk " @@ -265,8 +268,8 @@ QList Docset::search(const QString &query, const CancellationToken " ON ztokenmetainformation.zfile = zfilepath.z_pk " "LEFT JOIN ztokentype " " ON ztoken.ztokentype = ztokentype.z_pk " - "WHERE (ztokenname LIKE '%%1%' ESCAPE '\\') " - "ORDER BY ztokenname COLLATE NOCASE").arg(sanitizedQuery); + "WHERE score > 0 " + "ORDER BY score DESC").arg(sanitizedQuery); } // Limit for very short queries. @@ -281,7 +284,8 @@ QList Docset::search(const QString &query, const CancellationToken results.append({m_db->value(0).toString(), parseSymbolType(m_db->value(1).toString()), const_cast(this), - createPageUrl(m_db->value(2).toString(), m_db->value(3).toString())}); + createPageUrl(m_db->value(2).toString(), m_db->value(3).toString()), + m_db->value(4).toInt()}); } return results; @@ -320,7 +324,8 @@ QList Docset::relatedLinks(const QUrl &url) const results.append({m_db->value(0).toString(), parseSymbolType(m_db->value(1).toString()), const_cast(this), - createPageUrl(m_db->value(2).toString(), m_db->value(3).toString())}); + createPageUrl(m_db->value(2).toString(), m_db->value(3).toString()), + 0}); } if (results.size() == 1) @@ -607,3 +612,178 @@ QString Docset::parseSymbolType(const QString &str) return aliases.value(str, str); } + +// ported from DevDocs' searcher.coffee: +// (https://github.com/Thibaut/devdocs/blob/50f583246d5fbd92be7b71a50bfa56cf4e239c14/assets/javascripts/app/searcher.coffee#L91) +static void matchFuzzy( + int nLen, const char *needle, int hLen, const char *haystack, int *start, int *len +) { + int j = 0, groups = 0; + for (int i = 0; i < nLen; ++i) { + bool found = false, first = true; + int distance = 0; + while (j < hLen) { + bool match = needle[i] == haystack[j++]; + if (match) { + if (*start == -1) *start = j - 1; // first matched char + *len = j - *start; + found = true; + break; // continue the outer loop + } else { + // optimizations to reduce returned number of results + // (search was returning too many irrelevant results with large docsets) + if (first) { + groups += 1; + if (groups > 3) // optimization #1: too many mismatches + break; + first = false; + } + if (i != 0) { + distance += 1; + if (distance > 8) { // optimization #2: too large distance between found chars + break; + } + } + } + } + + if (!found) { + // end of haystack, char not found + *start = -1; + return; + } + } +} + +static int scoreExact(int matchIndex, int matchLen, const char* value, int valueLen) +{ + int score = 100; + const char DOT = '.'; + // Remove one point for each unmatched character. + score -= (valueLen - matchLen); + if (matchIndex > 0) { + if (value[matchIndex - 1] == DOT) { + // If the character preceding the query is a dot, assign the same + // score as if the query was found at the beginning of the string, + // minus one. + score += (matchIndex - 1); + } else if (matchLen == 1) { + // Don't match a single-character query unless it's found at the + // beginning of the string or is preceded by a dot. + return 0; + } else { + // (1) Remove one point for each unmatched character up to + // the nearest preceding dot or the beginning of the + // string. + // (2) Remove one point for each unmatched character + // following the query. + int i = matchIndex - 2; + while (i >= 0 && value[i] != DOT) --i; + score -= (matchIndex - i) + // (1) + (valueLen - matchLen - matchIndex); // (2) + } + + // Remove one point for each dot preceding the query, except for the + // one immediately before the query. + int separators = 0, + i = matchIndex - 2; + + while (i >= 0) { + if (value[i] == DOT) + ++separators; + --i; + } + + score -= separators; + } + + // Remove five points for each dot following the query. + int separators = 0; + int i = valueLen - matchLen - matchIndex - 1; + while (i >= 0) { + if (value[matchIndex + matchLen + i] == DOT) { + ++separators; + } + --i; + } + + score -= separators * 5; + + return qMax(1, score); +} + +static int scoreFuzzy(int matchIndex, int matchLen, const char *value) +{ + if (matchIndex == 0 || value[matchIndex - 1] == '.') { + return qMax(66, 100 - matchLen); + } else { + if (value[matchLen] == 0) { + return qMax(33, 67 - matchLen); + } else { + return qMax(1, 34 - matchLen); + } + } +} + +static int scoreFunc(const char *needleOrig, const char* haystackOrig) +{ + int haystackLen = 0, needleLen = 0; + while (haystackOrig[++haystackLen] != 0); + while (needleOrig[++needleLen] != 0); + char *needle = new char[needleLen + 1], *haystack = new char[haystackLen + 1]; + for (int i = 0; i < needleLen + 1; ++i) { + char c = needleOrig[i]; + if (c >= 'A' && c <= 'Z') + c += 32; + needle[i] = c; + } + for (int i = 0, j = 0; i < haystackLen + 1; ++i) { + char c = haystackOrig[i]; + if ( + (i > 0 && haystackOrig[i - 1] == ':' && c == ':') // C++ (::) + || c == '/' // Go + || c == '_' + || c == ' ' // some Guides + ) { + haystack[j++] = '.'; + } else { + if (c >= 'A' && c <= 'Z') + c += 32; + haystack[j++] = c; + } + } + + int best = 0, match1 = -1, match1Len; + + matchFuzzy(needleLen, needle, haystackLen, haystack, &match1, &match1Len); + + if (match1 == -1) { // no match + // simply return best=0 below + } else if (needleLen == match1Len) { // exact match + best = scoreExact(match1, match1Len, haystack, haystackLen); + } else { + best = scoreFuzzy(match1, match1Len, haystack); + + int indexOfLastDot = -1; + for (int i = 0; haystack[i] != 0; ++i) { + if (haystack[i] == '.') indexOfLastDot = i; + } + + if (indexOfLastDot != -1) { + int match2 = -1, match2Len; + matchFuzzy( + needleLen, needle, haystackLen - (indexOfLastDot + 1), haystack + indexOfLastDot + 1, + &match2, &match2Len + ); + if (match2 != -1) { + best = qMax( + best, scoreFuzzy(match2, match2Len, haystack + indexOfLastDot + 1) + ); + } + } + } + + delete[] needle; + delete[] haystack; + return best; +} diff --git a/src/libs/registry/searchresult.h b/src/libs/registry/searchresult.h index 5ece8073b..899ac2cb6 100644 --- a/src/libs/registry/searchresult.h +++ b/src/libs/registry/searchresult.h @@ -41,9 +41,12 @@ struct SearchResult QUrl url; + int score; + inline bool operator<(const SearchResult &other) const { - return QString::compare(name, other.name, Qt::CaseInsensitive) < 0; + if (score == other.score) return QString::compare(name, other.name, Qt::CaseInsensitive) < 0; + return score > other.score; } }; diff --git a/src/libs/util/sqlitedatabase.cpp b/src/libs/util/sqlitedatabase.cpp index 36c2b22f5..ace68d720 100644 --- a/src/libs/util/sqlitedatabase.cpp +++ b/src/libs/util/sqlitedatabase.cpp @@ -79,9 +79,14 @@ bool SQLiteDatabase::execute(const QString &queryStr) m_lastError.clear(); + sqlite3_mutex_enter(sqlite3_db_mutex(m_db)); const void *pzTail = nullptr; - if (sqlite3_prepare16_v2(m_db, queryStr.constData(), (queryStr.size() + 1) * sizeof(QChar), - &m_stmt, &pzTail) != SQLITE_OK) { + const int res = sqlite3_prepare16_v2( + m_db, queryStr.constData(), (queryStr.size() + 1) * sizeof(QChar), &m_stmt, &pzTail + ); + sqlite3_mutex_leave(sqlite3_db_mutex(m_db)); + + if (res != SQLITE_OK) { // "Unable to execute statement" updateLastError(); finalize(); @@ -101,7 +106,11 @@ bool SQLiteDatabase::next() if (m_stmt == nullptr) return false; - switch(sqlite3_step(m_stmt)) { + sqlite3_mutex_enter(sqlite3_db_mutex(m_db)); + const int res = sqlite3_step(m_stmt); + sqlite3_mutex_leave(sqlite3_db_mutex(m_db)); + + switch(res) { case SQLITE_ROW: return true; case SQLITE_DONE: @@ -116,6 +125,20 @@ bool SQLiteDatabase::next() return false; } +static void scoreFuncWrapper(sqlite3_context *context, int /* argc */, sqlite3_value **argv) { + const char* needle = (const char*)sqlite3_value_text(argv[0]); + const char* haystack = (const char*)sqlite3_value_text(argv[1]); + + auto f = reinterpret_cast(sqlite3_user_data(context)); + sqlite3_result_int(context, (*f)(needle, haystack)); +} + +void SQLiteDatabase::createScoreFunc(const QString& name, SQLiteDatabase::scoreFunc f) { + sqlite3_create_function( + m_db, name.toUtf8(), 2, SQLITE_UTF8, (void*)f, scoreFuncWrapper, nullptr, nullptr + ); +} + QVariant SQLiteDatabase::value(int index) const { Q_ASSERT(index >= 0); @@ -124,7 +147,11 @@ QVariant SQLiteDatabase::value(int index) const if (index >= sqlite3_data_count(m_stmt)) return QVariant(); - switch (sqlite3_column_type(m_stmt, index)) { + sqlite3_mutex_enter(sqlite3_db_mutex(m_db)); + const int type = sqlite3_column_type(m_stmt, index); + sqlite3_mutex_leave(sqlite3_db_mutex(m_db)); + + switch (type) { case SQLITE_INTEGER: return sqlite3_column_int64(m_stmt, index); case SQLITE_NULL: diff --git a/src/libs/util/sqlitedatabase.h b/src/libs/util/sqlitedatabase.h index 6429d33d3..776ca0e21 100644 --- a/src/libs/util/sqlitedatabase.h +++ b/src/libs/util/sqlitedatabase.h @@ -44,6 +44,9 @@ class SQLiteDatabase bool execute(const QString &queryStr); bool next(); + typedef int (*scoreFunc)(const char* needle, const char* haystack); + void createScoreFunc(const QString& funcName, scoreFunc f); + QVariant value(int index) const; QString lastError() const;