Skip to content

Commit

Permalink
Better sorting of search results (zealdocs#613, zealdocs#100)
Browse files Browse the repository at this point in the history
Uses an O(m+n) algorithm based on https://github.com/bevacqua/fuzzysearch
- should be faster than the one initially proposed in PR zealdocs#281.
  • Loading branch information
jkozera committed Oct 21, 2016
1 parent 6cb63ee commit 19ea667
Show file tree
Hide file tree
Showing 4 changed files with 226 additions and 13 deletions.
196 changes: 188 additions & 8 deletions src/libs/registry/docset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
#include <QRegularExpression>
#include <QVariant>

static int scoreFunc(const char*, const char*);

using namespace Zeal::Registry;

namespace {
Expand Down Expand Up @@ -121,6 +123,7 @@ Docset::Docset(const QString &path) :
return;
}

m_db->createScoreFunc("zealScore", scoreFunc);
m_type = m_db->tables().contains(QStringLiteral("searchIndex")) ? Type::Dash : Type::ZDash;

createIndex();
Expand Down Expand Up @@ -252,21 +255,21 @@ QList<SearchResult> Docset::search(const QString &query, const CancellationToken

QString queryStr;
if (m_type == Docset::Type::Dash) {
queryStr = QStringLiteral("SELECT name, type, path "
queryStr = QStringLiteral("SELECT name, type, path, '', zealScore('%1', name) as score "
" FROM searchIndex "
"WHERE (name LIKE '%%1%' ESCAPE '\\') "
"ORDER BY name COLLATE NOCASE").arg(sanitizedQuery);
"WHERE score > 0 "
"ORDER BY score DESC").arg(sanitizedQuery);
} else {
queryStr = QStringLiteral("SELECT ztokenname, ztypename, zpath, zanchor "
queryStr = QStringLiteral("SELECT ztokenname, ztypename, zpath, zanchor, zealScore('%1', ztokenname) as score "
" FROM ztoken "
"LEFT JOIN ztokenmetainformation "
" ON ztoken.zmetainformation = ztokenmetainformation.z_pk "
"LEFT JOIN zfilepath "
" ON ztokenmetainformation.zfile = zfilepath.z_pk "
"LEFT JOIN ztokentype "
" ON ztoken.ztokentype = ztokentype.z_pk "
"WHERE (ztokenname LIKE '%%1%' ESCAPE '\\') "
"ORDER BY ztokenname COLLATE NOCASE").arg(sanitizedQuery);
"WHERE score > 0 "
"ORDER BY score DESC").arg(sanitizedQuery);
}

// Limit for very short queries.
Expand All @@ -281,7 +284,8 @@ QList<SearchResult> Docset::search(const QString &query, const CancellationToken
results.append({m_db->value(0).toString(),
parseSymbolType(m_db->value(1).toString()),
const_cast<Docset *>(this),
createPageUrl(m_db->value(2).toString(), m_db->value(3).toString())});
createPageUrl(m_db->value(2).toString(), m_db->value(3).toString()),
m_db->value(4).toInt()});
}

return results;
Expand Down Expand Up @@ -320,7 +324,8 @@ QList<SearchResult> Docset::relatedLinks(const QUrl &url) const
results.append({m_db->value(0).toString(),
parseSymbolType(m_db->value(1).toString()),
const_cast<Docset *>(this),
createPageUrl(m_db->value(2).toString(), m_db->value(3).toString())});
createPageUrl(m_db->value(2).toString(), m_db->value(3).toString()),
0});
}

if (results.size() == 1)
Expand Down Expand Up @@ -607,3 +612,178 @@ QString Docset::parseSymbolType(const QString &str)

return aliases.value(str, str);
}

// ported from DevDocs' searcher.coffee:
// (https://github.com/Thibaut/devdocs/blob/50f583246d5fbd92be7b71a50bfa56cf4e239c14/assets/javascripts/app/searcher.coffee#L91)
//
// Greedy, single-pass subsequence match of `needle` inside `haystack`.
//
// nLen/needle   - query characters and their count.
// hLen/haystack - candidate characters and their count.
// start         - in/out: must be -1 on entry; receives the index of the first
//                 matched haystack character, or -1 when there is no match.
// len           - out: span from the first to the last matched character.
//                 Only written when at least one character matched; its value
//                 is meaningless when *start ends up as -1.
//
// Two cut-offs keep the number of weak matches down:
//   * more than 3 needle characters that needed haystack characters to be
//     skipped before they were found (this includes the leading skip), and
//   * more than 8 skipped characters in front of any needle character other
//     than the first one.
static void matchFuzzy(
    int nLen, const char *needle, int hLen, const char *haystack, int *start, int *len
) {
    int pos = 0;           // next haystack index to inspect; never rewinds
    int skippedGroups = 0; // needle chars that required skipping so far

    for (int n = 0; n < nLen; ++n) {
        const char wanted = needle[n];
        bool matched = false;
        bool gapOpen = false; // already counted a skip for this needle char?
        int gap = 0;          // skipped chars in front of this needle char

        while (pos < hLen) {
            const char current = haystack[pos];
            ++pos;

            if (current == wanted) {
                if (*start == -1)
                    *start = pos - 1; // first matched char
                *len = pos - *start;
                matched = true;
                break; // go on with the next needle char
            }

            // Cut-off #1: too many separate runs of skipped characters.
            if (!gapOpen) {
                ++skippedGroups;
                if (skippedGroups > 3)
                    break;
                gapOpen = true;
            }

            // Cut-off #2: matched characters are too far apart. The skip in
            // front of the very first needle char is not limited.
            if (n != 0) {
                ++gap;
                if (gap > 8)
                    break;
            }
        }

        if (!matched) {
            // Haystack exhausted or a cut-off hit: report "no match".
            *start = -1;
            return;
        }
    }
}

// Scores a contiguous match of matchLen characters found at value[matchIndex].
//
// value/valueLen - normalized candidate string ('.' is the separator) and its
//                  length.
// Returns 0 when a single-character query sits in the middle of a segment
// (such hits are rejected), otherwise a score of at least 1. Higher is better.
static int scoreExact(int matchIndex, int matchLen, const char* value, int valueLen)
{
    const char kSeparator = '.';
    // Number of characters that follow the matched span.
    const int tailLen = valueLen - matchLen - matchIndex;

    // Start from 100 and pay one point per unmatched character.
    int result = 100 - (valueLen - matchLen);

    if (matchIndex > 0) {
        if (value[matchIndex - 1] == kSeparator) {
            // Directly after a separator: treat it like a match at the very
            // beginning of the string, minus one point.
            result += matchIndex - 1;
        } else if (matchLen == 1) {
            // A one-character query only counts at the start of the string
            // or right after a separator.
            return 0;
        } else {
            // Walk back to the closest separator in front of the match (or to
            // just before the string start).
            int segmentStart = matchIndex - 2;
            while (segmentStart >= 0 && value[segmentStart] != kSeparator)
                --segmentStart;

            // Pay for the unmatched characters back to that point plus the
            // unmatched characters behind the match.
            result -= (matchIndex - segmentStart) + tailLen;
        }

        // One point per separator in front of the match, not counting the one
        // immediately before it.
        int dotsBefore = 0;
        for (int k = 0; k <= matchIndex - 2; ++k) {
            if (value[k] == kSeparator)
                ++dotsBefore;
        }
        result -= dotsBefore;
    }

    // Five points per separator behind the match.
    int dotsAfter = 0;
    const char *tail = value + matchIndex + matchLen;
    for (int k = 0; k < tailLen; ++k) {
        if (tail[k] == kSeparator)
            ++dotsAfter;
    }
    result -= dotsAfter * 5;

    return qMax(1, result);
}

// Scores a non-contiguous (fuzzy) match that spans matchLen characters starting
// at value[matchIndex]. `value` must be NUL-terminated.
//
// The result falls into one of three tiers; within a tier a shorter span
// scores higher:
//   66..100 - the match starts the string or directly follows a '.'
//   33..66  - the match ends exactly at the end of the string
//    1..33  - the match lies somewhere in the middle
static int scoreFuzzy(int matchIndex, int matchLen, const char *value)
{
    int floor;
    int ceiling;

    if (matchIndex == 0 || value[matchIndex - 1] == '.') {
        floor = 66;
        ceiling = 100;
    } else if (value[matchIndex + matchLen] == 0) {
        // The end of the match is at matchIndex + matchLen. Testing
        // value[matchLen] instead can never hit the terminator in this branch
        // (matchIndex > 0), which made the middle tier unreachable.
        floor = 33;
        ceiling = 67;
    } else {
        floor = 1;
        ceiling = 34;
    }

    const int score = ceiling - matchLen;
    return score < floor ? floor : score;
}

// Scores how well the query `needleOrig` matches the symbol name
// `haystackOrig`. Both are NUL-terminated byte strings.
//
// Both strings are compared with ASCII letters lower-cased. In the haystack
// '/', '_', ' ' and the second colon of "::" are turned into '.', so that '.'
// acts as the only separator for the scoring helpers.
//
// Returns 0 when there is no match (including NULL or empty input), otherwise
// a positive score where higher means a better match.
static int scoreFunc(const char *needleOrig, const char* haystackOrig)
{
    if (needleOrig == nullptr || haystackOrig == nullptr)
        return 0;

    // Count from index 0 so that an empty string yields length 0 and nothing
    // past the terminator is read.
    int needleLen = 0;
    while (needleOrig[needleLen] != 0)
        ++needleLen;
    int haystackLen = 0;
    while (haystackOrig[haystackLen] != 0)
        ++haystackLen;

    // Nothing to search for, or nothing to search in.
    if (needleLen == 0 || haystackLen == 0)
        return 0;

    // One allocation holds both normalized copies (each with its terminator),
    // so a failing second allocation cannot leak the first one.
    char *buffer = new char[needleLen + haystackLen + 2];
    char *needle = buffer;
    char *haystack = buffer + needleLen + 1;

    // "<=" copies the terminator as well.
    for (int i = 0; i <= needleLen; ++i) {
        char c = needleOrig[i];
        if (c >= 'A' && c <= 'Z')
            c += 32;
        needle[i] = c;
    }

    for (int i = 0; i <= haystackLen; ++i) {
        char c = haystackOrig[i];
        if (
            (i > 0 && haystackOrig[i - 1] == ':' && c == ':') // C++ (::)
            || c == '/' // Go
            || c == '_'
            || c == ' ' // some Guides
        ) {
            c = '.';
        } else if (c >= 'A' && c <= 'Z') {
            c += 32;
        }
        haystack[i] = c;
    }

    int best = 0, match1 = -1, match1Len = 0;

    matchFuzzy(needleLen, needle, haystackLen, haystack, &match1, &match1Len);

    if (match1 == -1) { // no match
        // simply return best=0 below
    } else if (needleLen == match1Len) { // exact match
        best = scoreExact(match1, match1Len, haystack, haystackLen);
    } else {
        best = scoreFuzzy(match1, match1Len, haystack);

        // Try again inside the last segment only; a hit there may land in a
        // better scoring tier than the hit in the full string.
        int indexOfLastDot = -1;
        for (int i = 0; haystack[i] != 0; ++i) {
            if (haystack[i] == '.') indexOfLastDot = i;
        }

        if (indexOfLastDot != -1) {
            const char *lastSegment = haystack + indexOfLastDot + 1;
            int match2 = -1, match2Len = 0;
            matchFuzzy(
                needleLen, needle, haystackLen - (indexOfLastDot + 1), lastSegment,
                &match2, &match2Len
            );
            if (match2 != -1) {
                best = qMax(best, scoreFuzzy(match2, match2Len, lastSegment));
            }
        }
    }

    delete[] buffer;
    return best;
}
5 changes: 4 additions & 1 deletion src/libs/registry/searchresult.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ struct SearchResult

QUrl url;

int score;

// Orders results by descending score; results with equal scores are ordered
// by case-insensitive name. The former unconditional name-only return is
// dropped: placed first, it made the score comparison unreachable.
inline bool operator<(const SearchResult &other) const
{
    if (score != other.score)
        return score > other.score;
    return QString::compare(name, other.name, Qt::CaseInsensitive) < 0;
}
};

Expand Down
35 changes: 31 additions & 4 deletions src/libs/util/sqlitedatabase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,14 @@ bool SQLiteDatabase::execute(const QString &queryStr)

m_lastError.clear();

sqlite3_mutex_enter(sqlite3_db_mutex(m_db));
const void *pzTail = nullptr;
if (sqlite3_prepare16_v2(m_db, queryStr.constData(), (queryStr.size() + 1) * sizeof(QChar),
&m_stmt, &pzTail) != SQLITE_OK) {
const int res = sqlite3_prepare16_v2(
m_db, queryStr.constData(), (queryStr.size() + 1) * sizeof(QChar), &m_stmt, &pzTail
);
sqlite3_mutex_leave(sqlite3_db_mutex(m_db));

if (res != SQLITE_OK) {
// "Unable to execute statement"
updateLastError();
finalize();
Expand All @@ -101,7 +106,11 @@ bool SQLiteDatabase::next()
if (m_stmt == nullptr)
return false;

switch(sqlite3_step(m_stmt)) {
sqlite3_mutex_enter(sqlite3_db_mutex(m_db));
const int res = sqlite3_step(m_stmt);
sqlite3_mutex_leave(sqlite3_db_mutex(m_db));

switch(res) {
case SQLITE_ROW:
return true;
case SQLITE_DONE:
Expand All @@ -116,6 +125,20 @@ bool SQLiteDatabase::next()
return false;
}

// SQLite scalar-function trampoline: fetches both arguments as text, forwards
// them to the scoreFunc stored as the function's user data and returns its
// result as an SQL integer.
static void scoreFuncWrapper(sqlite3_context *context, int /* argc */, sqlite3_value **argv) {
    const char *needle = reinterpret_cast<const char *>(sqlite3_value_text(argv[0]));
    const char *haystack = reinterpret_cast<const char *>(sqlite3_value_text(argv[1]));

    auto f = reinterpret_cast<SQLiteDatabase::scoreFunc>(sqlite3_user_data(context));

    // sqlite3_value_text() yields NULL for SQL NULL values (and on allocation
    // failure). Report a score of 0 instead of handing a null pointer to the
    // callback.
    if (f == nullptr || needle == nullptr || haystack == nullptr) {
        sqlite3_result_int(context, 0);
        return;
    }

    sqlite3_result_int(context, (*f)(needle, haystack));
}

void SQLiteDatabase::createScoreFunc(const QString& name, SQLiteDatabase::scoreFunc f) {
sqlite3_create_function(
m_db, name.toUtf8(), 2, SQLITE_UTF8, (void*)f, scoreFuncWrapper, nullptr, nullptr
);
}

QVariant SQLiteDatabase::value(int index) const
{
Q_ASSERT(index >= 0);
Expand All @@ -124,7 +147,11 @@ QVariant SQLiteDatabase::value(int index) const
if (index >= sqlite3_data_count(m_stmt))
return QVariant();

switch (sqlite3_column_type(m_stmt, index)) {
sqlite3_mutex_enter(sqlite3_db_mutex(m_db));
const int type = sqlite3_column_type(m_stmt, index);
sqlite3_mutex_leave(sqlite3_db_mutex(m_db));

switch (type) {
case SQLITE_INTEGER:
return sqlite3_column_int64(m_stmt, index);
case SQLITE_NULL:
Expand Down
3 changes: 3 additions & 0 deletions src/libs/util/sqlitedatabase.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ class SQLiteDatabase
bool execute(const QString &queryStr);
bool next();

typedef int (*scoreFunc)(const char* needle, const char* haystack);
void createScoreFunc(const QString& funcName, scoreFunc f);

QVariant value(int index) const;

QString lastError() const;
Expand Down

0 comments on commit 19ea667

Please sign in to comment.