Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fuzzy search #281

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 149 additions & 0 deletions src/core/lcs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#include "lcs.h"

using namespace Zeal::Core;

// Builds an empty LCS object: both input strings and the stored subsequence
// are default-constructed, i.e. empty.
LCS::LCS() = default;

// Stores copies of a and b and computes their longest common subsequence,
// which is afterwards available through subsequence() and length().
LCS::LCS(const QString &a, const QString &b)
    : m_a(a)
    , m_b(b)
{
    // With an empty input there is nothing in common; m_subsequence simply
    // stays empty.
    if (!m_a.isEmpty() && !m_b.isEmpty()) {
        // Dynamic programming: build the table of LCS lengths for every pair
        // of prefixes of a and b, then walk it backwards to recover the
        // subsequence itself.
        int **lengths = createLengthMatrix();
        fillLengthMatrix(lengths);

        // The bottom-right cell holds the LCS length of the full strings.
        const int lastRow = m_a.length();
        const int lastCol = m_b.length();
        if (lengths[lastRow][lastCol] > 0)
            m_subsequence = backtrackLengthMatrix(lengths, lastRow, lastCol);

        freeLengthMatrix(lengths);
    }
}

// Returns the longest common subsequence stored by the constructor. The
// string is empty when the inputs had nothing in common, when either input
// was empty, or when the object was default-constructed.
QString LCS::subsequence() const
{
return m_subsequence;
}

// Number of characters in the stored longest common subsequence
// (0 when there is none).
int LCS::length() const
{
    return m_subsequence.size();
}

// Metric of how much of the longest common subsequence covers the target
// string. The more the subsequence and target have in common the higher
// the desity up until 1 for a perfect match;
double LCS::calcDensity(int arg) const
{
if (m_subsequence.length() == 0)
return 0;
QString target = arg == 0 ? m_a : m_b;

return double(m_subsequence.length()) / target.length();
}

// Metric of how chopped the longest common subsequence is against the target.
// The size of the substring composed from the first until the last lcs match
// divided by the size of the LCS. Equals 1 for a subsequence that is actually
// a substring
double LCS::calcSpread(int arg) const
{
if (m_subsequence.length() == 0)
return 0;
QString target = arg == 0 ? m_a : m_b;

// handle cases like "*S*VGs*tring*" being identified as best subsequence
// for "string"
if (target.indexOf(m_subsequence) != -1)
return 1;

int start = target.indexOf(m_subsequence[0]);
int end = start;
for (int i = start, j = 0; j < m_subsequence.length(); i++) {
if (target[i] == m_subsequence[j]) {
end = i;
j++;
}
}
end++;

return double(m_subsequence.length()) / (end - start);
}

// Returns the indices, in the target string, of the characters that make up
// the longest common subsequence (target is m_a when arg is 0, m_b for any
// other value). The list is in ascending order and is empty when there is no
// common subsequence.
QList<int> LCS::subsequencePositions(int arg) const
{
    const QString &target = (arg == 0) ? m_a : m_b;
    const int lcsLength = m_subsequence.length();

    QList<int> positions;
    positions.reserve(lcsLength);

    // Handle cases like "*S*VGs*tring*": if the subsequence occurs as a
    // contiguous substring, prefer that occurrence over the scattered match
    // the greedy scan below could pick.
    const int substringStart = target.indexOf(m_subsequence);
    if (substringStart != -1) {
        for (int offset = 0; offset < lcsLength; ++offset)
            positions.append(substringStart + offset);
        return positions;
    }

    // Greedy left-to-right match; positions.size() is the index of the next
    // subsequence character to find. The bound on i keeps the scan inside
    // the target even if the subsequence invariant were ever violated.
    for (int i = 0; i < target.length() && positions.size() < lcsLength; ++i) {
        if (target[i] == m_subsequence[positions.size()])
            positions.append(i);
    }

    return positions;
}

// Allocates a zero-filled (m_a.length() + 1) x (m_b.length() + 1) matrix.
//
// All cells live in one contiguous buffer owned by matrix[0]; the remaining
// row pointers point into that buffer. The caller must release the result
// with freeLengthMatrix().
int **LCS::createLengthMatrix() const
{
    const int rows = m_a.length() + 1;
    const int cols = m_b.length() + 1;

    // Compute the cell count in size_t so that long inputs cannot overflow
    // int arithmetic.
    const size_t cellCount = static_cast<size_t>(rows) * static_cast<size_t>(cols);

    int **matrix = new int*[rows];
    // The trailing () value-initialises every cell to 0, so no separate
    // memset pass (and no <cstring> dependency) is needed.
    matrix[0] = new int[cellCount]();
    for (int row = 1; row < rows; ++row)
        matrix[row] = matrix[0] + static_cast<size_t>(row) * cols;

    return matrix;
}

// Exploits optimal substructure to calculate the size of the longest common
// subsequnce. Bottom right cell contains the size of the LCS;
void LCS::fillLengthMatrix(int **matrix) const
{
for (int i = 0; i < m_a.length(); i++) {
for (int j = 0; j < m_b.length(); j++) {
if (m_a[i] == m_b[j])
matrix[i + 1][j + 1] = matrix[i][j] + 1;
else
matrix[i + 1][j + 1] = std::max(matrix[i + 1][j], matrix[i][j + 1]);
}
}
}

// Releases a matrix obtained from createLengthMatrix(): first the single
// contiguous cell buffer that matrix[0] points at, then the table of row
// pointers itself.
void LCS::freeLengthMatrix(int **matrix) const
{
    int *cells = matrix[0];
    delete [] cells;
    delete [] matrix;
}

// Works backwards through a matrix filled by fillLengthMatrix() to actually
// identify the LCS of the first i characters of m_a and the first j
// characters of m_b.
//
// Implemented as a loop rather than recursion so that the stack depth does
// not grow with the length of the inputs. Returns a null QString when i or j
// is 0 or when no character matches on the way back.
QString LCS::backtrackLengthMatrix(int **matrix, int i, int j) const
{
    QString result;

    while (i > 0 && j > 0) {
        if (m_a[i - 1] == m_b[j - 1]) {
            // Matching characters belong to the LCS. The walk goes from the
            // end towards the start, hence prepend.
            result.prepend(m_a[i - 1]);
            --i;
            --j;
        } else if (matrix[i][j - 1] > matrix[i - 1][j]) {
            // Strictly longer LCS to the left: drop a character of m_b.
            --j;
        } else {
            // Otherwise (including ties) drop a character of m_a.
            --i;
        }
    }

    return result;
}
37 changes: 37 additions & 0 deletions src/core/lcs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef LCS_H
#define LCS_H

#include <QString>
#include <QList>

namespace Zeal {
namespace Core {

// Longest common subsequence of two strings, computed once at construction
// time, together with a few metrics describing how the subsequence sits
// inside either input.
//
// Methods taking an `int arg` use it to select the target string: 0 selects
// the first constructor argument (a), any other value selects the second (b).
class LCS
{
public:
// Creates an object with empty inputs and an empty subsequence.
LCS();
// Computes the longest common subsequence of a and b.
LCS(const QString &a, const QString &b);

// The longest common subsequence; empty if there is none.
QString subsequence() const;
// Length of subsequence().
int length() const;

// Indices in the target string of the characters forming the subsequence.
QList<int> subsequencePositions(int arg) const;
// Subsequence length divided by target length; 0 if there is no subsequence.
double calcDensity(int arg) const;
// Subsequence length divided by the length of the target span from the first
// to the last matched character; 1 if the subsequence is a substring of the
// target, 0 if there is no subsequence.
double calcSpread(int arg) const;

private:
// First input string.
QString m_a;
// Second input string.
QString m_b;
// Longest common subsequence of m_a and m_b; set by the two-argument constructor.
QString m_subsequence;

// Allocates a zeroed (m_a.length() + 1) x (m_b.length() + 1) matrix;
// release it with freeLengthMatrix().
int **createLengthMatrix() const;
// Fills the matrix with the LCS lengths of every pair of prefixes.
void fillLengthMatrix(int **matrix) const;
// Frees a matrix returned by createLengthMatrix().
void freeLengthMatrix(int **matrix) const;
// Recovers the subsequence for the prefixes of length i (of m_a) and j (of m_b)
// from a filled matrix.
QString backtrackLengthMatrix(int **matrix, int i, int j) const;
};

} // namespace Core
} // namespace Zeal

#endif // LCS_H
128 changes: 65 additions & 63 deletions src/registry/docset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
****************************************************************************/

#include "docset.h"
#include "docsettoken.h"

#include "searchquery.h"
#include "util/plist.h"
Expand Down Expand Up @@ -237,72 +238,69 @@ QList<SearchResult> Docset::search(const QString &query) const
{
QList<SearchResult> results;

const SearchQuery searchQuery = SearchQuery::fromString(query);
const QString sanitizedQuery = searchQuery.sanitizedQuery();
const SearchQuery searchQuery = SearchQuery::fromString(query.toLower());

if (searchQuery.hasKeywords() && !searchQuery.hasKeywords(m_keywords))
return results;

const DocsetToken queryToken(searchQuery.query());

// Select possible search results
QString queryStr;
if (m_type == Docset::Type::Dash) {
queryStr = QStringLiteral("SELECT name, type, path\n"
"FROM searchIndex\n"
"WHERE name LIKE :queryPatternSubseq ESCAPE '\\'\n"
"ORDER BY\n"
" name LIKE :queryPatternStart ESCAPE '\\' DESC, -- starts with\n"
" name LIKE :queryPatternSubstring ESCAPE '\\' DESC, -- is substring\n"
" LENGTH(name) ASC -- shortest first\n"
"LIMIT 100");
} else if (m_type == Docset::Type::ZDash) {
queryStr = QStringLiteral("SELECT ztokenname, ztypename, zpath, zanchor\n"
"FROM ztoken\n"
"JOIN ztokenmetainformation\n"
" ON ztoken.zmetainformation = ztokenmetainformation.z_pk\n"
"JOIN zfilepath\n"
" ON ztokenmetainformation.zfile = zfilepath.z_pk\n"
"JOIN ztokentype\n"
" ON ztoken.ztokentype = ztokentype.z_pk\n"
"WHERE ztokenname LIKE :queryPatternSubseq ESCAPE '\\'\n"
"ORDER BY\n"
" ztokenname LIKE :queryPatternStart ESCAPE '\\' DESC, -- starts with\n"
" ztokenname LIKE :queryPatternSubstring ESCAPE '\\' DESC, -- is substring\n"
" LENGTH(ztokenname) ASC -- shortest first\n"
"LIMIT 100");
}

bool withSubStrings = false;
// %.%1% for long Django docset values like django.utils.http
// %::%1% for long C++ docset values like std::set
// %/%1% for long Go docset values like archive/tar
QString subNames = QStringLiteral(" OR %1 LIKE '%.%2%' ESCAPE '\\'");
subNames += QLatin1String(" OR %1 LIKE '%::%2%' ESCAPE '\\'");
subNames += QLatin1String(" OR %1 LIKE '%/%2%' ESCAPE '\\'");
while (results.size() < 100) {
QString curQuery = sanitizedQuery;
QString notQuery; // don't return the same result twice
if (withSubStrings) {
// if less than 100 found starting with query, search all substrings
curQuery = QLatin1Char('%') + sanitizedQuery;
// don't return 'starting with' results twice
if (m_type == Docset::Type::Dash)
notQuery = QString(" AND NOT (name LIKE '%1%' ESCAPE '\\' %2) ").arg(sanitizedQuery, subNames.arg("name", sanitizedQuery));
else
notQuery = QString(" AND NOT (ztokenname LIKE '%1%' ESCAPE '\\' %2) ").arg(sanitizedQuery, subNames.arg("ztokenname", sanitizedQuery));
}
if (m_type == Docset::Type::Dash) {
queryStr = QString("SELECT name, type, path "
" FROM searchIndex "
"WHERE (name LIKE '%1%' ESCAPE '\\' %3) %2 "
"ORDER BY name COLLATE NOCASE LIMIT 100")
.arg(curQuery, notQuery, subNames.arg("name", curQuery));
} else {
queryStr = QString("SELECT ztokenname, ztypename, zpath, zanchor "
" FROM ztoken "
"JOIN ztokenmetainformation "
" ON ztoken.zmetainformation = ztokenmetainformation.z_pk "
"JOIN zfilepath "
" ON ztokenmetainformation.zfile = zfilepath.z_pk "
"JOIN ztokentype "
" ON ztoken.ztokentype = ztokentype.z_pk "
"WHERE (ztokenname LIKE '%1%' ESCAPE '\\' %3) %2 "
"ORDER BY ztokenname COLLATE NOCASE LIMIT 100")
.arg(curQuery, notQuery, subNames.arg("ztokenname", curQuery));
}
QSqlQuery sqlQuery(database());
sqlQuery.prepare(queryStr);

const QString queryPatternSubseq = searchQuery.sanitizedQuerySubseq();
const QString queryPattern = searchQuery.sanitizedQuery();
sqlQuery.bindValue(":queryPatternSubseq", queryPatternSubseq);
sqlQuery.bindValue(":queryPatternStart", queryPattern + QChar('%'));
sqlQuery.bindValue(":queryPatternSubstring", QChar('%') + queryPattern + QChar('%'));

sqlQuery.exec();

QSqlQuery query(queryStr, database());
while (query.next()) {
const QString itemName = query.value(0).toString();
QString path = query.value(2).toString();
if (m_type == Docset::Type::ZDash) {
const QString anchor = query.value(3).toString();
if (!anchor.isEmpty())
path += QLatin1Char('#') + anchor;
}

/// TODO: Third should be type
results.append(SearchResult{itemName, QString(),
parseSymbolType(query.value(1).toString()),
const_cast<Docset *>(this), path, sanitizedQuery});
while (sqlQuery.next()) {
DocsetToken token(sqlQuery.value(0).toString());
QString type = parseSymbolType(sqlQuery.value(1).toString());
QString path = sqlQuery.value(2).toString();
if (m_type == Docset::Type::ZDash) {
const QString anchor = sqlQuery.value(3).toString();
if (!anchor.isEmpty())
path += QLatin1Char('#') + anchor;
}

if (withSubStrings)
break;
withSubStrings = true; // try again searching for substrings
// Only actually return relevant ones
SearchRelevancy searchRelevancy = SearchRelevancy::fromQuery(token, queryToken);
if (searchRelevancy.relevancy > 0) {
results.append(SearchResult{token, queryToken, type,
const_cast<Docset *>(this), path,
searchRelevancy});
}
}

return results;
Expand Down Expand Up @@ -338,17 +336,21 @@ QList<SearchResult> Docset::relatedLinks(const QUrl &url) const

QSqlQuery query(queryStr.arg(cleanUrl.toString()), database());

DocsetToken queryToken(QString(""));
while (query.next()) {
const QString sectionName = query.value(0).toString();
DocsetToken token(query.value(0).toString());
QString type = parseSymbolType(query.value(1).toString());
QString sectionPath = query.value(2).toString();
if (m_type == Docset::Type::ZDash) {
sectionPath += QLatin1Char('#');
sectionPath += query.value(3).toString();
const QString anchor = query.value(3).toString();
if (!anchor.isEmpty())
path += QLatin1Char('#') + anchor;
}

results.append(SearchResult{sectionName, QString(),
parseSymbolType(query.value(1).toString()),
const_cast<Docset *>(this), sectionPath, QString()});
SearchRelevancy searchRelevancy = SearchRelevancy::fromQuery(token, queryToken);
results.append(SearchResult{token, queryToken, type,
const_cast<Docset *>(this), sectionPath,
searchRelevancy});
}

if (results.size() == 1)
Expand Down
1 change: 1 addition & 0 deletions src/registry/docset.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class Docset
};

QSqlDatabase database() const;

void loadMetadata();
void countSymbols();
void loadSymbols(const QString &symbolType) const;
Expand Down
1 change: 1 addition & 0 deletions src/registry/docsetregistry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ void DocsetRegistry::_runQuery(const QString &query)

std::sort(m_queryResults.begin(), m_queryResults.end());

m_queryResults = m_queryResults.mid(0, 100);
emit queryCompleted();
}

Expand Down
Loading