Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Xapian-based catalog search #460

Merged
merged 10 commits into from
Mar 17, 2021
29 changes: 23 additions & 6 deletions include/library.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "book.h"
#include "bookmark.h"
#include "common.h"
#include <xapian.h>

#define KIWIX_LIBRARY_VERSION "20110515"

Expand Down Expand Up @@ -58,6 +59,7 @@ class Filter {
std::string _creator;
size_t _maxSize;
std::string _query;
bool _queryIsPartial;
std::string _name;

public:
Expand Down Expand Up @@ -102,10 +104,16 @@ class Filter {
Filter& publisher(std::string publisher);
Filter& creator(std::string creator);
Filter& maxSize(size_t size);
Filter& query(std::string query);
Filter& query(std::string query, bool partial=true);
Filter& name(std::string name);

bool hasQuery() const;
const std::string& getQuery() const { return _query; }
bool queryIsPartial() const { return _queryIsPartial; }

bool accept(const Book& book) const;
bool acceptByQueryOnly(const Book& book) const;
bool acceptByNonQueryCriteria(const Book& book) const;
};


Expand All @@ -117,6 +125,10 @@ class Library
std::map<std::string, kiwix::Book> m_books;
std::map<std::string, std::shared_ptr<Reader>> m_readers;
std::vector<kiwix::Bookmark> m_bookmarks;
Xapian::WritableDatabase m_bookDB;

public:
typedef std::vector<std::string> BookIdCollection;

public:
Library();
Expand Down Expand Up @@ -220,7 +232,7 @@ class Library
*
* @return A list of book ids.
*/
std::vector<std::string> getBooksIds();
BookIdCollection getBooksIds();

/**
* Filter the library and generate a new one with the keep elements.
Expand All @@ -230,7 +242,7 @@ class Library
* @param search List only books with search in the title or description.
* @return The list of bookIds corresponding to the query.
*/
DEPRECATED std::vector<std::string> filter(const std::string& search);
DEPRECATED BookIdCollection filter(const std::string& search);


/**
Expand All @@ -239,7 +251,7 @@ class Library
* @param filter The filter to use.
* @return The list of bookIds corresponding to the filter.
*/
std::vector<std::string> filter(const Filter& filter);
BookIdCollection filter(const Filter& filter);


/**
Expand All @@ -249,7 +261,7 @@ class Library
* @param comparator how to sort the books
* @return The sorted list of books
*/
void sort(std::vector<std::string>& bookIds, supportedListSortBy sortBy, bool ascending);
void sort(BookIdCollection& bookIds, supportedListSortBy sortBy, bool ascending);

/**
* List books in the library.
Expand All @@ -273,7 +285,7 @@ class Library
* Set to 0 to cancel this filter.
* @return The list of bookIds corresponding to the query.
*/
DEPRECATED std::vector<std::string> listBooksIds(
DEPRECATED BookIdCollection listBooksIds(
int supportedListMode = ALL,
supportedListSortBy sortBy = UNSORTED,
const std::string& search = "",
Expand All @@ -285,7 +297,12 @@ class Library

friend class OPDSDumper;
friend class libXMLDumper;

private: // functions
BookIdCollection getBooksByTitleOrDescription(const Filter& filter);
void updateBookDB(const Book& book);
};

}

#endif
5 changes: 3 additions & 2 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pugixml_dep = dependency('pugixml', static:static_deps)
libcurl_dep = dependency('libcurl', static:static_deps)
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
zlib_dep = dependency('zlib', static:static_deps)
xapian_dep = dependency('xapian-core', static:static_deps)

if compiler.has_header('mustache.hpp')
extra_include = []
Expand All @@ -55,7 +56,7 @@ if target_machine.system() == 'windows' and static_deps
extra_cflags += '-DCURL_STATICLIB'
endif

all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep]
veloman-yunkan marked this conversation as resolved.
Show resolved Hide resolved
all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]

inc = include_directories('include', extra_include)

Expand All @@ -74,7 +75,7 @@ subdir('static')
subdir('src')
subdir('test')

pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd']
pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd', 'xapian-core']

pkg_conf = configuration_data()
pkg_conf.set('prefix', get_option('prefix'))
Expand Down
124 changes: 111 additions & 13 deletions src/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,31 @@
#include <pugixml.hpp>
#include <algorithm>
#include <set>
#include <unicode/locid.h>

namespace kiwix
{

namespace
{

std::string iso639_3ToXapian(const std::string& lang) {
return icu::Locale(lang.c_str()).getLanguage();
};

// Normalize `text` before it is indexed: currently this only applies
// removeAccents(). The `language` argument is accepted but not used yet.
std::string normalizeText(const std::string& text, const std::string& language)
{
  (void)language;  // not used by the current normalization
  std::string normalized = removeAccents(text);
  return normalized;
}

} // unnamed namespace

/* Constructor: opens the book index (m_bookDB) with Xapian's in-memory
 * backend flag and an empty path. */
Library::Library()
  : m_bookDB(std::string(), Xapian::DB_BACKEND_INMEMORY)
{
}

/* Destructor */
Library::~Library()
{
Expand All @@ -47,6 +64,7 @@ Library::~Library()
bool Library::addBook(const Book& book)
{
/* Try to find it */
updateBookDB(book);
try {
auto& oldbook = m_books.at(book.getId());
oldbook.update(book);
Expand Down Expand Up @@ -211,9 +229,9 @@ const std::vector<kiwix::Bookmark> Library::getBookmarks(bool onlyValidBookmarks
return validBookmarks;
}

std::vector<std::string> Library::getBooksIds()
Library::BookIdCollection Library::getBooksIds()
{
std::vector<std::string> bookIds;
BookIdCollection bookIds;

for (auto& pair: m_books) {
bookIds.push_back(pair.first);
Expand All @@ -222,7 +240,7 @@ std::vector<std::string> Library::getBooksIds()
return bookIds;
}

std::vector<std::string> Library::filter(const std::string& search)
Library::BookIdCollection Library::filter(const std::string& search)
{
if (search.empty()) {
return getBooksIds();
Expand All @@ -232,18 +250,82 @@ std::vector<std::string> Library::filter(const std::string& search)
}


std::vector<std::string> Library::filter(const Filter& filter)
// Index (or re-index) `book` in the Xapian database m_bookDB so that it can
// be found by the title/description search. One Xapian document is stored
// per book, keyed by the unique term "Q" + book id.
void Library::updateBookDB(const Book& book)
{
// NOTE(review): the next six lines look like residue of the previous
// filter() body as rendered by the diff view (they reference `filter` and
// `bookIds`, which this function never uses) — confirm before relying on
// this listing.
std::vector<std::string> bookIds;
for(auto& pair:m_books) {
auto book = pair.second;
if(filter.accept(book)) {
bookIds.push_back(pair.first);
}
Xapian::Stem stemmer;
Xapian::TermGenerator indexer;
const std::string lang = book.getLanguage();
// Best effort: if building or installing the stemmer for the book language
// throws, the exception is swallowed and the book is indexed with no
// stemmer set on the indexer.
try {
stemmer = Xapian::Stem(iso639_3ToXapian(lang));
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
Xapian::Document doc;
indexer.set_document(doc);

// Title and description are normalized before being stored and indexed.
const std::string title = normalizeText(book.getTitle(), lang);
const std::string desc = normalizeText(book.getDescription(), lang);
// Value slot 0 holds the normalized title, slot 1 the normalized
// description; the document data is the book id, which is what the search
// reads back to build its result list.
doc.add_value(0, title);
doc.add_value(1, desc);
doc.set_data(book.getId());

// Index with the "S" (title) and "XD" (description) prefixes; these are the
// prefixes the query parser maps to the "title" and "description" fields.
indexer.index_text(title, 1, "S");
indexer.index_text(desc, 1, "XD");

// Index fields without prefixes for general search
indexer.index_text(title);
// Advance the term position between the two fields.
// NOTE(review): presumably to keep phrase matches from spanning title and
// description — confirm against the Xapian TermGenerator documentation.
indexer.increase_termpos();
indexer.index_text(desc);

// The "Q"-prefixed id term identifies the document; replace_document() is
// keyed on that term, so the book's entry is addressed by its id.
const std::string idterm = "Q" + book.getId();
doc.add_boolean_term(idterm);
m_bookDB.replace_document(idterm, doc);
}

/**
 * Run the free-text part of `filter` against the book index (m_bookDB).
 *
 * The query is parsed with terms AND-ed by default. The "title:" and
 * "description:" field prefixes are recognised, and phrase, boolean,
 * +/- (love/hate) and wildcard syntax are enabled. Partial (as-you-type)
 * matching of the last word is enabled when filter.queryIsPartial() is true.
 *
 * @param filter The filter whose query criterion is evaluated. All other
 *               criteria of the filter are ignored here.
 * @return The ids of the matching books in the order of the Xapian result
 *         set, or the ids of all books when the filter has no query or its
 *         query text is empty.
 *
 * NOTE(review): Xapian::QueryParser::parse_query() reports malformed queries
 * by throwing; such exceptions are not caught here and propagate to the
 * caller.
 */
Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter)
{
  // An empty query text would be parsed into an empty Xapian query, which
  // matches no document at all. Treat it like an absent query instead, in
  // line with the string-based filter() overload, which returns every book
  // for an empty search text.
  if ( !filter.hasQuery() || filter.getQuery().empty() )
    return getBooksIds();

  Xapian::QueryParser queryParser;
  queryParser.set_default_op(Xapian::Query::OP_AND);
  queryParser.add_prefix("title", "S");
  queryParser.add_prefix("description", "XD");

  // Language assumed for the query is not known for sure so stemming
  // is not applied
  //queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian(???)));
  //queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);

  const auto partialQueryFlag = filter.queryIsPartial()
                              ? Xapian::QueryParser::FLAG_PARTIAL
                              : 0;
  const auto flags = Xapian::QueryParser::FLAG_PHRASE
                   | Xapian::QueryParser::FLAG_BOOLEAN
                   | Xapian::QueryParser::FLAG_LOVEHATE
                   | Xapian::QueryParser::FLAG_WILDCARD
                   | partialQueryFlag;
  const auto query = queryParser.parse_query(filter.getQuery(), flags);

  Xapian::Enquire enquire(m_bookDB);
  enquire.set_query(query);
  // Ask for at most as many results as there are books in the library.
  const auto results = enquire.get_mset(0, m_books.size());

  BookIdCollection bookIds;
  bookIds.reserve(results.size());
  for ( auto it = results.begin(); it != results.end(); ++it ) {
    // The document data is the book id (see updateBookDB()).
    bookIds.push_back(it.get_document().get_data());
  }

  return bookIds;
}

/**
 * Return the ids of the books accepted by `filter`.
 *
 * The query (title/description) criterion is resolved through the book
 * index; the remaining criteria are then checked on each candidate book.
 *
 * @param filter The filter to apply.
 * @return The ids of the accepted books, in the order produced by the
 *         title/description search.
 */
Library::BookIdCollection Library::filter(const Filter& filter)
{
  BookIdCollection result;
  for(const auto& id : getBooksByTitleOrDescription(filter)) {
    // Look the book up with find() rather than operator[]: operator[] would
    // silently insert a default-constructed Book into m_books for any id
    // reported by the index but absent from the library. Such ids are
    // skipped.
    const auto bookIt = m_books.find(id);
    if(bookIt == m_books.end()) {
      continue;
    }
    if(filter.acceptByNonQueryCriteria(bookIt->second)) {
      result.push_back(id);
    }
  }
  return result;
}

template<supportedListSortBy SORT>
struct KEY_TYPE {
typedef std::string TYPE;
Expand Down Expand Up @@ -303,7 +385,7 @@ std::string Comparator<PUBLISHER>::get_key(const std::string& id)
return lib->getBookById(id).getPublisher();
}

void Library::sort(std::vector<std::string>& bookIds, supportedListSortBy sort, bool ascending)
void Library::sort(BookIdCollection& bookIds, supportedListSortBy sort, bool ascending)
{
switch(sort) {
case TITLE:
Expand All @@ -327,7 +409,7 @@ void Library::sort(std::vector<std::string>& bookIds, supportedListSortBy sort,
}


std::vector<std::string> Library::listBooksIds(
Library::BookIdCollection Library::listBooksIds(
int mode,
supportedListSortBy sortBy,
const std::string& search,
Expand Down Expand Up @@ -479,9 +561,10 @@ Filter& Filter::maxSize(size_t maxSize)
return *this;
}

Filter& Filter::query(std::string query)
// Activate the free-text query criterion.
// `query` is the text to search for; `partial` records whether the query
// should be treated as partial (see queryIsPartial()).
// Returns *this so that criteria can be chained.
Filter& Filter::query(std::string query, bool partial)
{
  activeFilters |= QUERY;
  _queryIsPartial = partial;
  _query = query;
  return *this;
}
Expand All @@ -495,7 +578,17 @@ Filter& Filter::name(std::string name)

#define ACTIVE(X) (activeFilters & (X))
#define FILTER(TAG, TEST) if (ACTIVE(TAG) && !(TEST)) { return false; }
bool Filter::hasQuery() const
{
return ACTIVE(QUERY);
}

// A book is accepted when it passes every non-query criterion and then the
// query criterion; the query check is not evaluated if the first one fails.
bool Filter::accept(const Book& book) const
{
  if ( !acceptByNonQueryCriteria(book) ) {
    return false;
  }
  return acceptByQueryOnly(book);
}

bool Filter::acceptByNonQueryCriteria(const Book& book) const
{
auto local = !book.getPath().empty();
FILTER(_LOCAL, local)
Expand Down Expand Up @@ -538,6 +631,11 @@ bool Filter::accept(const Book& book) const
}
}
}
return true;
}

bool Filter::acceptByQueryOnly(const Book& book) const
{
if ( ACTIVE(QUERY)
&& !(matchRegex(book.getTitle(), "\\Q" + _query + "\\E")
|| matchRegex(book.getDescription(), "\\Q" + _query + "\\E")))
Expand Down
10 changes: 5 additions & 5 deletions test/data/library.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,24 @@
id="raycharles_uncategorized"
path="./zimfile.zim"
url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim"
title="Ray Charles"
description="Wikipedia articles about Ray Charles"
title="Ray (uncategorized) Charles"
description="No category is assigned to this library entry."
language="eng"
creator="Wikipedia"
publisher="Kiwix"
date="2020-03-31"
name="wikipedia_en_ray_charles"
tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes"
tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no"
articleCount="284"
mediaCount="2"
size="556"
size="123"
></book>
<book
id="charlesray"
path="./zimfile.zim"
url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim"
title="Charles, Ray"
description="Wikipedia articles about Charles, Ray"
description="Wikipedia articles about Ray Charles"
language="eng"
creator="Wikipedia"
publisher="Kiwix"
Expand Down
Loading