Skip to content

Commit

Permalink
Merge pull request #838 from kiwix/language_handling_during_search
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr authored Nov 1, 2022
2 parents 8cc1c47 + d1b8519 commit a52138e
Show file tree
Hide file tree
Showing 13 changed files with 199 additions and 92 deletions.
2 changes: 1 addition & 1 deletion src/book.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ bool Book::update(const kiwix::Book& other)
void Book::update(const zim::Archive& archive) {
m_path = archive.getFilename();
m_pathValid = true;
m_id = getArchiveId(archive);
m_id = std::string(archive.getUuid());
m_title = getArchiveTitle(archive);
m_description = getMetaDescription(archive);
m_language = getMetaLanguage(archive);
Expand Down
154 changes: 87 additions & 67 deletions src/server/internalServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ extern "C" {
#include "request_context.h"
#include "response.h"

#define MAX_SEARCH_LEN 140
#define DEFAULT_CACHE_SIZE 2

namespace kiwix {
Expand Down Expand Up @@ -212,6 +211,16 @@ void checkBookNumber(const Library::BookIdSet& bookIds, size_t limit) {
}
}

typedef std::set<std::string> Languages;

// Builds the set of distinct values returned by Book::getLanguage() for the
// books identified by `bookIds`, each book being looked up in `lib` through
// Library::getBookById().
Languages getLanguages(const Library& lib, const Library::BookIdSet& bookIds) {
  Languages distinctLanguages;
  for ( const auto& bookId : bookIds ) {
    const auto& book = lib.getBookById(bookId);
    distinctLanguages.insert(book.getLanguage());
  }
  return distinctLanguages;
}

struct CustomizedResourceData
{
std::string mimeType;
Expand Down Expand Up @@ -307,6 +316,10 @@ SearchInfo InternalServer::getSearchInfo(const RequestContext& request) const
{
auto bookIds = selectBooks(request);
checkBookNumber(bookIds.second, m_multizimSearchLimit);
if ( getLanguages(*mp_library, bookIds.second).size() != 1 ) {
throw Error(nonParameterizedMessage("confusion-of-tongues"));
}

auto pattern = request.get_optional_param<std::string>("pattern", "");
GeoQuery geoQuery;

Expand Down Expand Up @@ -813,86 +826,93 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
}

try {
auto searchInfo = getSearchInfo(request);
auto bookIds = searchInfo.getBookIds();
return handle_search_request(request);
} catch (const Error& e) {
return HTTP400Response(*this, request)
+ invalidUrlMsg
+ e.message();
}
}

/* Make the search */
// Try to get a search from the searchInfo, else build it
auto searcher = mp_library->getSearcherByIds(bookIds);
auto lock(searcher->getLock());
namespace
{

std::shared_ptr<zim::Search> search;
try {
search = searchCache.getOrPut(searchInfo,
[=](){
return make_shared<zim::Search>(searcher->search(searchInfo.getZimQuery(m_verbose.load())));
}
);
} catch(std::runtime_error& e) {
// Searcher->search will throw a runtime error if there is no valid xapian database to do the search.
// (in case of zim file not containing a index)
const auto cssUrl = renderUrl(m_root, RESOURCE::templates::url_of_search_results_css);
HTTPErrorResponse response(*this, request, MHD_HTTP_NOT_FOUND,
"fulltext-search-unavailable",
"404-page-heading",
cssUrl);
response += nonParameterizedMessage("no-search-results");
// XXX: Now this has to be handled by the iframe-based viewer which
// XXX: has to resolve if the book selection resulted in a single book.
/*
if(bookIds.size() == 1) {
auto bookId = *bookIds.begin();
auto bookName = mp_nameMapper->getNameForId(bookId);
response += TaskbarInfo(bookName, mp_library->getArchiveById(bookId).get());
}
*/
return response;
}
unsigned getSearchPageSize(const RequestContext& r)
{
const auto DEFAULT_PAGE_LEN = 25u;
const auto MAX_PAGE_LEN = 140u;

auto start = 1;
try {
start = request.get_argument<unsigned int>("start");
} catch (const std::exception&) {}
start = max(1, start);
const auto pageLength = r.get_optional_param("pageLength", DEFAULT_PAGE_LEN);
return pageLength == 0
? DEFAULT_PAGE_LEN
: min(MAX_PAGE_LEN, pageLength);
}

auto pageLength = 25;
try {
pageLength = request.get_argument<unsigned int>("pageLength");
} catch (const std::exception&) {}
if (pageLength > MAX_SEARCH_LEN) {
pageLength = MAX_SEARCH_LEN;
}
if (pageLength == 0) {
pageLength = 25;
}
} // unnamed namespace

/* Get the results */
SearchRenderer renderer(search->getResults(start-1, pageLength), mp_nameMapper, mp_library, start,
search->getEstimatedMatches());
renderer.setSearchPattern(searchInfo.pattern);
renderer.setSearchBookQuery(searchInfo.bookFilterQuery);
renderer.setProtocolPrefix(m_root + "/content/");
renderer.setSearchProtocolPrefix(m_root + "/search");
renderer.setPageLength(pageLength);
if (request.get_requested_format() == "xml") {
return ContentResponse::build(*this, renderer.getXml(), "application/rss+xml; charset=utf-8");
}
auto response = ContentResponse::build(*this, renderer.getHtml(), "text/html; charset=utf-8");
std::unique_ptr<Response> InternalServer::handle_search_request(const RequestContext& request)
{
auto searchInfo = getSearchInfo(request);
auto bookIds = searchInfo.getBookIds();

/* Make the search */
// Try to get a search from the searchInfo, else build it
auto searcher = mp_library->getSearcherByIds(bookIds);
auto lock(searcher->getLock());

std::shared_ptr<zim::Search> search;
try {
search = searchCache.getOrPut(searchInfo,
[=](){
return make_shared<zim::Search>(searcher->search(searchInfo.getZimQuery(m_verbose.load())));
}
);
} catch(std::runtime_error& e) {
// Searcher->search will throw a runtime error if there is no valid xapian database to do the search.
// (in case of zim file not containing a index)
const auto cssUrl = renderUrl(m_root, RESOURCE::templates::url_of_search_results_css);
HTTPErrorResponse response(*this, request, MHD_HTTP_NOT_FOUND,
"fulltext-search-unavailable",
"404-page-heading",
cssUrl);
response += nonParameterizedMessage("no-search-results");
// XXX: Now this has to be handled by the iframe-based viewer which
// XXX: has to resolve if the book selection resulted in a single book.
/*
if(bookIds.size() == 1) {
auto bookId = *bookIds.begin();
auto bookName = mp_nameMapper->getNameForId(bookId);
response->set_taskbar(bookName, mp_library->getArchiveById(bookId).get());
response += TaskbarInfo(bookName, mp_library->getArchiveById(bookId).get());
}
*/
return std::move(response);
} catch (const Error& e) {
return HTTP400Response(*this, request)
+ invalidUrlMsg
+ e.message();
return response;
}

const auto start = max(1u, request.get_optional_param("start", 1u));
const auto pageLength = getSearchPageSize(request);

/* Get the results */
SearchRenderer renderer(search->getResults(start-1, pageLength), mp_nameMapper, mp_library, start,
search->getEstimatedMatches());
renderer.setSearchPattern(searchInfo.pattern);
renderer.setSearchBookQuery(searchInfo.bookFilterQuery);
renderer.setProtocolPrefix(m_root + "/content/");
renderer.setSearchProtocolPrefix(m_root + "/search");
renderer.setPageLength(pageLength);
if (request.get_requested_format() == "xml") {
return ContentResponse::build(*this, renderer.getXml(), "application/rss+xml; charset=utf-8");
}
auto response = ContentResponse::build(*this, renderer.getHtml(), "text/html; charset=utf-8");
// XXX: Now this has to be handled by the iframe-based viewer which
// XXX: has to resolve if the book selection resulted in a single book.
/*
if(bookIds.size() == 1) {
auto bookId = *bookIds.begin();
auto bookName = mp_nameMapper->getNameForId(bookId);
response->set_taskbar(bookName, mp_library->getArchiveById(bookId).get());
}
*/
return std::move(response);
}

std::unique_ptr<Response> InternalServer::handle_random(const RequestContext& request)
Expand Down
1 change: 1 addition & 0 deletions src/server/internalServer.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ class InternalServer {
std::unique_ptr<Response> handle_catalog_v2_languages(const RequestContext& request);
std::unique_ptr<Response> handle_catalog_v2_illustration(const RequestContext& request);
std::unique_ptr<Response> handle_search(const RequestContext& request);
std::unique_ptr<Response> handle_search_request(const RequestContext& request);
std::unique_ptr<Response> handle_suggest(const RequestContext& request);
std::unique_ptr<Response> handle_random(const RequestContext& request);
std::unique_ptr<Response> handle_catch(const RequestContext& request);
Expand Down
8 changes: 8 additions & 0 deletions src/server/request_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,14 @@ MHD_Result RequestContext::fill_argument(void *__this, enum MHD_ValueKind kind,
{
// Recover the RequestContext that was handed over as an opaque pointer.
RequestContext *_this = static_cast<RequestContext*>(__this);
// Record the value under its key; a null value is stored as an empty string.
_this->arguments[key].push_back(value == nullptr ? "" : value);
// Additionally append this argument to queryString, in the order in which
// this function is invoked, separating successive arguments with '&'.
if ( ! _this->queryString.empty() ) {
_this->queryString += "&";
}
_this->queryString += key;
// A null value produces a bare "key" (no '='); otherwise "key=value".
if ( value ) {
_this->queryString += "=";
_this->queryString += value;
}
// NOTE(review): key and value are appended exactly as received, with no
// encoding step in this function - confirm that this is what consumers of
// queryString expect.
return MHD_YES;
}

Expand Down
5 changes: 2 additions & 3 deletions src/server/request_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,7 @@ class RequestContext {
std::string get_url_part(int part) const;
std::string get_full_url() const;

std::string get_query(bool mustEncode = false) const {
return get_query([](const std::string& key) {return true;}, mustEncode);
}
// Returns a copy of the queryString member.
std::string get_query() const { return queryString; }

template<class F>
std::string get_query(F filter, bool mustEncode) const {
Expand Down Expand Up @@ -132,6 +130,7 @@ class RequestContext {
ByteRange byteRange_;
std::map<std::string, std::string> headers;
std::map<std::string, std::vector<std::string>> arguments;
std::string queryString;

private: // functions
static MHD_Result fill_header(void *, enum MHD_ValueKind, const char*, const char*);
Expand Down
4 changes: 0 additions & 4 deletions src/tools/archiveTools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,6 @@ std::string getMetaFlavour(const zim::Archive& archive) {
return getMetadata(archive, "Flavour");
}

// Returns the archive's UUID (as given by zim::Archive::getUuid()) converted
// to a std::string. A named cast is used instead of a C-style cast so that
// the conversion is explicit and easy to search for.
std::string getArchiveId(const zim::Archive& archive) {
  return static_cast<std::string>(archive.getUuid());
}

bool getArchiveFavicon(const zim::Archive& archive, unsigned size,
std::string& content, std::string& mimeType){
try {
Expand Down
1 change: 0 additions & 1 deletion src/tools/archiveTools.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ namespace kiwix
std::string getMetaCreator(const zim::Archive& archive);
std::string getMetaPublisher(const zim::Archive& archive);
std::string getMetaFlavour(const zim::Archive& archive);
std::string getArchiveId(const zim::Archive& archive);

bool getArchiveFavicon(const zim::Archive& archive, unsigned size,
std::string& content, std::string& mimeType);
Expand Down
1 change: 1 addition & 0 deletions static/i18n/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@
, "home-button-text": "Go to the main page of '{{BOOK_TITLE}}'"
, "random-page-button-text": "Go to a randomly selected page"
, "searchbox-tooltip": "Search '{{BOOK_TITLE}}'"
, "confusion-of-tongues": "Two or more books in different languages would participate in search, which may lead to confusing results."
}
4 changes: 4 additions & 0 deletions test/data/lib_for_server_search_test.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<library version="20110515">
<book id="5dc0b3af-5df2-0925-f0ca-d2bf75e78af6" path="example.zim" title="Wikibooks" description="testZim" language="eng" creator="test" publisher="test" tags="_ftindex:yes;_ftindex:yes;_pictures:yes;_videos:yes;_details:yes" date="2021-04-17" mediaCount="22" size="253" />
<book id="6f1d19d0-633f-087b-fb55-7ac324ff9baf" path="zimfile.zim" title="Ray Charles" description="Wikipedia articles about Ray Charles" language="eng" creator="Wikipedia" publisher="Kiwix" name="wikipedia_en_ray_charles" flavour="_mini" tags="wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes" date="2020-03-31" articleCount="129" mediaCount="45" size="555" />
</library>
8 changes: 4 additions & 4 deletions test/library_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ TEST_F(LibraryServerTest, catalog_search_results_pagination)
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
OPDS_FEED_TAG
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Filtered zims (count=1&amp;start=1)</title>\n"
" <title>Filtered zims (start=1&amp;count=1)</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>3</totalResults>\n"
" <startIndex>1</startIndex>\n"
Expand All @@ -375,7 +375,7 @@ TEST_F(LibraryServerTest, catalog_search_results_pagination)
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
OPDS_FEED_TAG
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Filtered zims (count=10&amp;start=100)</title>\n"
" <title>Filtered zims (start=100&amp;count=10)</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>3</totalResults>\n"
" <startIndex>100</startIndex>\n"
Expand Down Expand Up @@ -638,8 +638,8 @@ TEST_F(LibraryServerTest, catalog_v2_entries_filtered_by_range)
const auto r = zfs1_->GET("/ROOT/catalog/v2/entries?start=1&count=1");
EXPECT_EQ(r->status, 200);
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
CATALOG_V2_ENTRIES_PREAMBLE("?count=1&start=1")
" <title>Filtered Entries (count=1&amp;start=1)</title>\n"
CATALOG_V2_ENTRIES_PREAMBLE("?start=1&count=1")
" <title>Filtered Entries (start=1&amp;count=1)</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>3</totalResults>\n"
" <startIndex>1</startIndex>\n"
Expand Down
1 change: 1 addition & 0 deletions test/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ if gtest_dep.found() and not meson.is_cross_build()
'corner_cases.zim',
'poor.zim',
'library.xml',
'lib_for_server_search_test.xml',
'customized_resources.txt',
'helloworld.txt',
'welcome.html',
Expand Down
10 changes: 5 additions & 5 deletions test/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -839,7 +839,7 @@ TEST_F(ServerTest, Http400HtmlError)
expected_body==R"(
<h1>Invalid request</h1>
<p>
The requested URL "/ROOT/search?books.filter.lang=eng&pattern=" is not a valid request.
The requested URL "/ROOT/search?books.filter.lang=eng&pattern" is not a valid request.
</p>
<p>
No query provided.
Expand Down Expand Up @@ -896,21 +896,21 @@ TEST_F(ServerTest, HttpXmlError)
/* HTTP status code */ 400,
/* expected response XML */ R"(
<error>Invalid request</error>
<detail>The requested URL "/ROOT/search?content=zimfile&format=xml" is not a valid request.</detail>
<detail>The requested URL "/ROOT/search?format=xml&content=zimfile" is not a valid request.</detail>
<detail>No query provided.</detail>
)" },
{ /* url */ "/ROOT/search?format=xml&content=non-existing-book&pattern=asdfqwerty",
/* HTTP status code */ 400,
/* expected response XML */ R"(
<error>Invalid request</error>
<detail>The requested URL "/ROOT/search?content=non-existing-book&format=xml&pattern=asdfqwerty" is not a valid request.</detail>
<detail>The requested URL "/ROOT/search?format=xml&content=non-existing-book&pattern=asdfqwerty" is not a valid request.</detail>
<detail>No such book: non-existing-book</detail>
)" },
{ /* url */ "/ROOT/search?format=xml&content=non-existing-book&pattern=a\"<script foo>",
/* HTTP status code */ 400,
/* expected response XML */ R"(
<error>Invalid request</error>
<detail>The requested URL "/ROOT/search?content=non-existing-book&format=xml&pattern=a"&lt;script foo&gt;" is not a valid request.</detail>
<detail>The requested URL "/ROOT/search?format=xml&content=non-existing-book&pattern=a"&lt;script foo&gt;" is not a valid request.</detail>
<detail>No such book: non-existing-book</detail>
)" },
// There is a flaw in our way to handle query string, we cannot differentiate
Expand All @@ -919,7 +919,7 @@ TEST_F(ServerTest, HttpXmlError)
/* HTTP status code */ 400,
/* expected response XML */ R"(
<error>Invalid request</error>
<detail>The requested URL "/ROOT/search?books.filter.lang=eng&format=xml&pattern=" is not a valid request.</detail>
<detail>The requested URL "/ROOT/search?format=xml&books.filter.lang=eng&pattern" is not a valid request.</detail>
<detail>No query provided.</detail>
)" },
{ /* url */ "/ROOT/search?format=xml&pattern=foo",
Expand Down
Loading

0 comments on commit a52138e

Please sign in to comment.