From fcba46252387f9442bd7624de647503e1d3bc30e Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sun, 14 Aug 2022 16:28:02 +0400 Subject: [PATCH 1/3] Enter Grouping The new class Grouping implements the current straightforward approach for sorting the dirents in cluster order, but opens a way for switching to a better solution. A user-observable effect of this change is the lower memory usage after `FileImpl::prepareArticleListByCluster()` has completed (but an increased high-watermark memory usage of that function). --- src/fileimpl.cpp | 63 ++++++++++++++++++++++++++++++++++++++++++------ src/fileimpl.h | 3 +-- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 1593ba634..f2a2223fa 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -80,6 +80,53 @@ makeFileReader(std::shared_ptr zimFile, offset_t offset, zsi } } +// Consider a set of integer-numbered objects with their object-ids spanning a +// contiguous range [a, b). +// Each object is also labelled with an integer group id. The group-ids too +// form a contiguous (or dense enough) set. +// The Grouping class allows to re-arrange the stream of such objects fed +// to it in the object-id order, returning a table of object-ids in the group-id +// order (where the order of the objects within the same group is preserved). 
+// +template +class Grouping +{ +public: // types + typedef std::vector GroupedObjectIds; + +public: // functions + explicit Grouping(ObjectId objectIdBegin, ObjectId objectIdEnd) + : firstObjectId_(objectIdBegin) + { + groupAndObjectIds_.reserve(objectIdEnd - objectIdBegin); + } + + void add(ObjectId objectId, GroupId groupId) + { + assert(objectId == firstObjectId_ + groupAndObjectIds_.size()); + groupAndObjectIds_.push_back({groupId, objectId}); + } + + GroupedObjectIds getGroupedObjectIds() + { + std::sort(groupAndObjectIds_.begin(), groupAndObjectIds_.end()); + GroupedObjectIds result; + result.reserve(groupAndObjectIds_.size()); + for ( const auto groupAndObjectId : groupAndObjectIds_ ) { + result.push_back(groupAndObjectId.second); + } + GroupAndObjectIds().swap(groupAndObjectIds_); + return result; + } + +private: // types + typedef std::vector> GroupAndObjectIds; + +private: // data + const ObjectId firstObjectId_; + GroupAndObjectIds groupAndObjectIds_; +}; + } //unnamed namespace ////////////////////////////////////////////////////////////////////// @@ -348,24 +395,24 @@ makeFileReader(std::shared_ptr zimFile, offset_t offset, zsi void FileImpl::prepareArticleListByCluster() const { - m_articleListByCluster.reserve(getUserEntryCount().v); - - auto endIdx = getEndUserEntry().v; - for(auto i = getStartUserEntry().v; i < endIdx; i++) + const auto endIdx = getEndUserEntry().v; + const auto startIdx = getStartUserEntry().v; + Grouping g(startIdx, endIdx); + for(auto i = startIdx; i < endIdx; i++) { // This is the offset of the dirent in the zimFile auto indexOffset = mp_urlDirentAccessor->getOffset(entry_index_t(i)); // Get the mimeType of the dirent (offset 0) to know the type of the dirent uint16_t mimeType = zimReader->read_uint(indexOffset); if (mimeType==Dirent::redirectMimeType || mimeType==Dirent::linktargetMimeType || mimeType == Dirent::deletedMimeType) { - m_articleListByCluster.push_back(std::make_pair(0, i)); + g.add(i, 0); } else { // If it is a 
classic article, get the clusterNumber (at offset 8) auto clusterNumber = zimReader->read_uint(indexOffset+offset_t(8)); - m_articleListByCluster.push_back(std::make_pair(clusterNumber, i)); + g.add(i, clusterNumber); } } - std::sort(m_articleListByCluster.begin(), m_articleListByCluster.end()); + m_articleListByCluster = g.getGroupedObjectIds(); } entry_index_t FileImpl::getIndexByClusterOrder(entry_index_t idx) const @@ -380,7 +427,7 @@ makeFileReader(std::shared_ptr zimFile, offset_t offset, zsi } if (idx.v >= m_articleListByCluster.size()) throw std::out_of_range("entry index out of range"); - return entry_index_t(m_articleListByCluster[idx.v].second); + return entry_index_t(m_articleListByCluster[idx.v]); } FileImpl::ClusterHandle FileImpl::readCluster(cluster_index_t idx) diff --git a/src/fileimpl.h b/src/fileimpl.h index 4e1f7d78a..cc85c46d6 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -68,8 +68,7 @@ namespace zim typedef std::vector MimeTypes; MimeTypes mimeTypes; - using pair_type = std::pair; - mutable std::vector m_articleListByCluster; + mutable std::vector m_articleListByCluster; mutable std::mutex m_articleListByClusterMutex; struct DirentLookupConfig From f9b9f84234e46f593aea6b26b788c947a4d42d37 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sun, 14 Aug 2022 18:05:18 +0400 Subject: [PATCH 2/3] Optimized Grouping Got rid of O(NlogN) sorting in FileImpl::prepareArticleListByCluster(). Now its time complexity is O(N). Also, its high-watermark memory usage was lowered roughly to the level before Grouping was introduced.
--- src/fileimpl.cpp | 50 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index f2a2223fa..95501c0f8 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include "config.h" #include "log.h" #include "envvalue.h" @@ -97,34 +98,63 @@ class Grouping public: // functions explicit Grouping(ObjectId objectIdBegin, ObjectId objectIdEnd) : firstObjectId_(objectIdBegin) + , minGroupId_(std::numeric_limits::max()) + , maxGroupId_(std::numeric_limits::min()) { - groupAndObjectIds_.reserve(objectIdEnd - objectIdBegin); + groupIds_.reserve(objectIdEnd - objectIdBegin); } void add(ObjectId objectId, GroupId groupId) { - assert(objectId == firstObjectId_ + groupAndObjectIds_.size()); - groupAndObjectIds_.push_back({groupId, objectId}); + assert(objectId == firstObjectId_ + groupIds_.size()); + groupIds_.push_back(groupId); + minGroupId_ = std::min(minGroupId_, groupId); + maxGroupId_ = std::max(maxGroupId_, groupId); } GroupedObjectIds getGroupedObjectIds() { - std::sort(groupAndObjectIds_.begin(), groupAndObjectIds_.end()); GroupedObjectIds result; - result.reserve(groupAndObjectIds_.size()); - for ( const auto groupAndObjectId : groupAndObjectIds_ ) { - result.push_back(groupAndObjectId.second); + if ( !groupIds_.empty() ) { + // nextObjectSeat[g - minGroupId_] tells where the next object + // with group-id g must be placed (seated) in the result + std::vector nextObjectSeat = getGroupBoundaries(); + + result.resize(groupIds_.size()); + for ( size_t i = 0; i < groupIds_.size(); ++i ) { + const GroupId g = groupIds_[i]; + // This statement has an important side-effect vv + const auto pos = nextObjectSeat[g - minGroupId_]++; + result[pos] = firstObjectId_ + i; + } + GroupIds().swap(groupIds_); } - GroupAndObjectIds().swap(groupAndObjectIds_); return result; } +private: // functions + std::vector getGroupBoundaries() const + { 
+ std::vector groupIdCounts(maxGroupId_ - minGroupId_ + 1, 0); + for ( const auto groupId : groupIds_ ) { + ++groupIdCounts[groupId - minGroupId_]; + } + + std::vector groupBoundaries(1, 0); + std::partial_sum(groupIdCounts.begin(), groupIdCounts.end(), + std::back_inserter(groupBoundaries) + ); + return groupBoundaries; + } + private: // types - typedef std::vector> GroupAndObjectIds; + typedef std::vector GroupIds; private: // data const ObjectId firstObjectId_; - GroupAndObjectIds groupAndObjectIds_; + GroupIds groupIds_; + GroupId minGroupId_; + GroupId maxGroupId_; }; } //unnamed namespace From 5e2d46bbee70885379887b5795d58d29dfb762df Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sun, 14 Aug 2022 17:56:22 +0400 Subject: [PATCH 3/3] Removed the ObjectId param from Grouping::add() --- src/fileimpl.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 95501c0f8..f43a039e0 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -104,9 +104,10 @@ class Grouping groupIds_.reserve(objectIdEnd - objectIdBegin); } - void add(ObjectId objectId, GroupId groupId) + // i'th call of add() is assumed to refer to the object + // with id (firstObjectId_+i) + void add(GroupId groupId) { - assert(objectId == firstObjectId_ + groupIds_.size()); groupIds_.push_back(groupId); minGroupId_ = std::min(minGroupId_, groupId); maxGroupId_ = std::max(maxGroupId_, groupId); @@ -435,11 +436,11 @@ class Grouping // Get the mimeType of the dirent (offset 0) to know the type of the dirent uint16_t mimeType = zimReader->read_uint(indexOffset); if (mimeType==Dirent::redirectMimeType || mimeType==Dirent::linktargetMimeType || mimeType == Dirent::deletedMimeType) { - g.add(i, 0); + g.add(0); } else { // If it is a classic article, get the clusterNumber (at offset 8) auto clusterNumber = zimReader->read_uint(indexOffset+offset_t(8)); - g.add(i, clusterNumber); + g.add(clusterNumber); } } m_articleListByCluster = 
g.getGroupedObjectIds();