From fcba46252387f9442bd7624de647503e1d3bc30e Mon Sep 17 00:00:00 2001
From: Veloman Yunkan <veloman.yunkan@gmail.com>
Date: Sun, 14 Aug 2022 16:28:02 +0400
Subject: [PATCH 1/3] Enter Grouping<ObjectId, GroupId>

The new class Grouping<ObjectId, GroupId> implements the current
straightforward approach for sorting the dirents in cluster order, but
opens a way for switching to a better solution.

A user-observable effect of this change is lower memory usage after
`FileImpl::prepareArticleListByCluster()` has completed (at the cost of a
higher peak memory usage while that function is running).
---
 src/fileimpl.cpp | 63 ++++++++++++++++++++++++++++++++++++++++++------
 src/fileimpl.h   |  3 +--
 2 files changed, 56 insertions(+), 10 deletions(-)
diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp
index 1593ba634..f2a2223fa 100644
--- a/src/fileimpl.cpp
+++ b/src/fileimpl.cpp
@@ -80,6 +80,53 @@ makeFileReader(std::shared_ptr<const FileCompound> zimFile, offset_t offset, zsi
   }
 }
 
+// Consider a set of integer-numbered objects with their object-ids spanning a
+// contiguous range [a, b).
+// Each object is also labelled with an integer group id. The group-ids too
+// form a contiguous (or dense enough) set.
+// The Grouping class re-arranges a stream of such objects that is fed
+// to it in the object-id order, returning a table of object-ids in the group-id
+// order (where the order of the objects within the same group is preserved).
+//
+template<class ObjectId, class GroupId>
+class Grouping
+{
+public: // types
+  typedef std::vector<ObjectId> GroupedObjectIds;
+
+public: // functions
+  explicit Grouping(ObjectId objectIdBegin, ObjectId objectIdEnd)
+    : firstObjectId_(objectIdBegin)
+  {
+    groupAndObjectIds_.reserve(objectIdEnd - objectIdBegin);
+  }
+
+  void add(ObjectId objectId, GroupId groupId)
+  {
+    assert(objectId == firstObjectId_ + groupAndObjectIds_.size());
+    groupAndObjectIds_.push_back({groupId, objectId});
+  }
+
+  GroupedObjectIds getGroupedObjectIds()
+  {
+    std::sort(groupAndObjectIds_.begin(), groupAndObjectIds_.end());
+    GroupedObjectIds result;
+    result.reserve(groupAndObjectIds_.size());
+    for ( const auto groupAndObjectId : groupAndObjectIds_ ) {
+      result.push_back(groupAndObjectId.second);
+    }
+    GroupAndObjectIds().swap(groupAndObjectIds_);
+    return result;
+  }
+
+private: // types
+  typedef std::vector<std::pair<GroupId, ObjectId>> GroupAndObjectIds;
+
+private: // data
+  const ObjectId firstObjectId_;
+  GroupAndObjectIds groupAndObjectIds_;
+};
+
 } //unnamed namespace
 
   //////////////////////////////////////////////////////////////////////
@@ -348,24 +395,24 @@ makeFileReader(std::shared_ptr<const FileCompound> zimFile, offset_t offset, zsi
 
   void FileImpl::prepareArticleListByCluster() const
   {
-    m_articleListByCluster.reserve(getUserEntryCount().v);
-
-    auto endIdx = getEndUserEntry().v;
-    for(auto i = getStartUserEntry().v; i < endIdx; i++)
+    const auto endIdx = getEndUserEntry().v;
+    const auto startIdx = getStartUserEntry().v;
+    Grouping<entry_index_type, cluster_index_type> g(startIdx, endIdx);
+    for(auto i = startIdx; i < endIdx; i++)
     {
       // This is the offset of the dirent in the zimFile
       auto indexOffset = mp_urlDirentAccessor->getOffset(entry_index_t(i));
       // Get the mimeType of the dirent (offset 0) to know the type of the dirent
       uint16_t mimeType = zimReader->read_uint<uint16_t>(indexOffset);
       if (mimeType==Dirent::redirectMimeType || mimeType==Dirent::linktargetMimeType || mimeType == Dirent::deletedMimeType) {
-        m_articleListByCluster.push_back(std::make_pair(0, i));
+        g.add(i, 0);
       } else {
         // If it is a classic article, get the clusterNumber (at offset 8)
         auto clusterNumber = zimReader->read_uint<zim::cluster_index_type>(indexOffset+offset_t(8));
-        m_articleListByCluster.push_back(std::make_pair(clusterNumber, i));
+        g.add(i, clusterNumber);
       }
     }
-    std::sort(m_articleListByCluster.begin(), m_articleListByCluster.end());
+    m_articleListByCluster = g.getGroupedObjectIds();
   }
 
   entry_index_t FileImpl::getIndexByClusterOrder(entry_index_t idx) const
@@ -380,7 +427,7 @@ makeFileReader(std::shared_ptr<const FileCompound> zimFile, offset_t offset, zsi
     }
     if (idx.v >= m_articleListByCluster.size())
       throw std::out_of_range("entry index out of range");
-    return entry_index_t(m_articleListByCluster[idx.v].second);
+    return entry_index_t(m_articleListByCluster[idx.v]);
   }
 
   FileImpl::ClusterHandle FileImpl::readCluster(cluster_index_t idx)
diff --git a/src/fileimpl.h b/src/fileimpl.h
index 4e1f7d78a..cc85c46d6 100644
--- a/src/fileimpl.h
+++ b/src/fileimpl.h
@@ -68,8 +68,7 @@ namespace zim
       typedef std::vector<std::string> MimeTypes;
       MimeTypes mimeTypes;
 
-      using pair_type = std::pair<cluster_index_type, entry_index_type>;
-      mutable std::vector<pair_type> m_articleListByCluster;
+      mutable std::vector<entry_index_type> m_articleListByCluster;
       mutable std::mutex m_articleListByClusterMutex;
 
       struct DirentLookupConfig

From f9b9f84234e46f593aea6b26b788c947a4d42d37 Mon Sep 17 00:00:00 2001
From: Veloman Yunkan <veloman.yunkan@gmail.com>
Date: Sun, 14 Aug 2022 18:05:18 +0400
Subject: [PATCH 2/3] Optimized Grouping<ObjectId, GroupId>

Got rid of O(NlogN) sorting in FileImpl::prepareArticleListByCluster().
Now its time complexity is O(N). Also, its high-watermark memory usage was
lowered roughly to the level before Grouping was introduced.
---
 src/fileimpl.cpp | 50 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp
index f2a2223fa..95501c0f8 100644
--- a/src/fileimpl.cpp
+++ b/src/fileimpl.cpp
@@ -30,6 +30,7 @@
 #include <errno.h>
 #include <cstring>
 #include <fstream>
+#include <numeric>
 #include "config.h"
 #include "log.h"
 #include "envvalue.h"
@@ -97,34 +98,63 @@ class Grouping
 public: // functions
   explicit Grouping(ObjectId objectIdBegin, ObjectId objectIdEnd)
     : firstObjectId_(objectIdBegin)
+    , minGroupId_(std::numeric_limits<GroupId>::max())
+    , maxGroupId_(std::numeric_limits<GroupId>::min())
   {
-    groupAndObjectIds_.reserve(objectIdEnd - objectIdBegin);
+    groupIds_.reserve(objectIdEnd - objectIdBegin);
   }
 
   void add(ObjectId objectId, GroupId groupId)
   {
-    assert(objectId == firstObjectId_ + groupAndObjectIds_.size());
-    groupAndObjectIds_.push_back({groupId, objectId});
+    assert(objectId == firstObjectId_ + groupIds_.size());
+    groupIds_.push_back(groupId);
+    minGroupId_ = std::min(minGroupId_, groupId);
+    maxGroupId_ = std::max(maxGroupId_, groupId);
   }
 
   GroupedObjectIds getGroupedObjectIds()
   {
-    std::sort(groupAndObjectIds_.begin(), groupAndObjectIds_.end());
     GroupedObjectIds result;
-    result.reserve(groupAndObjectIds_.size());
-    for ( const auto groupAndObjectId : groupAndObjectIds_ ) {
-      result.push_back(groupAndObjectId.second);
+    if ( !groupIds_.empty() ) {
+      // nextObjectSeat[g - minGroupId_] tells where the next object
+      // with group-id g must be placed (seated) in the result
+      std::vector<size_t> nextObjectSeat = getGroupBoundaries();
+
+      result.resize(groupIds_.size());
+      for ( size_t i = 0; i < groupIds_.size(); ++i ) {
+        const GroupId g = groupIds_[i];
+        // This statement has an important side-effect  vv
+        const auto pos = nextObjectSeat[g - minGroupId_]++;
+        result[pos] = firstObjectId_ + i;
+      }
+      GroupIds().swap(groupIds_);
     }
-    GroupAndObjectIds().swap(groupAndObjectIds_);
     return result;
   }
 
+private: // functions
+  std::vector<size_t> getGroupBoundaries() const
+  {
+    std::vector<size_t> groupIdCounts(maxGroupId_ - minGroupId_ + 1, 0);
+    for ( const auto groupId : groupIds_ ) {
+      ++groupIdCounts[groupId - minGroupId_];
+    }
+
+    std::vector<size_t> groupBoundaries(1, 0);
+    std::partial_sum(groupIdCounts.begin(), groupIdCounts.end(),
+                     std::back_inserter(groupBoundaries)
+    );
+    return groupBoundaries;
+  }
+
 private: // types
-  typedef std::vector<std::pair<GroupId, ObjectId>> GroupAndObjectIds;
+  typedef std::vector<GroupId> GroupIds;
 
 private: // data
   const ObjectId firstObjectId_;
-  GroupAndObjectIds groupAndObjectIds_;
+  GroupIds groupIds_;
+  GroupId minGroupId_;
+  GroupId maxGroupId_;
 };
 
 } //unnamed namespace

From 5e2d46bbee70885379887b5795d58d29dfb762df Mon Sep 17 00:00:00 2001
From: Veloman Yunkan <veloman.yunkan@gmail.com>
Date: Sun, 14 Aug 2022 17:56:22 +0400
Subject: [PATCH 3/3] Removed the ObjectId param from Grouping::add()

---
 src/fileimpl.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp
index 95501c0f8..f43a039e0 100644
--- a/src/fileimpl.cpp
+++ b/src/fileimpl.cpp
@@ -104,9 +104,10 @@ class Grouping
     groupIds_.reserve(objectIdEnd - objectIdBegin);
   }
 
-  void add(ObjectId objectId, GroupId groupId)
+  // i'th call of add() is assumed to refer to the object
+  // with id (firstObjectId_+i)
+  void add(GroupId groupId)
   {
-    assert(objectId == firstObjectId_ + groupIds_.size());
     groupIds_.push_back(groupId);
     minGroupId_ = std::min(minGroupId_, groupId);
     maxGroupId_ = std::max(maxGroupId_, groupId);
@@ -435,11 +436,11 @@ class Grouping
       // Get the mimeType of the dirent (offset 0) to know the type of the dirent
       uint16_t mimeType = zimReader->read_uint<uint16_t>(indexOffset);
       if (mimeType==Dirent::redirectMimeType || mimeType==Dirent::linktargetMimeType || mimeType == Dirent::deletedMimeType) {
-        g.add(i, 0);
+        g.add(0);
       } else {
         // If it is a classic article, get the clusterNumber (at offset 8)
         auto clusterNumber = zimReader->read_uint<zim::cluster_index_type>(indexOffset+offset_t(8));
-        g.add(i, clusterNumber);
+        g.add(clusterNumber);
       }
     }
     m_articleListByCluster = g.getGroupedObjectIds();