GEOMESA-3297 Support for tiered date index pre-splits (#2996)

locationtech · Sep 27, 2023 · 7c287c7 · 7c287c7
1 parent b9e2900
commit 7c287c7
Show file tree

Hide file tree

Showing 5 changed files with 118 additions and 33 deletions.
diff --git a/docs/user/datastores/index_config.rst b/docs/user/datastores/index_config.rst
@@ -498,21 +498,23 @@ For the default splitter, ``table.splitter.options`` should consist of comma-sep
 ``key1:value1,key2:value2``. Entries related to a given index should start with the index identifier, e.g. one
 of ``id``, ``z3``, ``z2`` or ``attr`` (``xz3`` and ``xz2`` indices use ``z3`` and ``z2``, respectively).
 
-+------------+-------------------------------+----------------------------------------+
-| Index      | Option                        | Description                            |
-+============+===============================+========================================+
-| Z3/XZ3     | ``z3.min``                    | The minimum date for the data          |
-+            +-------------------------------+----------------------------------------+
-|            | ``z3.max``                    | The maximum date for the data          |
-+            +-------------------------------+----------------------------------------+
-|            | ``z3.bits``                   | The number of leading bits to split on |
-+------------+-------------------------------+----------------------------------------+
-| Z2/XZ2     | ``z2.bits``                   | The number of leading bits to split on |
-+------------+-------------------------------+----------------------------------------+
-| ID         | ``id.pattern``                | Split pattern                          |
-+------------+-------------------------------+----------------------------------------+
-| Attribute  |  ``attr.<attribute>.pattern`` | Split pattern for a given attribute    |
-+------------+-------------------------------+----------------------------------------+
++------------+---------------------------------+----------------------------------------+
+| Index      | Option                          | Description                            |
++============+=================================+========================================+
+| Z3/XZ3     | ``z3.min``                      | The minimum date for the data          |
++            +---------------------------------+----------------------------------------+
+|            | ``z3.max``                      | The maximum date for the data          |
++            +---------------------------------+----------------------------------------+
+|            | ``z3.bits``                     | The number of leading bits to split on |
++------------+---------------------------------+----------------------------------------+
+| Z2/XZ2     | ``z2.bits``                     | The number of leading bits to split on |
++------------+---------------------------------+----------------------------------------+
+| ID         | ``id.pattern``                  | Split pattern                          |
++------------+---------------------------------+----------------------------------------+
+| Attribute  | ``attr.<attribute>.pattern``    | Split pattern for a given attribute    |
++            +---------------------------------+----------------------------------------+
+|            | ``attr.<attribute>.date-range`` | Date range for a given attribute       |
++------------+---------------------------------+----------------------------------------+
 
 Z3/XZ3 Splits
 +++++++++++++
@@ -556,6 +558,16 @@ prefixing will not work correctly. E.g., if the data has numbers in the range 80
 ``[8-9][0-9]`` will not split the data properly. Instead, trailing zeros should be added to reach the appropriate
 length, e.g. ``[8-9][0-9][0][0]``.
 
+For attribute indices with secondary date indexing, each attribute pattern can be further refined with a date
+range, using the suffix ``date-range``. For the example above, you could have any or all of ``attr.name.date-range``,
+``attr.name.date-range2`` and ``attr.name.date-range3``; each applies only to the pattern with the same numeric suffix.
+The date range is expected to take the form ``begin-date/end-date/number of splits``, with dates specified in the
+form ``yyyy-MM-dd``, e.g. ``2020-01-01/2023-01-01/8``. Note that this is mainly useful with low-cardinality
+attributes because, due to the way secondary indexing works, date suffixes are only useful after full keys. Continuing
+the example, ``attr.name.pattern:[a][l][i][c][e],attr.name.date-range:2020-01-01/2023-01-01/8`` would effectively
+partition an index where everyone is named ``alice``, but ``attr.name.pattern:[a-z],attr.name.date-range:2020-01-01/2023-01-01/8``
+would not be effective unless everyone had a single-letter name.
+
 Full Example
 ++++++++++++
 

diff --git a/...dex-api/src/main/scala/org/locationtech/geomesa/index/conf/splitter/DefaultSplitter.scala b/...dex-api/src/main/scala/org/locationtech/geomesa/index/conf/splitter/DefaultSplitter.scala
@@ -65,6 +65,8 @@ object DefaultSplitter {
     val Z3MinDateOption = s"${Z3Index.name}.min"
     val Z3MaxDateOption = s"${Z3Index.name}.max"
 
+    // string form of ByteArrays.ZeroByteArray, used to join an attribute split to its date split
+    private val ZeroByteString = new String(ByteArrays.ZeroByteArray, StandardCharsets.UTF_8)
+
     /**
       * Creates splits suitable for a feature ID index. If nothing is specified, will assume a hex distribution.
       *
@@ -80,7 +82,7 @@ object DefaultSplitter {
     def idSplits(options: Map[String, String]): Seq[String] = {
       val patterns = {
         val configured = DefaultSplitter.patterns(s"${IdIndex.name}.pattern", options)
-        if (configured.hasNext) { configured } else {
+        if (configured.nonEmpty) { configured } else {
           Iterator("[0]", "[4]", "[8]", "[c]") // 4 splits assuming hex layout
         }
       }
@@ -102,19 +104,28 @@ object DefaultSplitter {
       * @return
       */
     def attributeSplits(name: String, binding: Class[_], options: Map[String, String]): Seq[String] = {
-      val patterns = DefaultSplitter.patterns(s"${AttributeIndex.name}.$name.pattern", options)
-      val ranges = patterns.toSeq.map(SplitPatternParser.parse)
-      if (classOf[Number].isAssignableFrom(binding)) {
-        try {
-          ranges.flatMap(numberPatternSplits(_, binding))
-        } catch {
-          case e: NumberFormatException =>
-            throw new IllegalArgumentException(s"Trying to create splits for attribute '$name' " +
-                s"of type ${binding.getName}, but splits could not be parsed as a number: " +
-                patterns.mkString(" "), e)
+      // each primary pattern is paired with the optional date range that shares its numeric suffix
+      val patterns = DefaultSplitter.patternPairs(s"${AttributeIndex.name}.$name.", "pattern", "date-range", options)
+      val ranges = patterns.map { case (pattern, datePattern) =>
+        (SplitPatternParser.parse(pattern), datePattern.map(SplitPatternParser.parse))
+      }
+      // numeric bindings go through numberPatternSplits, everything else uses the raw pattern range
+      val getSplits: SplitPattern => Seq[String] =
+        if (classOf[Number].isAssignableFrom(binding)) {
+          pattern =>
+            try { numberPatternSplits(pattern, binding) } catch {
+              case e: NumberFormatException =>
+                // report only the primary patterns - the date ranges are not parsed as numbers
+                throw new IllegalArgumentException(s"Trying to create splits for attribute '$name' " +
+                    s"of type ${binding.getName}, but splits could not be parsed as a number: " +
+                    patterns.map(_._1).mkString(" "), e)
+            }
+        } else {
+          pattern => pattern.range
+        }
+      ranges.flatMap { case (primary, secondary) =>
+        val splits = getSplits(primary)
+        secondary match {
+          case None => splits
+          // tiered split: every primary split is crossed with every date split, joined by ZeroByteString
+          case Some(s) => for { a <- splits; b <- s.range } yield { a + ZeroByteString + b }
         }
-      } else {
-        ranges.flatMap(_.range)
       }
     }
 
@@ -233,9 +244,27 @@ object DefaultSplitter {
   private def z2Bytes(options: Map[String, String]): Array[Array[Byte]] =
     Parser.z2Splits(options).map(ByteArrays.toBytes).toArray
 
-  private def patterns(base: String, options: Map[String, String]): Iterator[String] = {
+  /**
+    * Looks up `base`, then `base2`, `base3`, ... in the options, stopping at the first key that is not present
+    *
+    * @param base option key for the first pattern
+    * @param options configured options
+    * @return the configured values, in key order
+    */
+  private def patterns(base: String, options: Map[String, String]): Seq[String] = {
     val keys = Iterator.single(base) ++ Iterator.range(2, Int.MaxValue).map(i => s"$base$i")
-    keys.map(options.get(_).orNull).takeWhile(_ != null)
+    keys.map(options.get(_).orNull).takeWhile(_ != null).toSeq
+  }
+
+  /**
+    * Pairs each configured primary option with its optional secondary option. Keys are looked up as
+    * `prefix + primary` and `prefix + secondary`, then with the numeric suffixes `2`, `3`, ... appended to both.
+    * The lookup stops at the first primary key that is not present.
+    *
+    * @param prefix common key prefix, including any trailing separator
+    * @param primary name of the required option
+    * @param secondary name of the optional option, matched to the primary by numeric suffix
+    * @param options configured options
+    * @return (primary value, secondary value if configured) for each primary key found, in suffix order
+    */
+  private def patternPairs(
+      prefix: String,
+      primary: String,
+      secondary: String,
+      options: Map[String, String]): Seq[(String, Option[String])] = {
+    // an empty suffix for the un-numbered keys, followed by 2, 3, ...
+    val suffixes = Iterator.single("") ++ Iterator.range(2, Int.MaxValue).map(_.toString)
+    val pairs = suffixes.map { suffix =>
+      options.get(s"$prefix$primary$suffix").map(pattern => (pattern, options.get(s"$prefix$secondary$suffix")))
+    }
+    // stop at the first gap in the primary keys, without resorting to a null sentinel
+    pairs.takeWhile(_.isDefined).collect { case Some(pair) => pair }.toSeq
   }
 
   @throws(classOf[NumberFormatException])

diff --git a/...-api/src/main/scala/org/locationtech/geomesa/index/conf/splitter/SplitPatternParser.scala b/...-api/src/main/scala/org/locationtech/geomesa/index/conf/splitter/SplitPatternParser.scala
@@ -9,10 +9,13 @@
 package org.locationtech.geomesa.index.conf.splitter
 
 import org.locationtech.geomesa.index.conf.splitter.SplitPatternParser._
-import org.locationtech.geomesa.utils.text.BasicParser
+import org.locationtech.geomesa.index.index.attribute.AttributeIndexKey
+import org.locationtech.geomesa.utils.text.{BasicParser, DateParsing}
 import org.parboiled.errors.{ErrorUtils, ParsingException}
 import org.parboiled.scala.parserunners.{BasicParseRunner, ReportingParseRunner}
 
+import java.util.Date
+
 /**
   * Parses patterns into splits
   */
@@ -66,6 +69,15 @@ object SplitPatternParser {
     override def range: Seq[String] =
       tiers.map(_.reverse).foldLeft(Seq("-")) { (left, right) => for (a <- left; b <- right) yield { a + b } }
   }
+
+  /**
+    * Splits a date range into evenly sized intervals
+    *
+    * @param from start of the date range, used as the first split point
+    * @param to end of the date range
+    * @param ranges number of splits to create, must be greater than zero
+    */
+  case class DatePattern(from: Date, to: Date, ranges: Int) extends SplitPattern {
+    override def range: Seq[String] = {
+      // fail with a clear message instead of dividing by zero (or silently producing no splits)
+      require(ranges > 0, s"Number of date splits must be greater than zero: $ranges")
+      val interval = (to.getTime - from.getTime) / ranges
+      Seq.tabulate(ranges) { i =>
+        AttributeIndexKey.encodeForQuery(new Date(from.getTime + interval * i), classOf[Date])
+      }
+    }
+  }
 }
 
 private class SplitPatternParser extends BasicParser {
@@ -79,7 +91,7 @@ private class SplitPatternParser extends BasicParser {
   import org.parboiled.scala._
 
   def patterns: Rule1[SplitPattern] = rule {
-    (mixedPatterns | negativePatterns) ~ EOI
+    // alternatives are tried in order, and the one that matches must be followed by end-of-input
+    (mixedPatterns | negativePatterns | dateRange) ~ EOI
   }
 
   private def mixedPatterns: Rule1[SplitPattern] = rule {
@@ -101,6 +113,10 @@ private class SplitPatternParser extends BasicParser {
     "[" ~ { oneOrMore(numeric) ~~> { p => if (p.length == 1) { p.head } else { CompositePattern(p) } } } ~ "]"
   }
 
+  // date range in the form `<date>/<date>/<int>`, passed to DatePattern as from, to and number of splits
+  private def dateRange: Rule1[SplitPattern] = rule {
+    (date ~ "/" ~ date ~ "/" ~ int) ~~> { (from, to, splits) => DatePattern(from, to, splits) }
+  }
+
   private def alpha: Rule1[AlphaPattern] = rule {
     alphaRange | alphaSingle
   }
@@ -128,4 +144,14 @@ private class SplitPatternParser extends BasicParser {
   private def alphaEndpoint: Rule1[Char] = rule { ("a" - "z" | "A" - "Z") ~> { c => c.charAt(0) } }
 
   private def numericEndpoint: Rule1[Byte] = rule { ("0" - "9") ~> { c => c.toByte } }
+
+  // matches `<year>-<month>-<day>` and hands the matched text to DateParsing.parseDate
+  private def date: Rule1[Date] = rule {
+    group(year ~ "-" ~ month ~ "-" ~ day) ~> { date => DateParsing.parseDate(date) }
+  }
+
+  // exactly four digits
+  private def year: Rule0 = rule { nTimes(4, "0" - "9")  }
+
+  // two-digit month, 01 through 12
+  private def month: Rule0 = rule { ("0" ~ ("1" - "9")) | ("1" ~ ("0" - "2")) }
+
+  // two-digit day of month, 01 through 31 (whether the day exists in the given month is not checked here)
+  private def day: Rule0 = rule { ("0" ~ ("1" - "9")) | (("1" - "2") ~ ("0" - "9")) | ("3" ~ ("0" - "1")) }
 }
diff --git a/...api/src/test/scala/org/locationtech/geomesa/index/conf/splitter/DefaultSplitterTest.scala b/...api/src/test/scala/org/locationtech/geomesa/index/conf/splitter/DefaultSplitterTest.scala
@@ -15,10 +15,12 @@ import org.locationtech.geomesa.index.index.id.IdIndex
 import org.locationtech.geomesa.index.index.z3.Z3Index
 import org.locationtech.geomesa.utils.geotools.SimpleFeatureTypes
 import org.locationtech.geomesa.utils.index.ByteArrays
+import org.locationtech.geomesa.utils.text.DateParsing
 import org.specs2.mutable.Specification
 import org.specs2.runner.JUnitRunner
 
 import java.nio.charset.StandardCharsets
+import java.util.Date
 
 @RunWith(classOf[JUnitRunner])
 class DefaultSplitterTest extends Specification {
@@ -93,6 +95,22 @@ class DefaultSplitterTest extends Specification {
           (0 to 9).map(_.toString) ++ (0 to 9).map(i => s"8$i")
     }
 
+    "produce correct string tiered date splits" in {
+      // the date-range has no numeric suffix, so it is paired with `pattern` and not with `pattern2`
+      val opts =
+        "attr.myString.pattern:[A][B][C],attr.myString.pattern2:[B-Z]," +
+        "attr.myString.date-range:2023-09-15/2023-09-16/4"
+      val splits = splitter.getSplits(sft, attrString, opts)
+      // 25 single-letter splits (B through Z) plus 4 date-tiered splits for 'ABC'
+      splits.length must be equalTo 29
+      splits.toSeq must containAllOf(Seq.tabulate(25)(i => Array(('B' + i).toChar.toByte)))
+      // one day divided into 4 gives a split every 6 hours, starting at the begin date
+      val dates =
+        Seq("2023-09-15T00:00:00Z", "2023-09-15T06:00:00Z", "2023-09-15T12:00:00Z", "2023-09-15T18:00:00Z").map { d =>
+          "ABC".getBytes(StandardCharsets.UTF_8) ++
+              ByteArrays.ZeroByteArray ++
+              AttributeIndexKey.encodeForQuery(DateParsing.parseDate(d), classOf[Date]).getBytes(StandardCharsets.UTF_8)
+        }
+      splits.toSeq must containAllOf(dates)
+    }
+
     "produce correct int splits" in {
       val opts = "attr.myInt.pattern:[0-9]"
       val splits = splitter.getSplits(sft, attrInt, opts)

diff --git a/...arent/geomesa-utils/src/main/scala/org/locationtech/geomesa/utils/text/KVPairParser.scala b/...arent/geomesa-utils/src/main/scala/org/locationtech/geomesa/utils/text/KVPairParser.scala
@@ -41,7 +41,7 @@ private class KVPairParser(pairSep: String = ",", kvSep: String = ":") extends B
   }
 
   private def value: Rule1[String] = rule {
-    quotedString | singleQuotedString | oneOrMore(char | anyOf(".-[]%")) ~> { (k) => k }
+    // '/' is accepted in unquoted values, e.g. `2020-01-01/2023-01-01/8`
+    quotedString | singleQuotedString | oneOrMore(char | anyOf(".-[]%/")) ~> { (k) => k }
   }
 
   private def keyValue: Rule1[(String, String)] = rule {