Skip to content

Commit

Permalink
GEOMESA-3297 Support for tiered date index pre-splits (#2996)
Browse files Browse the repository at this point in the history
  • Loading branch information
elahrvivaz authored Sep 27, 2023
1 parent b9e2900 commit 7c287c7
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 33 deletions.
42 changes: 27 additions & 15 deletions docs/user/datastores/index_config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -498,21 +498,23 @@ For the default splitter, ``table.splitter.options`` should consist of comma-sep
``key1:value1,key2:value2``. Entries related to a given index should start with the index identifier, e.g. one
of ``id``, ``z3``, ``z2`` or ``attr`` (``xz3`` and ``xz2`` indices use ``z3`` and ``z2``, respectively).

+------------+-------------------------------+----------------------------------------+
| Index | Option | Description |
+============+===============================+========================================+
| Z3/XZ3 | ``z3.min`` | The minimum date for the data |
+ +-------------------------------+----------------------------------------+
| | ``z3.max`` | The maximum date for the data |
+ +-------------------------------+----------------------------------------+
| | ``z3.bits`` | The number of leading bits to split on |
+------------+-------------------------------+----------------------------------------+
| Z2/XZ2 | ``z2.bits`` | The number of leading bits to split on |
+------------+-------------------------------+----------------------------------------+
| ID | ``id.pattern`` | Split pattern |
+------------+-------------------------------+----------------------------------------+
| Attribute | ``attr.<attribute>.pattern`` | Split pattern for a given attribute |
+------------+-------------------------------+----------------------------------------+
+------------+---------------------------------+----------------------------------------+
| Index | Option | Description |
+============+=================================+========================================+
| Z3/XZ3 | ``z3.min`` | The minimum date for the data |
+ +---------------------------------+----------------------------------------+
| | ``z3.max`` | The maximum date for the data |
+ +---------------------------------+----------------------------------------+
| | ``z3.bits`` | The number of leading bits to split on |
+------------+---------------------------------+----------------------------------------+
| Z2/XZ2 | ``z2.bits`` | The number of leading bits to split on |
+------------+---------------------------------+----------------------------------------+
| ID | ``id.pattern`` | Split pattern |
+------------+---------------------------------+----------------------------------------+
| Attribute | ``attr.<attribute>.pattern`` | Split pattern for a given attribute |
+ +---------------------------------+----------------------------------------+
| | ``attr.<attribute>.date-range`` | Date range for a given attribute |
+------------+---------------------------------+----------------------------------------+

Z3/XZ3 Splits
+++++++++++++
Expand Down Expand Up @@ -556,6 +558,16 @@ prefixing will not work correctly. E.g., if the data has numbers in the range 80
``[8-9][0-9]`` will not split the data properly. Instead, trailing zeros should be added to reach the appropriate
length, e.g. ``[8-9][0-9][0][0]``.

For attribute indices with secondary date indexing, each attribute pattern can be further refined with a date
range, using the suffix ``date-range``. For the example above, you could have any or all of ``attr.name.date-range``,
``attr.name.date-range2`` and ``attr.name.date-range3``. Each date range applies to the attribute pattern with the
same numeric suffix, e.g. ``attr.name.date-range2`` refines ``attr.name.pattern2``. The date range is expected to
take the form of ``begin-date/end-date/number of splits``, and dates are specified in the form ``yyyy-MM-dd``, e.g.
``2020-01-01/2023-01-01/8``. Note that this is mainly useful with low-cardinality attributes: due to the way
secondary indexing works, date suffixes are only useful after full keys. Continuing the example,
``attr.name.pattern:[a][l][i][c][e],attr.name.date-range:2020-01-01/2023-01-01/8`` would effectively partition an
index where everyone is named Alice, but ``attr.name.pattern:[a-z],attr.name.date-range:2020-01-01/2023-01-01/8``
would not be effective unless everyone had a single-letter name.

Full Example
++++++++++++

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ object DefaultSplitter {
// option keys for the minimum and maximum data dates used by the z3 splits, prefixed with the z3 index name
val Z3MinDateOption = s"${Z3Index.name}.min"
val Z3MaxDateOption = s"${Z3Index.name}.max"

// string form of ByteArrays.ZeroByteArray (decoded as UTF-8), inserted between an attribute split value
// and its secondary date split when building tiered attribute splits
private val ZeroByteString = new String(ByteArrays.ZeroByteArray, StandardCharsets.UTF_8)

/**
* Creates splits suitable for a feature ID index. If nothing is specified, will assume a hex distribution.
*
Expand All @@ -80,7 +82,7 @@ object DefaultSplitter {
def idSplits(options: Map[String, String]): Seq[String] = {
val patterns = {
val configured = DefaultSplitter.patterns(s"${IdIndex.name}.pattern", options)
if (configured.hasNext) { configured } else {
if (configured.nonEmpty) { configured } else {
Iterator("[0]", "[4]", "[8]", "[c]") // 4 splits assuming hex layout
}
}
Expand All @@ -102,19 +104,28 @@ object DefaultSplitter {
* @return
*/
def attributeSplits(name: String, binding: Class[_], options: Map[String, String]): Seq[String] = {
val patterns = DefaultSplitter.patterns(s"${AttributeIndex.name}.$name.pattern", options)
val ranges = patterns.toSeq.map(SplitPatternParser.parse)
if (classOf[Number].isAssignableFrom(binding)) {
try {
ranges.flatMap(numberPatternSplits(_, binding))
} catch {
case e: NumberFormatException =>
throw new IllegalArgumentException(s"Trying to create splits for attribute '$name' " +
s"of type ${binding.getName}, but splits could not be parsed as a number: " +
patterns.mkString(" "), e)
val patterns = DefaultSplitter.patternPairs(s"${AttributeIndex.name}.$name.", "pattern", "date-range", options)
val ranges = patterns.map { case (pattern, datePattern) =>
(SplitPatternParser.parse(pattern), datePattern.map(SplitPatternParser.parse))
}
val getSplits: SplitPattern => Seq[String] =
if (classOf[Number].isAssignableFrom(binding)) {
pattern =>
try { numberPatternSplits(pattern, binding) } catch {
case e: NumberFormatException =>
throw new IllegalArgumentException(s"Trying to create splits for attribute '$name' " +
s"of type ${binding.getName}, but splits could not be parsed as a number: " +
patterns.mkString(" "), e)
}
} else {
pattern => pattern.range
}
ranges.flatMap { case (primary, secondary) =>
val splits = getSplits(primary)
secondary match {
case None => splits
case Some(s) => for { a <- splits; b <- s.range } yield { a + ZeroByteString + b }
}
} else {
ranges.flatMap(_.range)
}
}

Expand Down Expand Up @@ -233,9 +244,27 @@ object DefaultSplitter {
// converts each configured z2 split into its byte representation
private def z2Bytes(options: Map[String, String]): Array[Array[Byte]] = {
  val splits = Parser.z2Splits(options)
  splits.map(split => ByteArrays.toBytes(split)).toArray
}

private def patterns(base: String, options: Map[String, String]): Iterator[String] = {
private def patterns(base: String, options: Map[String, String]): Seq[String] = {
val keys = Iterator.single(base) ++ Iterator.range(2, Int.MaxValue).map(i => s"$base$i")
keys.map(options.get(_).orNull).takeWhile(_ != null)
keys.map(options.get(_).orNull).takeWhile(_ != null).toSeq
}

/**
 * Looks up consecutively numbered primary options, each paired with its optional secondary option.
 *
 * Keys are checked in order: `prefix + primary`, then `prefix + primary + 2`, `prefix + primary + 3`, etc,
 * stopping at the first primary key that is not present. Each primary value is paired with the value of the
 * secondary key carrying the same numeric suffix, if there is one.
 *
 * @param prefix common key prefix
 * @param primary name of the primary option, appended to the prefix
 * @param secondary name of the secondary option, appended to the prefix
 * @param options configured options
 * @return pairs of (primary value, optional secondary value), in key order
 */
private def patternPairs(
    prefix: String,
    primary: String,
    secondary: String,
    options: Map[String, String]): Seq[(String, Option[String])] = {
  val primaryBase = s"$prefix$primary"
  val secondaryBase = s"$prefix$secondary"
  // the first key has no suffix, subsequent keys are suffixed with 2, 3, 4, ...
  val suffixes: Iterator[String] = Iterator.single("") ++ Iterator.range(2, Int.MaxValue).map(_.toString)
  val lookups = suffixes.map { suffix =>
    options.get(s"$primaryBase$suffix").map(value => (value, options.get(s"$secondaryBase$suffix")))
  }
  // stop at the first gap in the numbered primary keys
  lookups.takeWhile(_.isDefined).collect { case Some(pair) => pair }.toSeq
}

@throws(classOf[NumberFormatException])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@
package org.locationtech.geomesa.index.conf.splitter

import org.locationtech.geomesa.index.conf.splitter.SplitPatternParser._
import org.locationtech.geomesa.utils.text.BasicParser
import org.locationtech.geomesa.index.index.attribute.AttributeIndexKey
import org.locationtech.geomesa.utils.text.{BasicParser, DateParsing}
import org.parboiled.errors.{ErrorUtils, ParsingException}
import org.parboiled.scala.parserunners.{BasicParseRunner, ReportingParseRunner}

import java.util.Date

/**
* Parses patterns into splits
*/
Expand Down Expand Up @@ -66,6 +69,15 @@ object SplitPatternParser {
override def range: Seq[String] =
tiers.map(_.reverse).foldLeft(Seq("-")) { (left, right) => for (a <- left; b <- right) yield { a + b } }
}

/**
 * Pattern that divides a date range into evenly sized intervals, producing one split per interval.
 *
 * @param from start of the date range, which is also the first split point
 * @param to end of the date range
 * @param ranges number of splits to create, must be positive
 */
case class DatePattern(from: Date, to: Date, ranges: Int) extends SplitPattern {

  /**
   * Gets the split points, encoded with `AttributeIndexKey.encodeForQuery`.
   *
   * @throws IllegalArgumentException if the number of splits is not positive
   * @return one encoded split per interval, starting at `from`
   */
  override def range: Seq[String] = {
    // fail with a descriptive error instead of an opaque '/ by zero' ArithmeticException
    if (ranges < 1) {
      throw new IllegalArgumentException(s"Number of date splits must be a positive number, but was: $ranges")
    }
    val start = from.getTime
    val interval = (to.getTime - start) / ranges
    Seq.tabulate(ranges) { i =>
      AttributeIndexKey.encodeForQuery(new Date(start + interval * i), classOf[Date])
    }
  }
}
}

private class SplitPatternParser extends BasicParser {
Expand All @@ -79,7 +91,7 @@ private class SplitPatternParser extends BasicParser {
import org.parboiled.scala._

def patterns: Rule1[SplitPattern] = rule {
(mixedPatterns | negativePatterns) ~ EOI
(mixedPatterns | negativePatterns | dateRange) ~ EOI
}

private def mixedPatterns: Rule1[SplitPattern] = rule {
Expand All @@ -101,6 +113,10 @@ private class SplitPatternParser extends BasicParser {
"[" ~ { oneOrMore(numeric) ~~> { p => if (p.length == 1) { p.head } else { CompositePattern(p) } } } ~ "]"
}

// matches `<date>/<date>/<int>` and builds a DatePattern from the start date, end date and number of splits
private def dateRange: Rule1[SplitPattern] = rule {
  (date ~ "/" ~ date ~ "/" ~ int) ~~> { (start, end, count) => DatePattern(from = start, to = end, ranges = count) }
}

// an alphabetic pattern element: tries `alphaRange` first, then falls back to `alphaSingle`
private def alpha: Rule1[AlphaPattern] = rule {
alphaRange | alphaSingle
}
Expand Down Expand Up @@ -128,4 +144,14 @@ private class SplitPatternParser extends BasicParser {
// a single ASCII letter (either case), pushed as a Char
private def alphaEndpoint: Rule1[Char] = rule {
  (("a" - "z") | ("A" - "Z")) ~> { matched => matched.charAt(0) }
}

// a single decimal digit, with the matched text converted to a Byte
private def numericEndpoint: Rule1[Byte] = rule {
  ("0" - "9") ~> { digit => digit.toByte }
}

// matches a `yyyy-MM-dd` date and converts the matched text with DateParsing.parseDate
private def date: Rule1[Date] = rule {
  group(year ~ "-" ~ month ~ "-" ~ day) ~> { matched => DateParsing.parseDate(matched) }
}

// a year, as exactly four decimal digits
private def year: Rule0 = rule { nTimes(4, "0" - "9") }

// a two-digit month, 01 through 12 - the valid second digit depends on the first, so that
// October ('10') is accepted and out-of-range values such as '00' or '13' are rejected
private def month: Rule0 = rule { ("0" ~ ("1" - "9")) | ("1" ~ ("0" - "2")) }

// a two-digit day of the month
// NOTE(review): this also matches '00' and '32' through '39' - presumably the date parsing applied to the
// matched text rejects those, but that is not visible here - confirm
private def day: Rule0 = rule { ("0" - "3") ~ ("0" - "9") }
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@ import org.locationtech.geomesa.index.index.id.IdIndex
import org.locationtech.geomesa.index.index.z3.Z3Index
import org.locationtech.geomesa.utils.geotools.SimpleFeatureTypes
import org.locationtech.geomesa.utils.index.ByteArrays
import org.locationtech.geomesa.utils.text.DateParsing
import org.specs2.mutable.Specification
import org.specs2.runner.JUnitRunner

import java.nio.charset.StandardCharsets
import java.util.Date

@RunWith(classOf[JUnitRunner])
class DefaultSplitterTest extends Specification {
Expand Down Expand Up @@ -93,6 +95,22 @@ class DefaultSplitterTest extends Specification {
(0 to 9).map(_.toString) ++ (0 to 9).map(i => s"8$i")
}

"produce correct string tiered date splits" in {
// the first pattern has a matching date-range, while pattern2 has no date-range2 and so is not tiered
val opts =
"attr.myString.pattern:[A][B][C],attr.myString.pattern2:[B-Z]," +
"attr.myString.date-range:2023-09-15/2023-09-16/4"
val splits = splitter.getSplits(sft, attrString, opts)
// 4 date-tiered splits for 'ABC' plus 25 single-letter splits for 'B' through 'Z'
splits.length must be equalTo 29
splits.toSeq must containAllOf(Seq.tabulate(25)(i => Array(('B' + i).toChar.toByte)))
// a one day range divided into 4 gives a split every 6 hours, starting at the beginning of the range
val dates =
Seq("2023-09-15T00:00:00Z", "2023-09-15T06:00:00Z", "2023-09-15T12:00:00Z", "2023-09-15T18:00:00Z").map { d =>
// expected layout of a tiered split: attribute value, then a zero byte separator, then the encoded date
"ABC".getBytes(StandardCharsets.UTF_8) ++
ByteArrays.ZeroByteArray ++
AttributeIndexKey.encodeForQuery(DateParsing.parseDate(d), classOf[Date]).getBytes(StandardCharsets.UTF_8)
}
splits.toSeq must containAllOf(dates)
}

"produce correct int splits" in {
val opts = "attr.myInt.pattern:[0-9]"
val splits = splitter.getSplits(sft, attrInt, opts)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ private class KVPairParser(pairSep: String = ",", kvSep: String = ":") extends B
}

private def value: Rule1[String] = rule {
quotedString | singleQuotedString | oneOrMore(char | anyOf(".-[]%")) ~> { (k) => k }
quotedString | singleQuotedString | oneOrMore(char | anyOf(".-[]%/")) ~> { (k) => k }
}

private def keyValue: Rule1[(String, String)] = rule {
Expand Down

0 comments on commit 7c287c7

Please sign in to comment.