diff --git a/docs/QbeastTable.md b/docs/QbeastTable.md index 5cfa1ae2d..dbf72532d 100644 --- a/docs/QbeastTable.md +++ b/docs/QbeastTable.md @@ -39,7 +39,7 @@ qbeastTable.optimize() // optimizes the cubes You can use it to **compare the index** build **with different indexing parameters** such as the `desiredCubeSize` and `columnsToIndex`. -If you're experimenting with new ways of implementing the OTree algorithm, you can also use this API to analyze the resulting index! +This is meant to be used as an easy access point to analyze the resulting index, which should come in handy for comparing different index parameters or even implementations. ```scala val metrics = qbeastTable.getIndexMetrics() @@ -48,45 +48,49 @@ println(metrics) // EXAMPLE OUTPUT -OTree Index Metrics: -dimensionCount: 2 -elementCount: 1001 -depth: 2 -cubeCounts: 7 -desiredCubeSize: 100 -avgFanOut: 2.0 -depthOverLogNumNodes: 0.7124143742160444 -depthOnBalance: 0.6020599913279624 -Non-lead Cube Size Stats: -(All values are 0 if there's no non-leaf cubes): -- min: 3729 -- firstQuartile: 3729 -- secondQuartile: 4832 -- thirdQuartile: 5084 -- max: 5084 -- dev: 2.0133907E7 +Tree Index Metrics: +dimensionCount: 3 +elementCount: 2879966589 +depth: 7 +cubeCount: 22217 +desiredCubeSize: 500000 +avgFanOut: 8.0 +depthOnBalance: 1.4740213633300192 + +Non-leaf Cube Size Stats: +Quartiles: +- min: 482642 +- 1stQ: 542859 +- 2ndQ: 557161 +- 3rdQ: 576939 +- max: 633266 +- dev(l1, l2): (0.11743615196254953, 0.0023669553335121983) + +(level, average weight, average cube size): +(0, (1.6781478192839184E-4,482642)) +(1, (0.001726577786432248,550513)) +(2, (0.014704148241220776,566831)) +(3, (0.1260420146029599,570841)) +(4, (0.7243052757165773, 557425)) +(5, (0.4040913470739245,527043)) +(6, (0.8873759316622165, 513460)) ``` ## Metrics ### 1. General index metadata: -- **Desired cube size**: the desired cube size choosed by the user, or the one that was automatically calculated. 
-- **Number of cubes**: the number of cubes in the index. -- **Tree depth**: the number of levels of the tree. -- **Average fan out of the cubes**: the average number of children per non-leaf cube. For this metric, it is better to get closer to `2^(numberOfDimensions)`. -- **Dimension count**: the number of dimensions (indexed columns) in the index. -- **Number of rows**: the total number of elements. - -### 2. Some more specific details such as: -- **depthOverLogNumNodes = depth / log(cubeCounts)** -- **depthOnBalance = depth / log(rowCount/desiredCubeSize)** - - both logs use **base = dimensionCount** - -### 3. Cube sizes for non-leaf cubes: -- `NonLeafCubeSizeDetails` contains their **min**, **max**, **quantiles**, and how far each of the cube sizes are from the `desiredCubeSize`(**dev**). - -### 4. `Map[CubeId, CubeStatus]` -- Some information from the map can be interesting to analyze - for example, the **distribution of cube weights**. - - You can access this information through `metrics.cubeStatuses`. +- **dimensionCount**: the number of dimensions (indexed columns) in the index. +- **elementCount**: the number of rows in the table. +- **desiredCubeSize**: the desired cube size chosen at the moment of indexing. +- **cubeCount**: the number of nodes in the index tree. +- **depth**: the number of levels in the tree. +- **avgFanOut**: the average number of children per non-leaf cube. The max value for this metric is `2 ^ dimensionCount`. +- **depthOnBalance**: the ratio between the actual depth of the tree and the theoretical depth of a tree with the same number of cubes and the maximum fan out. + +### 2. Cube sizes for non-leaf cubes: +`Non-leaf cube size stats` is meant to describe the distribution of inner cube sizes: +- **min**, **max**, **quartiles**, and how far the cube sizes are from the `desiredCubeSize` (**l1 and l2 error**). +- The average normalizedWeight and cube size per level. + +### 3. 
`Map[CubeId, CubeStatus]` +- More information can be extracted from the index tree through `metrics.cubeStatuses`. diff --git a/src/main/scala/io/qbeast/spark/QbeastTable.scala b/src/main/scala/io/qbeast/spark/QbeastTable.scala index 887632503..c8dcaedcd 100644 --- a/src/main/scala/io/qbeast/spark/QbeastTable.scala +++ b/src/main/scala/io/qbeast/spark/QbeastTable.scala @@ -11,6 +11,8 @@ import io.qbeast.spark.table._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.delta.DeltaLog +import scala.collection.immutable.SortedMap + /** * Class for interacting with QbeastTable at a user level * @@ -79,57 +81,90 @@ class QbeastTable private ( val cubeCount = allCubeStatuses.size val depth = allCubeStatuses.map(_._1.depth).max - val elementCount = allCubeStatuses.flatMap(_._2.files.map(_.elementCount)).sum + val rowCount = allCubeStatuses.flatMap(_._2.files.map(_.elementCount)).sum val dimensionCount = indexedColumns().size val desiredCubeSize = cubeSize() - val depthOverLogNumNodes = depth / logOfBase(dimensionCount, cubeCount) - val depthOnBalance = depth / logOfBase(dimensionCount, elementCount / desiredCubeSize) - - val nonLeafStatuses = - allCubeStatuses.filter(_._1.children.exists(allCubeStatuses.contains)).values - val nonLeafCubeSizes = nonLeafStatuses.map(_.files.map(_.elementCount).sum).toSeq.sorted - - val (avgFanOut, details) = - if (nonLeafStatuses.isEmpty || nonLeafCubeSizes.isEmpty) { - (0, NonLeafCubeSizeDetails(0, 0, 0, 0, 0, 0)) - } else { - val nonLeafCubeSizeDeviation = - nonLeafCubeSizes - .map(cubeSize => math.pow(cubeSize - desiredCubeSize, 2) / nonLeafCubeSizes.size) - .sum - - ( - nonLeafStatuses - .map(_.cubeId.children.count(allCubeStatuses.contains)) - .sum / nonLeafStatuses.size, - NonLeafCubeSizeDetails( - nonLeafCubeSizes.min, - nonLeafCubeSizes((nonLeafCubeSizes.size * 0.25).toInt), - nonLeafCubeSizes((nonLeafCubeSizes.size * 0.50).toInt), - nonLeafCubeSizes((nonLeafCubeSizes.size * 0.75).toInt), - nonLeafCubeSizes.max, - 
nonLeafCubeSizeDeviation)) - } + val (avgFanOut, details) = getInnerCubeSizeDetails(allCubeStatuses, desiredCubeSize) IndexMetrics( allCubeStatuses, dimensionCount, - elementCount, + rowCount, depth, cubeCount, desiredCubeSize, avgFanOut, - depthOverLogNumNodes, - depthOnBalance, + depthOnBalance(depth, cubeCount, dimensionCount), details) } - def logOfBase(base: Int, value: Double): Double = { + private def logOfBase(base: Int, value: Double): Double = { math.log10(value) / math.log10(base) } + /** Actual depth over the depth of a full tree with cubeCount nodes and max fan out. */ + private def depthOnBalance(depth: Int, cubeCount: Int, dimensionCount: Int): Double = { + val c = math.pow(2, dimensionCount).toInt + val theoreticalDepth = logOfBase(c, 1.0 - cubeCount.toDouble * (1 - c)) - 1 + depth / theoreticalDepth + } + + /** Average fan out and size statistics of the cubes having at least one child. */ + private def getInnerCubeSizeDetails( + cubeStatuses: SortedMap[CubeId, CubeStatus], + desiredCubeSize: Int): (Double, NonLeafCubeSizeDetails) = { + val innerCubeStatuses = + cubeStatuses.filter(_._1.children.exists(cubeStatuses.contains)) + val innerCubeSizes = + innerCubeStatuses.values.map(_.files.map(_.elementCount).sum).toSeq.sorted + val innerCubeCount = innerCubeSizes.size.toDouble + + // keys is backed by a Set, so mapping it directly would collapse equal child counts + val avgFanOut = innerCubeStatuses.keysIterator + .map(_.children.count(cubeStatuses.contains)) + .sum + .toDouble / innerCubeCount + + val details = + if (innerCubeCount == 0) { + NonLeafCubeSizeDetails(0, 0, 0, 0, 0, 0, 0, "") + } else { + val l1_dev = innerCubeSizes + .map(cs => math.abs(cs - desiredCubeSize)) + .sum / innerCubeCount / desiredCubeSize + + val l2_dev = math.sqrt( + innerCubeSizes + .map(cs => (cs - desiredCubeSize) * (cs - desiredCubeSize)) + .sum) / innerCubeCount / desiredCubeSize + + val levelStats = "\n(level, average weight, average cube size):\n" + + innerCubeStatuses + .groupBy(cw => cw._1.depth) + .mapValues { m => + val weights = m.values.map(_.normalizedWeight) + val elementCounts = m.values.map(_.files.map(_.elementCount).sum) + (weights.sum / weights.size, elementCounts.sum / elementCounts.size) + } + .toSeq + .sortBy(_._1) + 
.mkString("\n") + NonLeafCubeSizeDetails( + innerCubeSizes.min, + innerCubeSizes((innerCubeCount * 0.25).toInt), + innerCubeSizes((innerCubeCount * 0.50).toInt), + innerCubeSizes((innerCubeCount * 0.75).toInt), + innerCubeSizes.max, + l1_dev, + l2_dev, + levelStats) + } + (avgFanOut, details) + } + /** * Outputs the indexed columns of the table * @param revisionID the identifier of the revision. @@ -192,17 +227,21 @@ case class NonLeafCubeSizeDetails( secondQuartile: Long, thirdQuartile: Long, max: Long, - dev: Double) { + l1_dev: Double, + l2_dev: Double, + levelStats: String) { override def toString: String = { - s"""Non-leaf Cube Size Stats - |(All values are 0 if there's no non-leaf cubes): + s"""Non-leaf Cube Size Stats: + |Quartiles: |- min: $min - |- firstQuartile: $firstQuartile - |- secondQuartile: $secondQuartile - |- thirdQuartile: $thirdQuartile + |- 1stQ: $firstQuartile + |- 2ndQ: $secondQuartile + |- 3rdQ: $thirdQuartile |- max: $max - |- dev: $dev + |- l1_dev: $l1_dev + |- l2_dev: $l2_dev + |$levelStats |""".stripMargin } @@ -216,7 +255,6 @@ case class IndexMetrics( cubeCount: Int, desiredCubeSize: Int, avgFanOut: Double, - depthOverLogNumNodes: Double, depthOnBalance: Double, nonLeafCubeSizeDetails: NonLeafCubeSizeDetails) { @@ -228,9 +266,8 @@ case class IndexMetrics( |cubeCount: $cubeCount |desiredCubeSize: $desiredCubeSize |avgFanOut: $avgFanOut - |depthOverLogNumNodes: $depthOverLogNumNodes |depthOnBalance: $depthOnBalance - |$nonLeafCubeSizeDetails + |\n$nonLeafCubeSizeDetails |""".stripMargin } diff --git a/src/test/scala/io/qbeast/spark/utils/QbeastTableTest.scala b/src/test/scala/io/qbeast/spark/utils/QbeastTableTest.scala index a8de9c59f..9796ce2d0 100644 --- a/src/test/scala/io/qbeast/spark/utils/QbeastTableTest.scala +++ b/src/test/scala/io/qbeast/spark/utils/QbeastTableTest.scala @@ -104,19 +104,38 @@ class QbeastTableTest extends QbeastIntegrationTestSpec { val cubeSize = 100 writeTestData(data, columnsToIndex, cubeSize, tmpDir) - val 
qbeastTable = QbeastTable.forPath(spark, tmpDir) - val metrics = qbeastTable.getIndexMetrics() + val metrics = QbeastTable.forPath(spark, tmpDir).getIndexMetrics() + val details = metrics.nonLeafCubeSizeDetails + + // scalastyle:off println + println(metrics) + // scalastyle:on metrics.elementCount shouldBe data.count() metrics.dimensionCount shouldBe columnsToIndex.size - metrics.nonLeafCubeSizeDetails.min shouldBe <=(metrics.nonLeafCubeSizeDetails.firstQuartile) - metrics.nonLeafCubeSizeDetails.firstQuartile shouldBe <=( - metrics.nonLeafCubeSizeDetails.secondQuartile) - metrics.nonLeafCubeSizeDetails.secondQuartile shouldBe <=( - metrics.nonLeafCubeSizeDetails.thirdQuartile) - metrics.nonLeafCubeSizeDetails.thirdQuartile shouldBe <=(metrics.nonLeafCubeSizeDetails.max) metrics.desiredCubeSize shouldBe cubeSize + details.min shouldBe <=(details.firstQuartile) + details.firstQuartile shouldBe <=(details.secondQuartile) + details.secondQuartile shouldBe <=(details.thirdQuartile) + details.thirdQuartile shouldBe <=(details.max) + } } + + it should "single cube tree correctly" in + withQbeastContextSparkAndTmpDir { (spark, tmpDir) => + { + val data = createDF(spark) + val columnsToIndex = Seq("age", "val2") + val cubeSize = 5000 + writeTestData(data, columnsToIndex, cubeSize, tmpDir) + + val qbeastTable = QbeastTable.forPath(spark, tmpDir) + val metrics = qbeastTable.getIndexMetrics() + + metrics.depth shouldBe 0 + metrics.avgFanOut.isNaN shouldBe true + } + } }