@@ -24,11 +24,11 @@ import com._4paradigm.openmldb.batch.window.{WindowAggPlanUtil, WindowComputer}
 import com._4paradigm.openmldb.batch.{OpenmldbBatchConfig, PlanContext, SparkInstance}
 import com._4paradigm.openmldb.common.codec.CodecUtil
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.JoinedRow
 import org.apache.spark.sql.types.{DateType, LongType, StructType, TimestampType}
-import org.apache.spark.sql.{DataFrame, Row, functions}
+import org.apache.spark.sql.{Column, DataFrame, Row, functions}
 import org.apache.spark.util.SerializableConfiguration
 import org.slf4j.LoggerFactory
+
 import scala.collection.mutable

 /** The planner which implements window agg physical node.
@@ -67,17 +67,26 @@ object WindowAggPlan {
     val dfWithIndex = inputTable.getDfConsideringIndex(ctx, physicalNode.GetNodeId())

     // Do union if physical node has union flag
+    val uniqueColName = "_WINDOW_UNION_FLAG_" + System.currentTimeMillis()
     val unionTable = if (isWindowWithUnion) {
-      WindowAggPlanUtil.windowUnionTables(ctx, physicalNode, dfWithIndex)
+      WindowAggPlanUtil.windowUnionTables(ctx, physicalNode, dfWithIndex, uniqueColName)
     } else {
       dfWithIndex
     }

-    // Do groupby and sort with window skew optimization or not
+    // Use order by to make sure that rows with the same timestamp from the primary table are placed last
+    // TODO(tobe): support desc if we get config from physical plan
+    val unionSparkCol: Option[Column] = if (isWindowWithUnion) {
+      Some(unionTable.col(uniqueColName))
+    } else {
+      None
+    }
+
+    // Do group by and sort with window skew optimization or not
     val repartitionDf = if (isWindowSkewOptimization) {
-      windowPartitionWithSkewOpt(ctx, physicalNode, unionTable, windowAggConfig)
+      windowPartitionWithSkewOpt(ctx, physicalNode, unionTable, windowAggConfig, unionSparkCol)
     } else {
-      windowPartition(ctx, physicalNode, unionTable)
+      windowPartition(ctx, physicalNode, unionTable, unionSparkCol)
     }

     // Get the output schema which may add the index column
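To illustrate the ordering trick above, here is a minimal, self-contained sketch, not part of the patch. The data, flag values, and column names are hypothetical, assuming union rows carry a smaller flag value than primary-table rows; it shows how appending a flag column to the sort keys places primary rows last among rows sharing a timestamp:

// Minimal sketch of the union-flag ordering idea; all names and data are hypothetical.
import org.apache.spark.sql.SparkSession

object UnionFlagSortSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("union-flag-sketch").getOrCreate()
    import spark.implicits._

    // Assume union rows carry flag 0 and primary rows carry flag 1,
    // standing in for a hypothetical _WINDOW_UNION_FLAG_ column.
    val df = Seq(
      ("k1", 100L, 1), // primary row
      ("k1", 100L, 0), // union row with the same timestamp
      ("k1", 99L, 0)
    ).toDF("key", "ts", "flag")

    // Sorting within partitions by (key, ts, flag) ascending places the
    // primary row (flag = 1) after the union row that shares ts = 100.
    df.repartition($"key")
      .sortWithinPartitions($"key", $"ts", $"flag")
      .show()

    spark.stop()
  }
}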
@@ -179,7 +188,8 @@ object WindowAggPlan {
   def windowPartitionWithSkewOpt(ctx: PlanContext,
                                  windowAggNode: PhysicalWindowAggrerationNode,
                                  inputDf: DataFrame,
-                                 windowAggConfig: WindowAggConfig): DataFrame = {
+                                 windowAggConfig: WindowAggConfig,
+                                 unionSparkCol: Option[Column]): DataFrame = {
     val uniqueNamePostfix = ctx.getConf.windowSkewOptPostfix

     // Cache the input table which may be used multiple times
@@ -274,7 +284,12 @@ object WindowAggPlan {
     }

     val sortedByCol = PhysicalNodeUtil.getOrderbyColumns(windowAggNode, addColumnsDf)
-    val sortedByCols = repartitionCols ++ sortedByCol
+
+    val sortedByCols = if (unionSparkCol.isEmpty) {
+      repartitionCols ++ sortedByCol
+    } else {
+      repartitionCols ++ sortedByCol ++ Array(unionSparkCol.get)
+    }

     // Notice that we should make sure the keys in the same partition are ordered as well
     val sortedDf = repartitionDf.sortWithinPartitions(sortedByCols: _*)
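As a side note on the branch above: since Option.toSeq yields an empty Seq for None, the two branches could be collapsed into one expression. A sketch of the equivalent pattern; the helper name is mine, not in the patch:

// Equivalent pattern using Option.toSeq; helper name is hypothetical.
import org.apache.spark.sql.{Column, DataFrame}

def sortWithinPartitionsWithOptionalCol(df: DataFrame,
                                        baseCols: Seq[Column],
                                        extraCol: Option[Column]): DataFrame = {
  // Option.toSeq is empty for None, so both branches of the if/else
  // collapse into a single sortWithinPartitions call.
  df.sortWithinPartitions(baseCols ++ extraCol.toSeq: _*)
}

Using sortWithinPartitions rather than a global orderBy is also deliberate: the window computation only needs rows ordered within each partition, so it avoids the extra shuffle a total sort would require.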
@@ -289,7 +304,8 @@ object WindowAggPlan {
   * 1. Repartition the table with the "partition by" keys.
   * 2. Sort the data within partitions with the "order by" keys.
   */
-  def windowPartition(ctx: PlanContext, windowAggNode: PhysicalWindowAggrerationNode, inputDf: DataFrame): DataFrame = {
+  def windowPartition(ctx: PlanContext, windowAggNode: PhysicalWindowAggrerationNode, inputDf: DataFrame,
+                      unionSparkCol: Option[Column]): DataFrame = {

     // Repartition the table with window keys
     val repartitionCols = PhysicalNodeUtil.getRepartitionColumns(windowAggNode, inputDf)
@@ -302,9 +318,12 @@ object WindowAggPlan {
     // Sort with the window orderby keys
     val orderbyCols = PhysicalNodeUtil.getOrderbyColumns(windowAggNode, inputDf)

+    val sortedDf = if (unionSparkCol.isEmpty) {
+      repartitionDf.sortWithinPartitions(repartitionCols ++ orderbyCols: _*)
+    } else {
+      repartitionDf.sortWithinPartitions(repartitionCols ++ orderbyCols ++ Array(unionSparkCol.get): _*)
+    }
     // Notice that we should make sure the keys in the same partition are ordered as well
-    val sortedDf = repartitionDf.sortWithinPartitions(repartitionCols ++ orderbyCols: _*)
-
     sortedDf
   }