NVIDIA · parthosa · Sep 10, 2024 · Sep 10, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/core/src/main/resources/photonOperatorMapping.json b/core/src/main/resources/photonOperatorMapping.json
@@ -0,0 +1,99 @@
+{
+  "PhotonAdapter": [
+    "Scan"
+  ],
+  "PhotonScan": [
+    "Scan"
+  ],
+  "PhotonSubqueryBroadcast": [
+    "SubqueryBroadcast"
+  ],
+  "PhotonResultStage": [
+    "WholeStageCodegen"
+  ],
+  "PhotonUnionShuffleExchangeSink": [
+    "Union"
+  ],
+  "PhotonUnionShuffleMapStage": [
+    "WholeStageCodegen"
+  ],
+  "PhotonTopK": [
+    "TakeOrderedAndProject"
+  ],
+  "PhotonAgg": [
+    "HashAggregate",
+    "SortAggregate",
+    "ObjectHashAggregate"
+  ],
+  "PhotonBroadcastExchange": [
+    "BroadcastExchange"
+  ],
+  "PhotonBroadcastHashJoin": [
+    "BroadcastHashJoin"
+  ],
+  "PhotonBroadcastNestedLoopJoin": [
+    "BroadcastNestedLoopJoin"
+  ],
+  "PhotonExpand": [
+    "Expand"
+  ],
+  "PhotonFilter": [
+    "Filter"
+  ],
+  "PhotonGenerate": [
+    "Generate"
+  ],
+  "PhotonGlobalLimit": [
+    "GlobalLimit"
+  ],
+  "PhotonGroupingAgg": [
+    "HashAggregate",
+    "SortAggregate",
+    "ObjectHashAggregate"
+  ],
+  "PhotonGroupingAggWithRollup": [
+    "HashAggregate"
+  ],
+  "PhotonHashJoin": [
+    "BroadcastHashJoin"
+  ],
+  "PhotonLocalLimit": [
+    "LocalLimit"
+  ],
+  "PhotonProject": [
+    "Project"
+  ],
+  "PhotonRowToColumnar": [
+    "RowToColumnar"
+  ],
+  "PhotonShuffledHashJoin": [
+    "SortMergeJoin"
+  ],
+  "PhotonShuffleExchangeSink": [
+    "Exchange",
+    "StageBoundary",
+    "BroadcastExchange"
+  ],
+  "PhotonShuffleExchangeSource": [
+    "Exchange",
+    "StageBoundary",
+    "AQEShuffleRead",
+    "ShuffleQueryStage"
+  ],
+  "PhotonShuffleHashJoin": [
+    "ShuffledHashJoin"
+  ],
+  "PhotonShuffleMapStage": [
+    "WholeStageCodegen"
+  ],
+  "PhotonSort": [
+    "Sort"
+  ],
+  "PhotonUnion": [
+    "Union"
+  ],
+  "PhotonWindow": [
+    "Window",
+    "RunningWindowFunction"
+  ]
+}
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/DatabricksParseHelper.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/DatabricksParseHelper.scala
@@ -17,11 +17,15 @@
 package com.nvidia.spark.rapids.tool.planparser
 
 import scala.util.control.NonFatal
+import scala.util.matching.Regex
 
-import org.json4s.DefaultFormats
+import org.json4s.{DefaultFormats, Formats}
+import org.json4s.jackson.JsonMethods
 import org.json4s.jackson.JsonMethods.parse
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.sql.execution.SparkPlanInfo
+import org.apache.spark.sql.rapids.tool.util.UTF8Source
 
 // Utilities used to handle Databricks and Photon Ops
 object DatabricksParseHelper extends Logging {
@@ -42,6 +46,9 @@ object DatabricksParseHelper extends Logging {
   val SUB_PROP_CLUSTER_ID = "ClusterId"
   val SUB_PROP_JOB_ID = "JobId"
   val SUB_PROP_RUN_NAME = "RunName"
+
+  private val PHOTON_PATTERN: Regex = "Photon[a-zA-Z]*".r // "Photon" followed by ASCII letters
+  private val PHOTON_OPERATOR_MAPPING_FILE = "photonOperatorMapping.json" // mapping resource
   /**
    * Checks if the properties indicate that the application is a Photon app.
    * This ca be checked by looking for keywords in one of the keys defined in PHOTON_SPARK_PROPS
@@ -108,4 +115,41 @@ object DatabricksParseHelper extends Logging {
         Map.empty
     }
   }
+
+  /**
+   * Photon operator name -> first Spark operator name listed for it in the JSON resource
+   * PHOTON_OPERATOR_MAPPING_FILE. Entries with an empty list are skipped; built as a strict Map.
+   */
+  private lazy val photonToSparkMapping: Map[String, String] = {
+    // json4s `extract` needs an implicit Formats in scope
+    implicit val formats: Formats = DefaultFormats
+    val jsonString = UTF8Source.fromResource(PHOTON_OPERATOR_MAPPING_FILE).mkString
+    // TODO: Instead of only extracting the first value, we should consider extracting all values
+    JsonMethods.parse(jsonString).extract[Map[String, List[String]]]
+      .collect { case (photonOp, sparkOp :: _) => photonOp -> sparkOp }
+  }
+
+  /**
+   * Translates a Photon plan node into its Spark equivalent.
+   * @param planInfo plan node whose name may contain a Photon operator
+   * @return a renamed copy of the node, or None when the name contains no Photon operator
+   *         or that operator has no entry in the mapping
+   */
+  def processPhotonPlan(planInfo: SparkPlanInfo): Option[SparkPlanInfo] = {
+    val originalName = planInfo.nodeName
+    for {
+      // First substring of the node name matching the Photon pattern, then its mapped name
+      photonOp <- PHOTON_PATTERN.findFirstIn(originalName)
+      sparkOp <- photonToSparkMapping.get(photonOp)
+    } yield {
+      // Node name becomes the Spark name; the description has each occurrence of the
+      // original node name swapped for it. Children, metadata and metrics are kept.
+      new SparkPlanInfo(
+        nodeName = sparkOp,
+        simpleString = planInfo.simpleString.replace(originalName, sparkOp),
+        children = planInfo.children,
+        metadata = planInfo.metadata,
+        metrics = planInfo.metrics)
+    }
+  }
 }
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala
@@ -936,9 +936,16 @@ object SQLPlanParser extends Logging {
     // BuildRight, LeftOuter, ((CEIL(cast(id1#1490 as double)) <= cast(id2#1496 as bigint))
     // AND (cast(id1#1490 as bigint) < CEIL(cast(id2#1496 as double))))
     // Get joinType and buildSide by splitting the input string.
-    val nestedLoopParameters = exprStr.split(",", 3)
-    val buildSide = nestedLoopParameters(0).trim
-    val joinType = nestedLoopParameters(1).trim
+    val nestedLoopParameters = exprStr.split(",", 3).map(_.trim)
+    // Two orderings are handled: "BuildRight, Inner" and "Inner, BuildRight" (the latter is
+    // present in Photon Event logs). Elements are already trimmed by the map above.
+    val secondIsJoinType = JoinType.allsupportedJoinType.contains(nestedLoopParameters(1))
+    val (buildSide, joinType) =
+      if (secondIsJoinType) {
+        (nestedLoopParameters(0), nestedLoopParameters(1))
+      } else {
+        (nestedLoopParameters(1), nestedLoopParameters(0))
+      }
 
     // Check if condition present on join columns else return empty array
     val parsedExpressions = if (nestedLoopParameters.size > 2) {

diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala
@@ -46,6 +46,7 @@ import org.apache.spark.sql.rapids.tool.profiling.{ApplicationInfo, SparkPlanInf
 object GenerateDot {
   val GPU_COLOR = "#76b900" // NVIDIA Green
   val CPU_COLOR = "#0071c5"
+  // TODO: Add color for Photon nodes
   val TRANSITION_COLOR = "red"
 
   def formatMetric(m: SQLMetricInfo, value: Long): String = {

diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualOutputWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualOutputWriter.scala
@@ -386,7 +386,8 @@ case class FormattedQualificationSummaryInfo(
     unSupportedExprs: String,
     clusterTags: Map[String, String],
     estimatedFrequency: Long,
-    totalCoreSec: Long)
+    totalCoreSec: Long,
+    isPhoton: Boolean)
 
 object QualOutputWriter {
   val NON_SQL_TASK_DURATION_STR = "NonSQL Task Duration"
@@ -466,6 +467,7 @@ object QualOutputWriter {
   val RECOMMENDED_WORKER_NODE_TYPE = "Recommended Worker Node Type"
   val DRIVER_NODE_TYPE = "Driver Node Type"
   val TOTAL_CORE_SEC = "Total Core Seconds"
+  val IS_PHOTON = "Photon App"
   // Default frequency for jobs with a single instance is 30 times every month (30 days)
   val DEFAULT_JOB_FREQUENCY = 30L
   val APP_DUR_STR_SIZE: Int = APP_DUR_STR.size
@@ -642,7 +644,8 @@ object QualOutputWriter {
       UNSUPPORTED_EXECS -> UNSUPPORTED_EXECS.size,
       UNSUPPORTED_EXPRS -> UNSUPPORTED_EXPRS.size,
       ESTIMATED_FREQUENCY -> ESTIMATED_FREQUENCY.size,
-      TOTAL_CORE_SEC -> TOTAL_CORE_SEC.size
+      TOTAL_CORE_SEC -> TOTAL_CORE_SEC.size,
+      IS_PHOTON -> IS_PHOTON.length
     )
     if (appInfos.exists(_.clusterTags.nonEmpty)) {
       detailedHeadersAndFields += (CLUSTER_TAGS -> getMaxSizeForHeader(
@@ -1105,7 +1108,8 @@ object QualOutputWriter {
       appInfo.unSupportedExprs,
       appInfo.allClusterTagsMap,
       appInfo.estimatedFrequency.getOrElse(DEFAULT_JOB_FREQUENCY),
-      appInfo.totalCoreSec
+      appInfo.totalCoreSec,
+      appInfo.isPhoton
     )
   }
 
@@ -1142,7 +1146,8 @@ object QualOutputWriter {
       reformatCSVFunc(appInfo.unSupportedExecs) -> headersAndSizes(UNSUPPORTED_EXECS),
       reformatCSVFunc(appInfo.unSupportedExprs) -> headersAndSizes(UNSUPPORTED_EXPRS),
       appInfo.estimatedFrequency.toString -> headersAndSizes(ESTIMATED_FREQUENCY),
-      appInfo.totalCoreSec.toString -> headersAndSizes(TOTAL_CORE_SEC)
+      appInfo.totalCoreSec.toString -> headersAndSizes(TOTAL_CORE_SEC),
+      appInfo.isPhoton.toString -> headersAndSizes(IS_PHOTON)
     )
 
     if (appInfo.clusterTags.nonEmpty) {

diff --git a/.../src/main/scala/org/apache/spark/sql/rapids/tool/qualification/QualificationAppInfo.scala b/.../src/main/scala/org/apache/spark/sql/rapids/tool/qualification/QualificationAppInfo.scala
@@ -27,7 +27,7 @@ import org.apache.hadoop.conf.Configuration
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
-import org.apache.spark.sql.rapids.tool.{AppBase, AppEventlogProcessException, ClusterSummary, FailureApp, GpuEventLogException, IncorrectAppStatusException, MlOps, MlOpsEventLogType, PhotonEventLogException, SupportedMLFuncsName, ToolUtils}
+import org.apache.spark.sql.rapids.tool.{AppBase, AppEventlogProcessException, ClusterSummary, FailureApp, GpuEventLogException, IncorrectAppStatusException, MlOps, MlOpsEventLogType, SupportedMLFuncsName, ToolUtils}
 import org.apache.spark.sql.rapids.tool.annotation.{Calculated, WallClock}
 import org.apache.spark.sql.rapids.tool.store.StageModel
 
@@ -111,9 +111,6 @@ class QualificationAppInfo(
     if (gpuMode) {
       throw GpuEventLogException()
     }
-    if (isPhoton) {
-      throw PhotonEventLogException()
-    }
   }
 
   override def guestimateAppEndTimeCB(): () => Option[Long] = {
@@ -644,7 +641,7 @@ class QualificationAppInfo(
         perSqlStageSummary.map(_.stageSum).flatten, estimatedInfo, perSqlInfos,
         unSupportedExecs, unSupportedExprs, clusterTags, allClusterTagsMap,
         mlFuncReportInfo.mlFunctionsAndStageInfo, mlFuncReportInfo.mlTotalStageDurations,
-        unsupportedOpsReason, clusterSummary, calculateTotalCoreSec())
+        unsupportedOpsReason, clusterSummary, calculateTotalCoreSec(), isPhoton)
     }
   }
 
@@ -988,6 +985,7 @@ case class QualificationSummaryInfo(
     unsupportedOpsReasons: Map[String, String],
     clusterSummary: ClusterSummary,
     totalCoreSec: Long,
+    isPhoton: Boolean,
     estimatedFrequency: Option[Long] = None)
 
 case class StageQualSummaryInfo(

diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/ToolsPlanGraph.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/ToolsPlanGraph.scala
@@ -16,8 +16,8 @@
 
 package org.apache.spark.sql.rapids.tool.util
 
+import com.nvidia.spark.rapids.tool.planparser.DatabricksParseHelper
 import java.util.concurrent.atomic.AtomicLong
-
 import scala.collection.mutable
 import scala.reflect.runtime.universe._
 
@@ -199,6 +199,7 @@ object ToolsPlanGraph {
   //      This can be achieved by checking for spark properties
   //      spark.databricks.clusterUsageTags.clusterAllTags
   private lazy val dbRuntimeReflection = DBReflectionContainer()
+
   // By default call the Spark constructor. If this fails, we fall back to the DB constructor
   def constructGraphNode(id: Long, name: String, desc: String,
       metrics: collection.Seq[SQLPlanMetric]): SparkPlanGraphNode = {
@@ -272,13 +273,16 @@ object ToolsPlanGraph {
   }
 
   private def buildSparkPlanGraphNode(
-      planInfo: SparkPlanInfo,
+      planInfoRaw: SparkPlanInfo,
       nodeIdGenerator: AtomicLong,
       nodes: mutable.ArrayBuffer[SparkPlanGraphNode],
       edges: mutable.ArrayBuffer[SparkPlanGraphEdge],
       parent: SparkPlanGraphNode,
       subgraph: SparkPlanGraphCluster,
       exchanges: mutable.HashMap[SparkPlanInfo, SparkPlanGraphNode]): Unit = {
+    // Replace Photon node names with Spark node names
+    // TODO: Skip this if app.isPhoton is false
+    val planInfo = DatabricksParseHelper.processPhotonPlan(planInfoRaw).getOrElse(planInfoRaw)
     processPlanInfo(planInfo.nodeName) match {
       case name if name.startsWith("WholeStageCodegen") =>
         val metrics = planInfo.metrics.map { metric =>

diff --git a/core/src/test/resources/QualificationExpectations/complex_dec_expectation.csv b/core/src/test/resources/QualificationExpectations/complex_dec_expectation.csv
@@ -1,2 +1,2 @@
-App Name,App ID,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,SQL Stage Durations Sum,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly), Total Core Seconds
-"Spark shell","local-1626104300434",130285.11,818.88,1500,1469,131104,1315,88.35,"","","","struct<firstname:string,middlename:array<string>,lastname:string>;struct<current:struct<state:string,city:string>,previous:struct<state:map<string,string>,city:string>>;array<struct<city:string,state:string>>;map<string,string>;map<string,array<string>>;map<string,map<string,string>>;array<array<string>>;array<string>","struct<firstname:string,middlename:array<string>,lastname:string>;struct<current:struct<state:string,city:string>,previous:struct<state:map<string,string>,city:string>>;array<struct<city:string,state:string>>;map<string,array<string>>;map<string,map<string,string>>;array<array<string>>","NESTED COMPLEX TYPE",1260,1388,129598,181,1288,false,"CollectLimit","",30,1564
+App Name,App ID,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,SQL Stage Durations Sum,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly),Total Core Seconds,Photon App
+"Spark shell","local-1626104300434",130285.11,818.88,1500,1469,131104,1315,88.35,"","","","struct<firstname:string,middlename:array<string>,lastname:string>;struct<current:struct<state:string,city:string>,previous:struct<state:map<string,string>,city:string>>;array<struct<city:string,state:string>>;map<string,string>;map<string,array<string>>;map<string,map<string,string>>;array<array<string>>;array<string>","struct<firstname:string,middlename:array<string>,lastname:string>;struct<current:struct<state:string,city:string>,previous:struct<state:map<string,string>,city:string>>;array<struct<city:string,state:string>>;map<string,array<string>>;map<string,map<string,string>>;array<array<string>>","NESTED COMPLEX TYPE",1260,1388,129598,181,1288,false,"CollectLimit","",30,1564,false
diff --git a/core/src/test/resources/QualificationExpectations/db_sim_test_expectation.csv b/core/src/test/resources/QualificationExpectations/db_sim_test_expectation.csv
@@ -1,2 +1,2 @@
-App Name,App ID,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,SQL Stage Durations Sum,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly),Total Core Seconds
-"Spark shell","local-1623876083964",64013.8,69843.19,119353,1417661,133857,92667,91.14,"","","","","","",119903,143821,14504,316964,1100697,false,"Scan;SerializeFromObject","",30,1599
+App Name,App ID,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,SQL Stage Durations Sum,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly),Total Core Seconds,Photon App
+"Spark shell","local-1623876083964",64013.8,69843.19,119353,1417661,133857,92667,91.14,"","","","","","",119903,143821,14504,316964,1100697,false,"Scan;SerializeFromObject","",30,1599,false
diff --git a/core/src/test/resources/QualificationExpectations/directory_test_expectation.csv b/core/src/test/resources/QualificationExpectations/directory_test_expectation.csv
@@ -1,2 +1,2 @@
-App Name,App ID,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,SQL Stage Durations Sum,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly),Total Core Seconds
-"Spark shell","local-1623876083964",64013.8,69843.19,119353,1417661,133857,92667,91.14,"","","","","","",119903,143821,14504,316964,1100697,false,"Scan;SerializeFromObject","",30,1599
+App Name,App ID,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,SQL Stage Durations Sum,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly),Total Core Seconds,Photon App
+"Spark shell","local-1623876083964",64013.8,69843.19,119353,1417661,133857,92667,91.14,"","","","","","",119903,143821,14504,316964,1100697,false,"Scan;SerializeFromObject","",30,1599,false
diff --git a/core/src/test/resources/QualificationExpectations/jdbc_expectation.csv b/core/src/test/resources/QualificationExpectations/jdbc_expectation.csv
@@ -1,2 +1,2 @@
-App Name,App ID,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,SQL Stage Durations Sum,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly),Total Core Seconds
-"Spark shell","app-20211019113801-0001",569916.96,2050.03,2942,19894,571967,2814,28.41,"","JDBC[*]","","","","",1812,2883,569025,859,19035,false,"CollectLimit;Scan JDBCRelation(TBLS) [numPartitions=1];Execute CreateViewCommand","",30,9110
+App Name,App ID,Estimated GPU Duration,Estimated GPU Time Saved,SQL DF Duration,SQL Dataframe Task Duration,App Duration,GPU Opportunity,Executor CPU Time Percent,SQL Ids with Failures,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types,Potential Problems,Longest SQL Duration,SQL Stage Durations Sum,NONSQL Task Duration Plus Overhead,Unsupported Task Duration,Supported SQL DF Task Duration,App Duration Estimated,Unsupported Execs,Unsupported Expressions,Estimated Job Frequency (monthly),Total Core Seconds,Photon App
+"Spark shell","app-20211019113801-0001",569916.96,2050.03,2942,19894,571967,2814,28.41,"","JDBC[*]","","","","",1812,2883,569025,859,19035,false,"CollectLimit;Scan JDBCRelation(TBLS) [numPartitions=1];Execute CreateViewCommand","",30,9110,false