Add Tiling Support to All CCT Kernels and Fix CCT Operators on Siracusa Platform for L2 #35

Merged: 4 commits, Feb 17, 2025
Changes from 3 commits
75 changes: 74 additions & 1 deletion .github/workflows/CI.yml
@@ -74,7 +74,7 @@ jobs:
ICCT_ITA_8
miniMobileNet
miniMobileNetv2
CCT
CCT/CCT_16_16_8


### CortexM Tests ###
@@ -212,7 +212,14 @@ jobs:
testBacktracking
testFloatAdder
testFloatGEMM
testFloat2DConvolution
testFloatLayerNorm
testFloatRelu
testFloatMaxPool
testFloatMatmul
testFloatSoftmax
testFloatTranspose
testFloatMul
num-cores: 8

siracusa-models:
@@ -268,6 +275,38 @@ jobs:
{
"name": "testFloatGEMM",
"L1": [8000]
},
{
"name": "testFloat2DConvolution",
"L1": [2000]
},
{
"name": "testFloatLayerNorm",
"L1": [2000]
},
{
"name": "testFloatRelu",
"L1": [2000]
},
{
"name": "testFloatMaxPool",
"L1": [2000]
},
{
"name": "testFloatMatmul",
"L1": [2000]
},
{
"name": "testFloatSoftmax",
"L1": [4000]
},
{
"name": "testFloatTranspose",
"L1": [2000]
},
{
"name": "testFloatMul",
"L1": [2000]
}
]
num-cores: 8
@@ -312,6 +351,38 @@ jobs:
{
"name": "testFloatGEMM",
"L1": [8000]
},
{
"name": "testFloat2DConvolution",
"L1": [4000]
},
{
"name": "testFloatLayerNorm",
"L1": [2000]
},
{
"name": "testFloatRelu",
"L1": [2000]
},
{
"name": "testFloatMaxPool",
"L1": [2000]
},
{
"name": "testFloatMatmul",
"L1": [5000]
},
{
"name": "testFloatSoftmax",
"L1": [8000]
},
{
"name": "testFloatTranspose",
"L1": [2000]
},
{
"name": "testFloatMul",
"L1": [2000]
}
]
num-cores: 8
@@ -342,6 +413,8 @@ jobs:
L1: [64000]
- name: "MLPerf/AnomalyDetection"
L1: [64000]
- name: "CCT/CCT_16_16_8"
L1: [64000]
num-cores:
- 8
uses: ./.github/workflows/TestRunnerTiledSiracusa.yml
21 changes: 20 additions & 1 deletion CHANGELOG.md
@@ -112,4 +112,23 @@ Change main.c to use OUTPUTTYPE instead of float

### Fixed
- MaxPool Padding Extract Pass for float and integer
- Test input, test output, and weight types cast from double to float warning

## Add Tiling Support to All CCT Kernels and Fix CCT Operators on Siracusa Platform for L2

### Added
- Float bindings and tilers for the CCT kernels on the PULP target
- Float Convolution and MaxPool parsers, templates, and kernels with HWC layout and integrated padding
- Tiling constraints for Conv, Gather, and LayerNorm; applied existing constraints to the other kernels
- `profileuntiling` argument
- CCT ONNX tests with image sizes 16 and 32

### Fixed
- CycleMeasure pass for Siracusa untiling profiling
- GEMM tiling constraints (`transA` and `transB` are not supported)
- MatMul layer multi-dimensional input issue
- Add layer for broadcast bias
- Resolved an issue where concatenating float32 literals with the `f` suffix caused `inf` errors during code generation

@@ -25,13 +25,17 @@

from typing import Tuple

from Deeploy.DeeployTypes import CodeTransformationPass, ExecutionBlock, NetworkContext, NodeTemplate
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \
NodeTemplate, _NoVerbosity


class ProfilingCodeGeneration(CodeTransformationPass):

def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
name: str) -> Tuple[NetworkContext, ExecutionBlock]:
def apply(self,
ctxt: NetworkContext,
executionBlock: ExecutionBlock,
name: str,
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
executionBlock.addLeft(NodeTemplate("""
uint32_t ${op}_cycles = getCycles();
"""), {"op": name})
2 changes: 2 additions & 0 deletions Deeploy/DeeployTypes.py
@@ -70,6 +70,8 @@ class CodeGenVerbosity:
"""

tilingProfiling: Optional[str] #: str: Specifies the name of the memory level on which to profile tiling
untilingProfiling: Optional[
bool] = None #: bool: Specifies whether to profile untiling


_NoVerbosity = CodeGenVerbosity(None)
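The new `untilingProfiling` field rides along the existing `CodeGenVerbosity` dataclass. As a minimal sketch (the invocation itself is hypothetical; only the field names and defaults come from this diff), a caller opting into both kinds of profiling would construct:

```python
from Deeploy.DeeployTypes import CodeGenVerbosity

# tilingProfiling names the memory level whose tiling is profiled;
# untilingProfiling is a plain on/off switch for the untiling cycle probes.
verbosity = CodeGenVerbosity(tilingProfiling = "L2", untilingProfiling = True)
```

Because `untilingProfiling` defaults to `None`, existing call sites that construct `CodeGenVerbosity(None)` keep working unchanged.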
24 changes: 23 additions & 1 deletion Deeploy/Targets/Generic/Layers.py
@@ -151,12 +151,13 @@ def __init__(self, maps: List[NodeMapper]):

def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
channels_first) -> Tuple[Shape, Shape]:
outputShapes = inputShapes.copy()

if len(inputShapes[0]) > len(inputShapes[1]):
inputShapes[1] = inputShapes[0]
else:
inputShapes[0] = inputShapes[1]

outputShapes = [inputShapes[0]]
return (inputShapes, outputShapes)

def computeOps(self):
@@ -172,6 +173,27 @@ def computeOps(self):
return 2 * self.mapper.parser.operatorRepresentation['M'] * self.mapper.parser.operatorRepresentation[
'N'] * self.mapper.parser.operatorRepresentation['O'] * self.mapper.parser.operatorRepresentation['batch']

def computeShapes(self, inputShapes: Tuple[Shape, Shape], outputShapes: Shape, operatorRepresentation,
channels_first) -> Tuple[Tuple[Shape, Shape], Shape]:

A_shape, B_shape = inputShapes
if len(A_shape) < 2:
A_shape = [1] * (2 - len(A_shape)) + A_shape

if len(B_shape) < 2:
B_shape = B_shape + [1] * (2 - len(B_shape))

if A_shape[-1] != B_shape[-2]:
raise ValueError(f"MatMul requires A.shape[-1] == B.shape[-2], but got {A_shape} and {B_shape}")

if len(A_shape) > len(B_shape):
B_shape = [1] * (len(A_shape) - len(B_shape)) + list(B_shape)

elif len(A_shape) < len(B_shape):
A_shape = [1] * (len(B_shape) - len(A_shape)) + list(A_shape)

return [A_shape, B_shape], outputShapes


class RQMatMulLayer(MatMulLayer):

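The `computeShapes` override added to `MatMulLayer` is what closes the multi-dimensional input issue listed in the CHANGELOG: both operands are padded to at least rank 2, the contraction dimension is validated, and the lower-rank operand's batch dimensions are left-padded with 1s. A self-contained sketch of the same logic in plain Python (not the Deeploy API), with a worked example:

```python
# Mirrors MatMulLayer.computeShapes above: pad operands to rank >= 2,
# validate the contraction dimension, then left-pad the lower-rank
# operand with 1s so the batch dimensions broadcast.
def normalize_matmul_shapes(A_shape, B_shape):
    A, B = list(A_shape), list(B_shape)
    if len(A) < 2:
        A = [1] * (2 - len(A)) + A  # treat a vector as a 1 x N row
    if len(B) < 2:
        B = B + [1] * (2 - len(B))  # treat a vector as an N x 1 column
    if A[-1] != B[-2]:
        raise ValueError(f"MatMul requires A.shape[-1] == B.shape[-2], got {A} and {B}")
    if len(A) > len(B):
        B = [1] * (len(A) - len(B)) + B
    elif len(A) < len(B):
        A = [1] * (len(B) - len(A)) + A
    return A, B

# A batched A against a plain weight matrix B:
assert normalize_matmul_shapes([4, 8, 16], [16, 32]) == ([4, 8, 16], [1, 16, 32])
```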
2 changes: 1 addition & 1 deletion Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py
@@ -25,7 +25,7 @@
from Deeploy.DeeployTypes import NodeTemplate

referenceTemplate = NodeTemplate("""
// GEMM (Name: ${nodeName}, Op: ${nodeOp})
// Matmul (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
${A_type.typeName} ref_${data_out}_${A} = ${A};
${B_type.typeName} ref_${data_out}_${B} = ${B};
63 changes: 54 additions & 9 deletions Deeploy/Targets/PULPOpen/Bindings.py
@@ -37,17 +37,20 @@
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate
from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.Generic.Templates import ConcatTemplate, FloatGemmTemplate, RQSiGELUTemplate, iHardswishTemplate
from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, GELUChecker, GEMMChecker, HardswishChecker, \
LayerNormChecker, MatMulChecker, MulChecker, ReduceMeanChecker, RQAddChecker, RQHardswishChecker, SliceChecker, \
SoftmaxChecker, TransposeChecker
from Deeploy.Targets.Generic.Templates import ConcatTemplate, FloatGELUTemplate, FloatGemmTemplate, \
FloatLayernormTemplate, FloatMatMulTemplate, FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \
GatherTemplate, RQSiGELUTemplate, iHardswishTemplate
from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, ConvChecker, GatherChecker, GELUChecker, GEMMChecker, \
HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, ReduceMeanChecker, ReluChecker, RQAddChecker, \
RQHardswishChecker, SliceChecker, SoftmaxChecker, TransposeChecker
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture
from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \
MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, RQAddTemplate, RQSiHardswishTemplate, SliceTemplate, \
TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate
from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatConvTemplate, FloatMaxPoolTemplate, GEMMTemplate, \
MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, RQAddTemplate, \
RQSiHardswishTemplate, SliceTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \
iRMSNormTemplate, iSoftmaxTemplate
from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \
PULPRequantShiftChecker
from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement
@@ -204,6 +207,13 @@
ForkTransformer)
]

PULPFloatConv2DBindings = [
NodeBinding(
ConvChecker([PointerClass(float32_t), PointerClass(float32_t),
PointerClass(float32_t)], [PointerClass(float32_t)]), FloatConvTemplate.reference2DTemplate,
ForkTransformer)
]

PULPRQSMatrixVecBindings = [
NodeBinding(
PULPLinearChecker([PointerClass(type1),
@@ -227,6 +237,9 @@
PULPMaxPool2DBindings = [
NodeBinding(PULPMaxPoolChecker([PointerClass(type)], [PointerClass(type)]),
MaxPool2DTemplate.PULPMaxPool2D_8_Template, ForkTransformer) for type in [int8_t, uint8_t]
] + [
NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatMaxPoolTemplate.referenceTemplate, ForkTransformer)
]

PULPConv1DBinding = NodeBinding(
@@ -241,8 +254,13 @@
PointerClass(int32_t),
PointerClass(int32_t)], [PointerClass(int8_t)]), ConvTemplate.PULPDWConv1D_8_Template, ForkTransformer)

PULPMatMulBinding = NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
GEMMTemplate.PULPMM_8_Template, ClusterTransformer)
PULPMatMulBindings = [
NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
GEMMTemplate.PULPMM_8_Template, ClusterTransformer)
] + [
NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatMatMulTemplate.referenceTemplate, ClusterTransformer)
]

PULPReduceMeanBindings = [
NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate,
@@ -271,11 +289,17 @@
PULPSoftmaxBindings = [
NodeBinding(SoftmaxChecker([PointerClass(_type)], [PointerClass(uint8_t)]), iSoftmaxTemplate.referenceTemplate,
ForkTransformer) for _type in [int8_t, uint8_t]
] + [
NodeBinding(SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatSoftmaxTemplate.referenceTemplate, ForkTransformer)
]

PULPTransposeBindings = [
NodeBinding(TransposeChecker([PointerClass(type)], [PointerClass(type)]), TransposeTemplate.referenceTemplate,
ForkTransformer) for type in IntegerDataTypes
] + [
NodeBinding(TransposeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
TransposeTemplate.referenceTemplate, ForkTransformer)
]

PULPConcatBindings = [
@@ -314,4 +338,25 @@
NodeBinding(MulChecker([PointerClass(typeA), PointerClass(typeB)], [PointerClass(int32_t)]),
MulTemplate.referenceTemplate, ForkTransformer)
for typeA, typeB in itertools.product(SignedIntegerDataTypes, SignedIntegerDataTypes)
] + [
NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatMulTemplate.referenceTemplate, ForkTransformer)
]

PULPReluBinding = NodeBinding(ReluChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatReluTemplate.referenceTemplate, ForkTransformer)

PULPLayernormBinding = NodeBinding(
LayerNormChecker(
[PointerClass(float32_t), PointerClass(float32_t),
PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate,
ForkTransformer)

PULPFloatGELUBinding = NodeBinding(
GELUChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatGELUTemplate.referenceTemplate, ForkTransformer)

PULPGatherBindings = [
NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]),
GatherTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes
]
@@ -25,6 +25,7 @@

from typing import Tuple

from Deeploy.CommonExtensions.CodeTransformationPasses.CycleMeasurement import ProfilingCodeGeneration
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity

from .PULPClusterTilingDB import ProfilingPULPClusterTilingGenerationDB, PULPClusterTilingGenerationDB
@@ -38,6 +39,7 @@ def __init__(self, targetMemLevel: str):
self.profilingSB = ProfilingPULPClusterTilingGenerationSB(targetMemLevel)
self.DB = PULPClusterTilingGenerationDB(targetMemLevel)
self.profilingDB = ProfilingPULPClusterTilingGenerationDB(targetMemLevel)
self.profiluntiling = ProfilingCodeGeneration()

def apply(self,
ctxt: NetworkContext,
@@ -52,4 +54,7 @@ def apply(self,
ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)

if verbose.untilingProfiling:
ctxt, executionBlock = self.profiluntiling.apply(ctxt, executionBlock, name)

return ctxt, executionBlock
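
Taken together with the `ProfilingCodeGeneration` pass shown earlier, the effect is that untiling probes are emitted only when the caller asks for them. A hedged sketch of the flow (the node name and the pre-existing `ctxt`/`executionBlock` objects are hypothetical; the `apply` signature and the cycle-probe template come from this diff):

```python
# Hypothetical driver snippet: ctxt and executionBlock would come from an
# existing Deeploy lowering pipeline and are not constructed here.
verbosity = CodeGenVerbosity(tilingProfiling = None, untilingProfiling = True)

tiling = PULPClusterTiling("L1")  # targetMemLevel, as in __init__ above
ctxt, executionBlock = tiling.apply(ctxt, executionBlock, "Conv0", verbose = verbosity)

# ProfilingCodeGeneration.apply() prepends its template, so the generated C
# for the node begins with a cycle probe along the lines of:
#     uint32_t Conv0_cycles = getCycles();
```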