Add Tiling Support to All CCT Kernels and Fix CCT Operators on Siracusa Platform for L2 #35

Merged: 4 commits, Feb 17, 2025
Changes from 3 commits
75 changes: 74 additions & 1 deletion .github/workflows/CI.yml
@@ -74,7 +74,7 @@ jobs:
ICCT_ITA_8
miniMobileNet
miniMobileNetv2
CCT
CCT/CCT_16_16_8


### CortexM Tests ###
@@ -212,7 +212,14 @@ jobs:
testBacktracking
testFloatAdder
testFloatGEMM
testFloat2DConvolution
testFloatLayerNorm
testFloatRelu
testFloatMaxPool
testFloatMatmul
testFloatSoftmax
testFloatTranspose
testFloatMul
num-cores: 8

siracusa-models:
@@ -268,6 +275,38 @@ jobs:
{
"name": "testFloatGEMM",
"L1": [8000]
},
{
"name": "testFloat2DConvolution",
"L1": [2000]
},
{
"name": "testFloatLayerNorm",
"L1": [2000]
},
{
"name": "testFloatRelu",
"L1": [2000]
},
{
"name": "testFloatMaxPool",
"L1": [2000]
},
{
"name": "testFloatMatmul",
"L1": [2000]
},
{
"name": "testFloatSoftmax",
"L1": [4000]
},
{
"name": "testFloatTranspose",
"L1": [2000]
},
{
"name": "testFloatMul",
"L1": [2000]
}
]
num-cores: 8
@@ -312,6 +351,38 @@ jobs:
{
"name": "testFloatGEMM",
"L1": [8000]
},
{
"name": "testFloat2DConvolution",
"L1": [4000]
},
{
"name": "testFloatLayerNorm",
"L1": [2000]
},
{
"name": "testFloatRelu",
"L1": [2000]
},
{
"name": "testFloatMaxPool",
"L1": [2000]
},
{
"name": "testFloatMatmul",
"L1": [5000]
},
{
"name": "testFloatSoftmax",
"L1": [8000]
},
{
"name": "testFloatTranspose",
"L1": [2000]
},
{
"name": "testFloatMul",
"L1": [2000]
}
]
num-cores: 8
@@ -342,6 +413,8 @@ jobs:
L1: [64000]
- name: "MLPerf/AnomalyDetection"
L1: [64000]
- name: "CCT/CCT_16_16_8"
L1: [64000]
num-cores:
- 8
uses: ./.github/workflows/TestRunnerTiledSiracusa.yml
21 changes: 20 additions & 1 deletion CHANGELOG.md
@@ -112,4 +112,23 @@ Change main.c to use OUTPUTTYPE instead of float

### Fixed
- MaxPool Padding Extract Pass for float and integer
- Test input, test output, and weight types cast from double to float warning

## Add Tiling Support to All CCT Kernels and Fix CCT Operators on Siracusa Platform for L2

### Added
- Float bindings and tilers for the CCT kernels on the PULP target
- Float Convolution and MaxPool parsers, templates, and kernels with HWC layout and integrated padding
- Tiling constraints for Conv, Gather, and LayerNorm; applied existing constraints to the other kernels
- `profileuntiling` argument
- CCT ONNX tests with image sizes 16 and 32

### Fixed
- CycleMeasure pass for Siracusa untiling profiling
- GEMM tiling constraints (`transA` and `transB` are not supported)
- MatMul layer multi-dimensional input issue
- Add layer for broadcast bias
- Resolved an issue where concatenating float32 literals with the `f` suffix caused `inf` errors during code generation

@@ -25,13 +25,17 @@

from typing import Tuple

from Deeploy.DeeployTypes import CodeTransformationPass, ExecutionBlock, NetworkContext, NodeTemplate
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \
NodeTemplate, _NoVerbosity


class ProfilingCodeGeneration(CodeTransformationPass):

def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock,
name: str) -> Tuple[NetworkContext, ExecutionBlock]:
def apply(self,
ctxt: NetworkContext,
executionBlock: ExecutionBlock,
name: str,
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
executionBlock.addLeft(NodeTemplate("""
uint32_t ${op}_cycles = getCycles();
"""), {"op": name})
2 changes: 2 additions & 0 deletions Deeploy/DeeployTypes.py
@@ -70,6 +70,8 @@ class CodeGenVerbosity:
"""

tilingProfiling: Optional[str] #: str: Specifies the name of the memory level on which to profile tiling
untilingProfiling: Optional[
bool] = None #: bool: Specifies whether to profile untiling


_NoVerbosity = CodeGenVerbosity(None)
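The new `untilingProfiling` field rides along the existing `CodeGenVerbosity` dataclass. As a minimal sketch (the invocation itself is hypothetical; only the field names and defaults come from this diff), a caller opting into both kinds of profiling would construct:

```python
from Deeploy.DeeployTypes import CodeGenVerbosity

# tilingProfiling names the memory level whose tiling is profiled;
# untilingProfiling is a plain on/off switch for the untiling cycle probes.
verbosity = CodeGenVerbosity(tilingProfiling = "L2", untilingProfiling = True)
```

Because `untilingProfiling` defaults to `None`, existing call sites that construct `CodeGenVerbosity(None)` keep working unchanged.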
24 changes: 23 additions & 1 deletion Deeploy/Targets/Generic/Layers.py
@@ -151,12 +151,13 @@ def __init__(self, maps: List[NodeMapper]):

def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
channels_first) -> Tuple[Shape, Shape]:
outputShapes = inputShapes.copy()

if len(inputShapes[0]) > len(inputShapes[1]):
inputShapes[1] = inputShapes[0]
else:
inputShapes[0] = inputShapes[1]

outputShapes = [inputShapes[0]]
return (inputShapes, outputShapes)

def computeOps(self):
@@ -172,6 +173,27 @@ def computeOps(self):
return 2 * self.mapper.parser.operatorRepresentation['M'] * self.mapper.parser.operatorRepresentation[
'N'] * self.mapper.parser.operatorRepresentation['O'] * self.mapper.parser.operatorRepresentation['batch']

def computeShapes(self, inputShapes: Tuple[Shape, Shape], outputShapes: Shape, operatorRepresentation,
channels_first) -> Tuple[Tuple[Shape, Shape], Shape]:

A_shape, B_shape = inputShapes
if len(A_shape) < 2:
A_shape = [1] * (2 - len(A_shape)) + A_shape

if len(B_shape) < 2:
B_shape = B_shape + [1] * (2 - len(B_shape))

if A_shape[-1] != B_shape[-2]:
raise ValueError(f"MatMul requires A.shape[-1] == B.shape[-2], but got {A_shape} and {B_shape}")

if len(A_shape) > len(B_shape):
B_shape = [1] * (len(A_shape) - len(B_shape)) + list(B_shape)

elif len(A_shape) < len(B_shape):
A_shape = [1] * (len(B_shape) - len(A_shape)) + list(A_shape)

return [A_shape, B_shape], outputShapes


class RQMatMulLayer(MatMulLayer):

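The `computeShapes` override added to `MatMulLayer` is what closes the multi-dimensional input issue listed in the CHANGELOG: both operands are padded to at least rank 2, the contraction dimension is validated, and the lower-rank operand's batch dimensions are left-padded with 1s. A self-contained sketch of the same logic in plain Python (not the Deeploy API), with a worked example:

```python
# Mirrors MatMulLayer.computeShapes above: pad operands to rank >= 2,
# validate the contraction dimension, then left-pad the lower-rank
# operand with 1s so the batch dimensions broadcast.
def normalize_matmul_shapes(A_shape, B_shape):
    A, B = list(A_shape), list(B_shape)
    if len(A) < 2:
        A = [1] * (2 - len(A)) + A  # treat a vector as a 1 x N row
    if len(B) < 2:
        B = B + [1] * (2 - len(B))  # treat a vector as an N x 1 column
    if A[-1] != B[-2]:
        raise ValueError(f"MatMul requires A.shape[-1] == B.shape[-2], got {A} and {B}")
    if len(A) > len(B):
        B = [1] * (len(A) - len(B)) + B
    elif len(A) < len(B):
        A = [1] * (len(B) - len(A)) + A
    return A, B

# A batched A against a plain weight matrix B:
assert normalize_matmul_shapes([4, 8, 16], [16, 32]) == ([4, 8, 16], [1, 16, 32])
```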
2 changes: 1 addition & 1 deletion Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py
@@ -25,7 +25,7 @@
from Deeploy.DeeployTypes import NodeTemplate

referenceTemplate = NodeTemplate("""
// GEMM (Name: ${nodeName}, Op: ${nodeOp})
// Matmul (Name: ${nodeName}, Op: ${nodeOp})
BEGIN_SINGLE_CORE
${A_type.typeName} ref_${data_out}_${A} = ${A};
${B_type.typeName} ref_${data_out}_${B} = ${B};
63 changes: 54 additions & 9 deletions Deeploy/Targets/PULPOpen/Bindings.py
@@ -37,17 +37,20 @@
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate
from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.Generic.Templates import ConcatTemplate, FloatGemmTemplate, RQSiGELUTemplate, iHardswishTemplate
from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, GELUChecker, GEMMChecker, HardswishChecker, \
LayerNormChecker, MatMulChecker, MulChecker, ReduceMeanChecker, RQAddChecker, RQHardswishChecker, SliceChecker, \
SoftmaxChecker, TransposeChecker
from Deeploy.Targets.Generic.Templates import ConcatTemplate, FloatGELUTemplate, FloatGemmTemplate, \
FloatLayernormTemplate, FloatMatMulTemplate, FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \
GatherTemplate, RQSiGELUTemplate, iHardswishTemplate
from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, ConvChecker, GatherChecker, GELUChecker, GEMMChecker, \
HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, ReduceMeanChecker, ReluChecker, RQAddChecker, \
RQHardswishChecker, SliceChecker, SoftmaxChecker, TransposeChecker
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture
from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \
MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, RQAddTemplate, RQSiHardswishTemplate, SliceTemplate, \
TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate
from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatConvTemplate, FloatMaxPoolTemplate, GEMMTemplate, \
MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, RQAddTemplate, \
RQSiHardswishTemplate, SliceTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \
iRMSNormTemplate, iSoftmaxTemplate
from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \
PULPRequantShiftChecker
from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement
@@ -204,6 +207,13 @@
ForkTransformer)
]

PULPFloatConv2DBindings = [
NodeBinding(
ConvChecker([PointerClass(float32_t), PointerClass(float32_t),
PointerClass(float32_t)], [PointerClass(float32_t)]), FloatConvTemplate.reference2DTemplate,
ForkTransformer)
]

PULPRQSMatrixVecBindings = [
NodeBinding(
PULPLinearChecker([PointerClass(type1),
@@ -227,6 +237,9 @@
PULPMaxPool2DBindings = [
NodeBinding(PULPMaxPoolChecker([PointerClass(type)], [PointerClass(type)]),
MaxPool2DTemplate.PULPMaxPool2D_8_Template, ForkTransformer) for type in [int8_t, uint8_t]
] + [
NodeBinding(PULPMaxPoolChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatMaxPoolTemplate.referenceTemplate, ForkTransformer)
]

PULPConv1DBinding = NodeBinding(
@@ -241,8 +254,13 @@
PointerClass(int32_t),
PointerClass(int32_t)], [PointerClass(int8_t)]), ConvTemplate.PULPDWConv1D_8_Template, ForkTransformer)

PULPMatMulBinding = NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
GEMMTemplate.PULPMM_8_Template, ClusterTransformer)
PULPMatMulBindings = [
NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]),
GEMMTemplate.PULPMM_8_Template, ClusterTransformer)
] + [
NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatMatMulTemplate.referenceTemplate, ClusterTransformer)
]

PULPReduceMeanBindings = [
NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate,
@@ -271,11 +289,17 @@
PULPSoftmaxBindings = [
NodeBinding(SoftmaxChecker([PointerClass(_type)], [PointerClass(uint8_t)]), iSoftmaxTemplate.referenceTemplate,
ForkTransformer) for _type in [int8_t, uint8_t]
] + [
NodeBinding(SoftmaxChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatSoftmaxTemplate.referenceTemplate, ForkTransformer)
]

PULPTransposeBindings = [
NodeBinding(TransposeChecker([PointerClass(type)], [PointerClass(type)]), TransposeTemplate.referenceTemplate,
ForkTransformer) for type in IntegerDataTypes
] + [
NodeBinding(TransposeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
TransposeTemplate.referenceTemplate, ForkTransformer)
]

PULPConcatBindings = [
@@ -314,4 +338,25 @@
NodeBinding(MulChecker([PointerClass(typeA), PointerClass(typeB)], [PointerClass(int32_t)]),
MulTemplate.referenceTemplate, ForkTransformer)
for typeA, typeB in itertools.product(SignedIntegerDataTypes, SignedIntegerDataTypes)
] + [
NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatMulTemplate.referenceTemplate, ForkTransformer)
]

PULPReluBinding = NodeBinding(ReluChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatReluTemplate.referenceTemplate, ForkTransformer)

PULPLayernormBinding = NodeBinding(
LayerNormChecker(
[PointerClass(float32_t), PointerClass(float32_t),
PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate,
ForkTransformer)

PULPFloatGELUBinding = NodeBinding(
GELUChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatGELUTemplate.referenceTemplate, ForkTransformer)

PULPGatherBindings = [
NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]),
GatherTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes
]
@@ -25,6 +25,7 @@

from typing import Tuple

from Deeploy.CommonExtensions.CodeTransformationPasses.CycleMeasurement import ProfilingCodeGeneration
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity

from .PULPClusterTilingDB import ProfilingPULPClusterTilingGenerationDB, PULPClusterTilingGenerationDB
@@ -38,6 +39,7 @@ def __init__(self, targetMemLevel: str):
self.profilingSB = ProfilingPULPClusterTilingGenerationSB(targetMemLevel)
self.DB = PULPClusterTilingGenerationDB(targetMemLevel)
self.profilingDB = ProfilingPULPClusterTilingGenerationDB(targetMemLevel)
self.profiluntiling = ProfilingCodeGeneration()

def apply(self,
ctxt: NetworkContext,
@@ -52,4 +54,7 @@ def apply(self,
ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)

if verbose.untilingProfiling:
ctxt, executionBlock = self.profiluntiling.apply(ctxt, executionBlock, name)

return ctxt, executionBlock
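
Taken together with the `ProfilingCodeGeneration` pass shown earlier, the effect is that untiling probes are emitted only when the caller asks for them. A hedged sketch of the flow (the node name and the pre-existing `ctxt`/`executionBlock` objects are hypothetical; the `apply` signature and the cycle-probe template come from this diff):

```python
# Hypothetical driver snippet: ctxt and executionBlock would come from an
# existing Deeploy lowering pipeline and are not constructed here.
verbosity = CodeGenVerbosity(tilingProfiling = None, untilingProfiling = True)

tiling = PULPClusterTiling("L1")  # targetMemLevel, as in __init__ above
ctxt, executionBlock = tiling.apply(ctxt, executionBlock, "Conv0", verbose = verbosity)

# ProfilingCodeGeneration.apply() prepends its template, so the generated C
# for the node begins with a cycle probe along the lines of:
#     uint32_t Conv0_cycles = getCycles();
```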