diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cdb588d57..418f128427 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- Refactors and fixes [#131](https://github.com/pulp-platform/Deeploy/pull/131) - Disallow shape inference [#128](https://github.com/pulp-platform/Deeploy/pull/128) - Remove memory-aware node bindings [#123](https://github.com/pulp-platform/Deeploy/pull/123) - Fix missing const's layout transformation and refactor NCHWtoNHWC passes [#122](https://github.com/pulp-platform/Deeploy/pull/122) @@ -91,6 +92,8 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Removed Wmem variants of bindings and tile constraints from Neureka - Disabled ICCT_ITA_8 MemPool test because it was using a lowering that created shapeless tensors - Added missing shape annotation to the testTypeInferenceDifferentTypes +- ref naming scheme various templates from `${data_out}_${}` to `${nodeName}_${}` +- move iNoNorm from Generic to Snitch since it uses a Snitch kernel ### Fixed - Prevent node duplication for graphs generated via GraphSurgeon @@ -105,6 +108,8 @@ This file contains the changelog for the Deeploy project. 
The changelog is divid - Missing layout transformation of the const's (bias, mul, add, shift in Conv/RequantizedConv) - Keep mul/add rank of requantized Neureka tile constraints - Fix bias hoisting in generic GEMM with no bias +- formatting of test_input/output integer values +- pulp rqs tile constraints now properly target the last dimension of rqs params ### Removed - Delete outdated and unused `.gitlab-ci.yml` file @@ -180,9 +185,9 @@ This release containing major architectural changes, new platform support, enhan ### Added -- BatchNorm kernel -- ConvTranspose kernel -- MaxPool1D kernel +- BatchNorm kernel +- ConvTranspose kernel +- MaxPool1D kernel - Template for 1D Convolution - Support for float32 data type in the previous kernels - Float binding for Pad1D kernel @@ -321,7 +326,7 @@ This release containing major architectural changes, new platform support, enhan ### Changed - FloatConvTemplate file -- Platform.py file +- Platform.py file - Bump the CMake version to 3.24 as required for the chimera-sdk - Bump GVSoC's version and add chimera simulation target - Rename the generic source util to utils to avoid name collision with chimera-sdk diff --git a/Deeploy/AbstractDataTypes.py b/Deeploy/AbstractDataTypes.py index feeebe939b..0e8d4a0715 100644 --- a/Deeploy/AbstractDataTypes.py +++ b/Deeploy/AbstractDataTypes.py @@ -206,12 +206,20 @@ def checkValue(cls, value: Union[int, Iterable[int], np.ndarray], ctxt: Optional if isinstance(value, int): _max, _min = (value, value) + elif isinstance(value, np.number): + value = value.item() + if isinstance(value, float): + assert value.is_integer(), f"Floating-point value {value} is not an integer." 
+ value = int(value) + _max, _min = (value, value) elif isinstance(value, np.ndarray): _max = value.max() _min = value.min() elif isinstance(value, Iterable): _max = max(value) _min = min(value) + else: + raise ValueError(f"Unsupported value of type {type(value)} with value {value}") if _max > cls.typeMax: return False diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index 4f6dba3827..c05ea3b9d9 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -87,11 +87,11 @@ class float64_t(FloatImmediate): SignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (int8_t, int16_t, int32_t, int64_t) UnsignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (uint8_t, uint16_t, uint32_t, uint64_t) -IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (sorted(( - *SignedIntegerDataTypes, - *UnsignedIntegerDataTypes, -), - key = lambda _type: _type.typeWidth)) +IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = tuple( + sorted(( + *SignedIntegerDataTypes, + *UnsignedIntegerDataTypes, + ), key = lambda _type: _type.typeWidth)) FloatDataTypes: Tuple[Type[FloatImmediate], ...] 
= (bfloat16_t, float16_t, float32_t, float64_t) diff --git a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py index c70628729b..5e38af0aa2 100644 --- a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py +++ b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py @@ -2,7 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from abc import ABC, abstractmethod +from typing import List import onnx_graphsurgeon as gs @@ -11,27 +12,28 @@ from Deeploy.Logging import DEFAULT_LOGGER as log -class SignPropTypeChecker(NodeTypeChecker): +class SignPropTypeChecker(NodeTypeChecker, ABC): + @abstractmethod def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - return None + operatorRepresentation: OperatorRepresentation) -> List[int]: + pass + @abstractmethod def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - return None + operatorRepresentation: OperatorRepresentation) -> List[bool]: + pass def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: ctxt = super().typeInferGlobalCtxt(ctxt, node) - for inputNode, _type in zip(node.inputs, self.input_types): - if isinstance(ctxt.lookup(inputNode.name), ConstantBuffer): - reference = ctxt.lookup(inputNode.name) - if not _type.referencedType.checkPromotion(reference.values): - raise Exception(f"Can't cast {reference} to {_type}!") - - reference.nLevels = reference.values.max() - reference.values.min() - reference._signed = _type.referencedType.typeMin < 0 + for tensor, ty in zip(node.inputs, self.input_types): + buffer = ctxt.lookup(tensor.name) + if isinstance(buffer, ConstantBuffer): + refTy = ty.referencedType + assert refTy.checkPromotion(buffer.values), f"Can't cast {buffer} to {ty}!" 
+ buffer.nLevels = buffer.values.max() - buffer.values.min() + buffer._signed = refTy.typeMin < 0 return ctxt @@ -42,21 +44,16 @@ def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node, inputs = [ctxt.lookup(inputNode.name) for inputNode in node.inputs] outputs = [ctxt.lookup(outputNode.name) for outputNode in node.outputs] - signProp = all([hasattr(_input, "_signed") and hasattr(_input, "nLevels") for _input in inputs]) - - if signProp: - nLevels = self._inferNumLevels(inputs, operatorRepresentation) - signedness = self._inferSignedness(inputs, operatorRepresentation) - - if nLevels is None or signedness is None: - return ctxt - for obj, nLevel, sign in zip(outputs, nLevels, signedness): - obj.nLevels = nLevel - obj._signed = sign - - if issubclass(obj._type.referencedType, IntegerImmediate) and not obj._type.fitsNumLevels(nLevel): - log.warning( - f"{obj.name} has {nLevel} levels, but {obj._type.referencedType.typeName} only supports {obj._type.referencedType.nLevels} levels." - ) + nLevels = self._inferNumLevels(inputs, operatorRepresentation) + signedness = self._inferSignedness(inputs, operatorRepresentation) + + for obj, nLevels, sign in zip(outputs, nLevels, signedness): + assert isinstance(obj, VariableBuffer) + obj.nLevels = nLevels + obj._signed = sign + refTy = obj._type.referencedType + if issubclass(refTy, IntegerImmediate) and not refTy.fitsNumLevels(nLevels): + log.warning( + f"{obj.name} has {nLevels} levels, but {refTy.typeName} only supports {refTy.nLevels} levels.") return ctxt diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 8c2f5d2485..560b82eebb 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -251,8 +251,8 @@ def __init__(self, name: str = '', shape = [1], aliases: Optional[List[str]] = N self._live: bool = False #: bool: DO NOT OVERRIDE - this variable is true if a previous Memory allocation pass has allocated the buffer, and false if this buffer has been deallocated or has not been allocated 
yet. self._deploy: bool = True #: bool: MAY OVERRIDE - this variable is a global switch to deactivate the buffer for all purposes without deleting it outright. - self._signed = None - self.nLevels = None + self._signed: bool = None + self.nLevels: int = None self.is_input: bool = False self.is_output: bool = False @@ -1009,9 +1009,10 @@ def annotateType(self, name: str, _type: Type[Pointer]): VariableBuffer with """ - obj = self.lookup(name) - obj._type = _type - obj._instance = _type(name, ctxt = self) + buffer = self.lookup(name) + assert isinstance(buffer, VariableBuffer) + buffer._type = _type + buffer._instance = _type(name, ctxt = self) def copy(self) -> NetworkContext: """Return a shallow copy of this NetworkContext @@ -1312,14 +1313,12 @@ def typeCheckNodeInputs(self, ctxt: NetworkContext, node: gs.Node) -> bool: return retCheck def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: - for inputNode, _type in zip(node.inputs, self.input_types): - if isinstance(ctxt.lookup(inputNode.name), ConstantBuffer): - reference = ctxt.lookup(inputNode.name) - if not _type.referencedType.checkPromotion(reference.values): - raise Exception(f"Can't cast {reference} to {_type}!") - - ctxt.annotateType(inputNode.name, _type) - + for tensor, ty in zip(node.inputs, self.input_types): + buffer = ctxt.lookup(tensor.name) + if isinstance(buffer, ConstantBuffer): + if not ty.referencedType.checkPromotion(buffer.values): + raise Exception(f"Can't cast {buffer} to {ty}!") + ctxt.annotateType(tensor.name, ty) return ctxt def annotateDict(self, ctxt: NetworkContext, node: gs.Node, operatorRepresentation: OperatorRepresentation): @@ -2695,7 +2694,7 @@ def parse(self, default_channels_first: bool = True) -> bool: f" - Deepest layer available mappers: {[type(x.parser).__name__ for x in deepestLayer.maps]}") log.error("=" * 80) raise RuntimeError( - f'Did not find adequate mapping for graph! 
Explored until layer {deepestLayer.__class__.__name__} of node {deepestNodeName}' + f'Did not find adequate mapping for graph! Explored until layer {deepestLayer.__class__.__name__} of node {deepestNodeName}\n' f'Candidates: {[type(x.parser).__name__ for x in deepestLayer.maps]}. Exhausted backtracking.') previousLayer = scheduledLayerList[idx - 1] @@ -3365,6 +3364,11 @@ def _mangleNodeNames(self): idx = seen.get(orig, 0) node.name = f"{orig}_{idx}" seen[orig] = idx + 1 + # Handle empty node name + elif orig == "": + idx = seen.get(orig, 0) + node.name = f"{node.op}_{idx}" + seen[orig] = idx + 1 # else: unique name, leave it unchanged # Don't override this diff --git a/Deeploy/Targets/CortexM/Templates/GEMMTemplate.py b/Deeploy/Targets/CortexM/Templates/GEMMTemplate.py index d82704cdcf..5589396366 100644 --- a/Deeploy/Targets/CortexM/Templates/GEMMTemplate.py +++ b/Deeploy/Targets/CortexM/Templates/GEMMTemplate.py @@ -31,14 +31,14 @@ def alignToContext(self, ctxt: NetworkContext, Linear_8_Template = _GEMM_8_Template(""" // GEMM -int8_t* ref_${data_out}_${A} = ${A}; -int8_t* ref_${data_out}_${B} = ${B}; -int8_t* ref_${data_out}_${data_out} = ${data_out}; +int8_t* ref_${nodeName}_${A} = ${A}; +int8_t* ref_${nodeName}_${B} = ${B}; +int8_t* ref_${nodeName}_${data_out} = ${data_out}; for(int i=0;i<${batch};i++){ - arm_fully_connected_s8(&${ctxt}, &${fc_params}, &${quant_params}, &${input_dims}, ref_${data_out}_${A}, &${filter_dims}, ref_${data_out}_${B}, &${bias_dims}, ${C}, &${output_dims}, ref_${data_out}_${data_out}); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + arm_fully_connected_s8(&${ctxt}, &${fc_params}, &${quant_params}, &${input_dims}, ref_${nodeName}_${A}, &${filter_dims}, ref_${nodeName}_${B}, &${bias_dims}, ${C}, &${output_dims}, ref_${nodeName}_${data_out}); + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${data_out} += ${M} * 
${O}; } """) diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index c924895c13..97e833f489 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -7,7 +7,7 @@ import numpy as np -from Deeploy.DeeployTypes import NodeMapper, ONNXLayer, OperatorRepresentation, Shape +from Deeploy.DeeployTypes import NodeMapper, ONNXLayer, Shape class ConcatLayer(ONNXLayer): @@ -64,23 +64,6 @@ def __init__(self, maps: List[NodeMapper]): super().__init__(maps) -class iNoNormLayer(ONNXLayer): - - def __init__(self, maps: List[NodeMapper]): - super().__init__(maps) - - def computeOps(self): - return self.mapper.parser.operatorRepresentation['size'] * 4 # 2 mul, 1 add, 1 right shift - - def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation: OperatorRepresentation, - channels_first: bool) -> Tuple[Shape]: - - # JUNGVI: Broadcast the weights and bias to have as many dimensions as the inputs - inputShapes[1] = [1] * (len(inputShapes[0]) - len(inputShapes[1])) + list(inputShapes[1]) - inputShapes[2] = inputShapes[1] - return (inputShapes, outputShapes) - - class RQSiGELULayer(GELULayer): def __init__(self, maps: List[NodeMapper]): diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index f63bb5411d..87840f1f49 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -227,20 +227,25 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - ret = super().parseNode(node) - wellFormed = False - if ret: - pads = self.operatorRepresentation['pads'] - kernel_shape = self.operatorRepresentation['kernel_shape'] - strides = self.operatorRepresentation['strides'] - # 1D: pads should be length 2, kernel_shape length 1, strides length 1 - if len(pads) == 2 and len(kernel_shape) == 1 and len(strides) == 1: - wellFormed = True - self.operatorRepresentation['padding_y'] = int(pads[0]) - 
self.operatorRepresentation['padding_y_right'] = int(pads[1]) - self.operatorRepresentation['stride_y'] = int(strides[0]) - self.operatorRepresentation['dim_kernel_y'] = int(kernel_shape[0]) - return wellFormed + if not super().parseNode(node): + return False + + pads = self.operatorRepresentation['pads'] + kernel_shape = self.operatorRepresentation['kernel_shape'] + strides = self.operatorRepresentation['strides'] + + if not all([ + len(pads) == 2, + len(kernel_shape) == 1, + len(strides) == 1, + ]): + return False + + self.operatorRepresentation['padding_y'] = pads[0] + self.operatorRepresentation['padding_y_right'] = pads[1] + self.operatorRepresentation['stride_y'] = strides[0] + self.operatorRepresentation['dim_kernel_y'] = kernel_shape[0] + return True def parseNodeCtxt(self, ctxt, node, channels_first = True): newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) @@ -269,28 +274,31 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: + if not super().parseNode(node): + return False - ret = super().parseNode(node) - wellFormed = False - if ret: - pads = self.operatorRepresentation['pads'] - kernel_shape = self.operatorRepresentation['kernel_shape'] - strides = self.operatorRepresentation['strides'] - if len(pads) == 4 and len(kernel_shape) == 2 and len(strides) == 2: - wellFormed = True + pads = self.operatorRepresentation['pads'] + kernel_shape = self.operatorRepresentation['kernel_shape'] + strides = self.operatorRepresentation['strides'] - self.operatorRepresentation['padding_x'] = int(self.operatorRepresentation['pads'][0]) - self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][1]) - self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][0]) - self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][1]) - self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][2]) - 
self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][3]) - self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) - self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) - self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) - self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) + if not all([ + len(pads) == 4, + len(kernel_shape) == 2, + len(strides) == 2, + ]): + return False - return wellFormed + self.operatorRepresentation['padding_x'] = pads[0] + self.operatorRepresentation['padding_y'] = pads[1] + self.operatorRepresentation['padding_x_left'] = pads[0] + self.operatorRepresentation['padding_y_top'] = pads[1] + self.operatorRepresentation['padding_x_right'] = pads[2] + self.operatorRepresentation['padding_y_bottom'] = pads[3] + self.operatorRepresentation['stride_x'] = strides[0] + self.operatorRepresentation['stride_y'] = strides[1] + self.operatorRepresentation['dim_kernel_x'] = kernel_shape[0] + self.operatorRepresentation['dim_kernel_y'] = kernel_shape[1] + return True def parseNodeCtxt(self, ctxt: NetworkContext, @@ -837,41 +845,6 @@ def parseNodeCtxt(self, return ctxt, True -class iNoNormParser(NodeParser): - - def __init__(self): - super().__init__() - - def parseNode(self, node: gs.Node) -> bool: - - ret = all(['D' in node.attrs, 'mul' in node.attrs, 'n_levels' in node.attrs]) - - if ret: - self.operatorRepresentation['D'] = node.attrs['D'] - self.operatorRepresentation['log2D'] = int(np.log2(node.attrs['D'].values).tolist()[0]) - self.operatorRepresentation['mul'] = int(node.attrs['mul'].values.tolist()[0]) - self.operatorRepresentation['n_levels'] = node.attrs['n_levels'] - - return ret - - def parseNodeCtxt(self, - ctxt: NetworkContext, - node: gs.Node, - channels_first: bool = True) -> Tuple[NetworkContext, bool]: - - data_in = 
ctxt.lookup(node.inputs[0].name) - weights = ctxt.lookup(node.inputs[1].name) - bias = ctxt.lookup(node.inputs[2].name) - data_out = ctxt.lookup(node.outputs[0].name) - self.operatorRepresentation['data_in'] = data_in.name - self.operatorRepresentation['weights'] = weights.name - self.operatorRepresentation['bias'] = bias.name - self.operatorRepresentation['data_out'] = data_out.name - self.operatorRepresentation['size'] = np.prod(data_in.shape) - - return ctxt, True - - class RQSiHardswishParser(iHardswishParser): def __init__(self): diff --git a/Deeploy/Targets/Generic/Templates/ConvTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTemplate.py index 51f292dcae..ad39cb2a46 100644 --- a/Deeploy/Targets/Generic/Templates/ConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ConvTemplate.py @@ -36,18 +36,18 @@ def alignToContext(self, ctxt: NetworkContext, // 1D Conv (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { Conv2d_s${data_in_type.referencedType.typeWidth}_s${weight_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y}, ${weight}, ${ch_im_out}, 1, ${dim_kernel_y}, 1, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset} + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) @@ -60,18 +60,18 @@ def alignToContext(self, ctxt: 
NetworkContext, // 2D Conv (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { Conv2d_s${data_in_type.referencedType.typeWidth}_s${weight_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${weight}, ${ch_im_out}, ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset} + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py index 9bf864c91f..298101c58e 100644 --- a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py @@ -12,20 +12,20 @@ // 1D Transposed Conv (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { ConvTranspose1d_fp32( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, ${weight}, 
${ch_im_out}, ${dim_kernel_y}, ${stride_y}, ${bias}, ${has_bias}, - ref_${data_out}_${data_out}, ${dim_im_out_y} + ref_${nodeName}_${data_out}, ${dim_im_out_y} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/DWConvTemplate.py b/Deeploy/Targets/Generic/Templates/DWConvTemplate.py index aeeb1ac523..84a15d8b0e 100644 --- a/Deeploy/Targets/Generic/Templates/DWConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/DWConvTemplate.py @@ -36,18 +36,18 @@ def alignToContext(self, ctxt: NetworkContext, // 1D Depth-Wise Conv (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { DWConv2d_s${data_in_type.referencedType.typeWidth}_s${weight_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y}, ${weight}, 1, ${dim_kernel_y}, 1, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset} + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) @@ -60,18 +60,18 @@ def alignToContext(self, ctxt: NetworkContext, // 2D Depth-Wise Conv (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = 
${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { DWConv2d_s${data_in_type.referencedType.typeWidth}_s${weight_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${weight}, ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset} + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/FloatConvTemplate.py b/Deeploy/Targets/Generic/Templates/FloatConvTemplate.py index 7519d33a21..4e5bde4a09 100644 --- a/Deeploy/Targets/Generic/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatConvTemplate.py @@ -12,20 +12,20 @@ // 2D FP Conv (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { Conv2d_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${weight}, ${ch_im_out}, 
${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, ${bias}, ${has_bias}, - ref_${data_out}_${data_out} + ref_${nodeName}_${data_out} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) @@ -37,21 +37,21 @@ %> // 1D FP Conv (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { Conv1d_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${stride_y}, ${bias}, ${has_bias}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${data_out}, ${dim_im_out_y} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) \ No newline at end of file diff --git a/Deeploy/Targets/Generic/Templates/FloatDWConvTemplate.py b/Deeploy/Targets/Generic/Templates/FloatDWConvTemplate.py index 0e0fee7a86..d1225732a1 100644 --- a/Deeploy/Targets/Generic/Templates/FloatDWConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatDWConvTemplate.py @@ -11,21 +11,21 @@ %> // 2D FP Depth-wise Conv (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + 
${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { DWConv2d_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${weight}, ${ch_im_out}, ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, ${bias}, ${has_bias}, - ref_${data_out}_${data_out} + ref_${nodeName}_${data_out} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py index 69bea8484e..8f18d8abf6 100644 --- a/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py @@ -7,17 +7,17 @@ referenceTemplate = NodeTemplate(""" // GEMM (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${A_type.typeName} ref_${data_out}_${A} = ${A}; - ${B_type.typeName} ref_${data_out}_${B} = ${B}; - ${C_type.typeName} ref_${data_out}_${C} = ${C}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${A_type.typeName} ref_${nodeName}_${A} = ${A}; + ${B_type.typeName} ref_${nodeName}_${B} = ${B}; + ${C_type.typeName} ref_${nodeName}_${C} = ${C}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0; i<${batch}; i++){ Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${C}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + 
ref_${nodeName}_${B}, + ref_${nodeName}_${C}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -26,18 +26,18 @@ ); % if A_batched: - ref_${data_out}_${A} += ${M} * ${N}; + ref_${nodeName}_${A} += ${M} * ${N}; % endif % if B_batched: - ref_${data_out}_${B} += ${N} * ${O}; + ref_${nodeName}_${B} += ${N} * ${O}; % endif % if C_batched: - ref_${data_out}_${C} += ${M} * ${O}; + ref_${nodeName}_${C} += ${M} * ${O}; % endif - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } END_SINGLE_CORE """) \ No newline at end of file diff --git a/Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py index d8a9f5b4b2..a7b176106f 100644 --- a/Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py @@ -7,23 +7,23 @@ referenceTemplate = NodeTemplate(""" // Matmul (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${A_type.typeName} ref_${data_out}_${A} = ${A}; - ${B_type.typeName} ref_${data_out}_${B} = ${B}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${A_type.typeName} ref_${nodeName}_${A} = ${A}; + ${B_type.typeName} ref_${nodeName}_${B} = ${B}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0; i<${batch}; i++){ MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O} ); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } END_SINGLE_CORE """) \ No newline at end of file diff --git 
a/Deeploy/Targets/Generic/Templates/FloatMaxPoolTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMaxPoolTemplate.py index 1eef5e0f4f..7d0a477e6d 100644 --- a/Deeploy/Targets/Generic/Templates/FloatMaxPoolTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatMaxPoolTemplate.py @@ -8,13 +8,13 @@ // 2D Float MaxPool (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { MaxPool2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out} + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, + ref_${nodeName}_${data_out} ); } @@ -28,16 +28,16 @@ %> // 1D Float MaxPool (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { MaxPool1d_fp32_fp32( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, ${dim_kernel_y}, ${stride_y}, - ref_${data_out}_${data_out} + ref_${nodeName}_${data_out} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) \ 
No newline at end of file diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py index 005b0b8893..7dbcaed269 100644 --- a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py @@ -18,10 +18,10 @@ def alignToContext(self, ctxt: NetworkContext, data_in = ctxt.lookup(operatorRepresentation['data_in']) data_out = ctxt.lookup(operatorRepresentation['data_out']) operatorRepresentation['input_offset'] = 0 - if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"): + if data_in._signed is not None and data_in.nLevels is not None: operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) operatorRepresentation['output_offset'] = 0 - if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + if data_out._signed is not None and data_out.nLevels is not None: operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_in.nLevels / 2) return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/Generic/Templates/GemmTemplate.py b/Deeploy/Targets/Generic/Templates/GemmTemplate.py index 62d760d15c..21650d09f1 100644 --- a/Deeploy/Targets/Generic/Templates/GemmTemplate.py +++ b/Deeploy/Targets/Generic/Templates/GemmTemplate.py @@ -40,17 +40,17 @@ def alignToContext(self, ctxt: NetworkContext, referenceTemplate = _GemmTemplate(""" // GEMM (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${A_type.typeName} ref_${data_out}_${A} = ${A}; - ${B_type.typeName} ref_${data_out}_${B} = ${B}; - ${C_type.typeName} ref_${data_out}_${C} = ${C}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${A_type.typeName} ref_${nodeName}_${A} = ${A}; + ${B_type.typeName} ref_${nodeName}_${B} = ${B}; + ${C_type.typeName} ref_${nodeName}_${C} = ${C}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t 
i=0;i<${batch};i++){ Gemm_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${C_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${C}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${C}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -64,10 +64,19 @@ def alignToContext(self, ctxt: NetworkContext, ${Y_offset} ); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${C} += ${M} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + % if A_batched: + ref_${nodeName}_${A} += ${M} * ${N}; + % endif + + % if B_batched: + ref_${nodeName}_${B} += ${N} * ${O}; + % endif + + % if C_batched: + ref_${nodeName}_${C} += ${M} * ${O}; + % endif + + ref_${nodeName}_${data_out} += ${M} * ${O}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/MatMulTemplate.py b/Deeploy/Targets/Generic/Templates/MatMulTemplate.py index d1b25c1b0d..fdf3e20b9f 100644 --- a/Deeploy/Targets/Generic/Templates/MatMulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/MatMulTemplate.py @@ -34,24 +34,24 @@ def alignToContext(self, ctxt: NetworkContext, referenceTemplate = _MatMulTemplate(""" // MatMul (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${A_type.typeName} ref_${data_out}_${A} = ${A}; - ${B_type.typeName} ref_${data_out}_${B} = ${B}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${A_type.typeName} ref_${nodeName}_${A} = ${A}; + ${B_type.typeName} ref_${nodeName}_${B} = ${B}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0;i<${batch};i++){ MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + 
ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, ${A_offset}, ${B_offset}, ${C_offset} ); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py b/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py index 1a1b3060bc..f5bc638292 100644 --- a/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py +++ b/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py @@ -36,17 +36,17 @@ def alignToContext(self, ctxt: NetworkContext, // 2D MaxPool (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { MaxPool2d_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset} + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset} ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..3b83a78da2 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -368,23 +368,6 @@ 
def _inferSignedness(self, inputs: List[VariableBuffer], return [False] -class iNoNormChecker(SignPropTypeChecker): - - def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(4 * self.input_types[0].referencedType.typeWidth)] - - def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[bool]: - if inputs[0]._signed: - return [True] - else: - return [False] - - class GELUChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): @@ -493,6 +476,10 @@ class DummyChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): super().__init__(input_types, output_types) + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [] + def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [2**(self.input_types[0].referencedType.typeWidth)] diff --git a/Deeploy/Targets/MemPool/Templates/ConvTemplate.py b/Deeploy/Targets/MemPool/Templates/ConvTemplate.py index 7539eebbf4..b977cf4760 100644 --- a/Deeploy/Targets/MemPool/Templates/ConvTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/ConvTemplate.py @@ -37,20 +37,20 @@ def alignToContext(self, ctxt: NetworkContext, // 1D Conv Parallel (Name: ${nodeName}, Op: ${nodeOp}) mempool_barrier(numThreads); -${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for 
(uint32_t n=0; n<${batch}; ++n) { Conv2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y}, ${weight}, ${ch_im_out}, 1, ${dim_kernel_y}, 1, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}, + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset}, core_id, numThreads ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } mempool_barrier(numThreads); """) @@ -63,20 +63,20 @@ def alignToContext(self, ctxt: NetworkContext, // 2D Conv Parallel (Name: ${nodeName}, Op: ${nodeOp}) mempool_barrier(numThreads); -${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { Conv2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${weight}, ${ch_im_out}, ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}, + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset}, core_id, numThreads ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } mempool_barrier(numThreads); """) diff --git a/Deeploy/Targets/MemPool/Templates/DWConvTemplate.py b/Deeploy/Targets/MemPool/Templates/DWConvTemplate.py index 27252cb74b..1ff9e5d4e4 100644 --- 
a/Deeploy/Targets/MemPool/Templates/DWConvTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/DWConvTemplate.py @@ -36,20 +36,20 @@ def alignToContext(self, ctxt: NetworkContext, // 1D Depth-Wise Conv Parallel (Name: ${nodeName}, Op: ${nodeOp}) mempool_barrier(numThreads); -${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { DWConv2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, 1, ${dim_im_in_y}, ${weight}, 1, ${dim_kernel_y}, 1, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}, + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset}, core_id, numThreads ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } mempool_barrier(numThreads); """) @@ -62,20 +62,20 @@ def alignToContext(self, ctxt: NetworkContext, // 2D Depth-Wise Conv Parallel (Name: ${nodeName}, Op: ${nodeOp}) mempool_barrier(numThreads); -${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { DWConv2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${weight}, ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, - 
ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}, + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset}, core_id, numThreads ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } mempool_barrier(numThreads); """) diff --git a/Deeploy/Targets/MemPool/Templates/GemmTemplate.py b/Deeploy/Targets/MemPool/Templates/GemmTemplate.py index e5d53bd255..d9b6d653f7 100644 --- a/Deeploy/Targets/MemPool/Templates/GemmTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/GemmTemplate.py @@ -111,17 +111,17 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, mempool_barrier(numThreads); %endif -${A_type.typeName} ref_${data_out}_${A} = ${ctxtBuffer_A}; -${B_type.typeName} ref_${data_out}_${B} = ${ctxtBuffer_B}; -${C_type.typeName} ref_${data_out}_${C} = ${ctxtBuffer_C}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${A_type.typeName} ref_${nodeName}_${A} = ${ctxtBuffer_A}; +${B_type.typeName} ref_${nodeName}_${B} = ${ctxtBuffer_B}; +${C_type.typeName} ref_${nodeName}_${C} = ${ctxtBuffer_C}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0;i<${batch};i++){ Gemm_parallel_s${A_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${C}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${C}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -137,9 +137,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, numThreads ); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } mempool_barrier(numThreads); """) diff --git 
a/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py b/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py index 062ba3cc9e..e9c04f2e10 100644 --- a/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py @@ -29,15 +29,15 @@ def alignToContext(self, ctxt: NetworkContext, MemPoolParallelTemplate = _MatMulTemplate(""" // MatMul Parallel (Name: ${nodeName}, Op: ${nodeOp}) mempool_barrier(numThreads); -${A_type.typeName} ref_${data_out}_${A} = ${A}; -${B_type.typeName} ref_${data_out}_${B} = ${B}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${A_type.typeName} ref_${nodeName}_${A} = ${A}; +${B_type.typeName} ref_${nodeName}_${B} = ${B}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0;i<${batch};i++){ MatMul_parallel_s${A_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -46,9 +46,9 @@ def alignToContext(self, ctxt: NetworkContext, numThreads ); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } mempool_barrier(numThreads); """) diff --git a/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py b/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py index c57bbade77..72b0cf315e 100644 --- a/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py @@ -36,19 +36,19 @@ def alignToContext(self, ctxt: NetworkContext, // 2D MaxPool Parallel (Name: ${nodeName}, Op: ${nodeOp}) mempool_barrier(numThreads); -${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; 
+${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { MaxPool2d_parallel_s${data_in_type.referencedType.typeWidth}_NCHW( - ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ref_${nodeName}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out}, ${input_offset}, ${output_offset}, + ref_${nodeName}_${data_out}, ${input_offset}, ${output_offset}, core_id, numThreads ); - ref_${data_out}_${data_in} += ${batchOffsetIn}; - ref_${data_out}_${data_out} += ${batchOffsetOut}; + ref_${nodeName}_${data_in} += ${batchOffsetIn}; + ref_${nodeName}_${data_out} += ${batchOffsetOut}; } mempool_barrier(numThreads); """) diff --git a/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py b/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py index e6a42768e8..2baa8cd41f 100644 --- a/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py @@ -128,18 +128,18 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, mempool_barrier(numThreads); %endif -${A_type.typeName} ref_${data_out}_${A} = ${ctxtBuffer_A}; -${B_type.typeName} ref_${data_out}_${B} = ${ctxtBuffer_B}; -${C_type.typeName} ref_${data_out}_${C} = ${ctxtBuffer_C}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${A_type.typeName} ref_${nodeName}_${A} = ${ctxtBuffer_A}; +${B_type.typeName} ref_${nodeName}_${B} = ${ctxtBuffer_B}; +${C_type.typeName} ref_${nodeName}_${C} = ${ctxtBuffer_C}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0;i<${batch};i++){ %if M%4==0 and N%4==0 and O%4==0: RQGemm_offset_unrolled_2x2_parallel_s${A_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${C}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + 
ref_${nodeName}_${B}, + ref_${nodeName}_${C}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -161,10 +161,10 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ); %else: RQGemm_parallel_s${A_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${C}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${C}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -188,9 +188,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ); %endif - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } mempool_barrier(numThreads); """) diff --git a/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py b/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py index 76ad029fb4..ceab67c440 100644 --- a/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py @@ -112,18 +112,18 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, mempool_barrier(numThreads); %endif -${A_type.typeName} ref_${data_out}_${A} = ${ctxtBuffer_A}; -${B_type.typeName} ref_${data_out}_${B} = ${ctxtBuffer_B}; +${A_type.typeName} ref_${nodeName}_${A} = ${ctxtBuffer_A}; +${B_type.typeName} ref_${nodeName}_${B} = ${ctxtBuffer_B}; ${mul_type.typeName} ref_${mul} = ${mul}; ${add_type.typeName} ref_${add} = ${add}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0;i<${batch};i++){ %if A_offset==0 and B_offset==0 and offset_output==0 and M%4==0 and N%4==0 and O%4==0: RQMatMul_unrolled_2x2_parallel_s${A_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + 
ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -137,9 +137,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ); %elif M%4==0 and N%4==0 and O%4==0: RQMatMul_offset_unrolled_2x2_parallel_s${A_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -154,9 +154,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ); %else: RQMatMul_parallel_s${A_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -173,9 +173,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ); %endif - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; %if perChannelQuant: ++ref_${mul}; ++ref_${add}; diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index e94af6e420..b4a36e35bc 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -349,41 +349,22 @@ def parseNodeCtxt(self, class PULPMatrixVecParser(PULPGEMMParser): - def parseNodeCtxt(self, - ctxt: NetworkContext, - node: gs.Node, - channels_first: bool = True) -> Tuple[NetworkContext, bool]: - - newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + def parseNode(self, node: gs.Node) -> bool: + if not super().parseNode(node): + return False - if not ret: - return ctxt, False - - if not (self.operatorRepresentation['M'] == 1 and self.operatorRepresentation['batch'] >= 8): - return ctxt, False - - return newCtxt, True + M = node.inputs[0].shape[-1 if node.attrs["transA"] else -2] + batch = math.prod(node.inputs[0].shape[:-2]) + return M == 
1 and batch >= 8 class PULPTallGEMMParser(PULPGEMMParser): - def parseNodeCtxt(self, - ctxt: NetworkContext, - node: gs.Node, - channels_first: bool = True) -> Tuple[NetworkContext, bool]: - - newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) - - if not ret: - return ctxt, False - - ret = all([ - self.operatorRepresentation['batch'] < 8, - self.operatorRepresentation['M'] >= 8, - self.operatorRepresentation['M'] % 8 < self.operatorRepresentation['O'] % 8, - ]) - - if not ret: - return ctxt, False + def parseNode(self, node: gs.Node) -> bool: + if not super().parseNode(node): + return False - return newCtxt, True + M = node.inputs[0].shape[-1 if node.attrs["transA"] else -2] + N = node.inputs[1].shape[-2 if node.attrs["transB"] else -1] + batch = math.prod(node.inputs[0].shape[:-2]) + return M >= 8 and (M % 8) < (N % 8) and batch < 8 diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index 29a216d728..41ee4f86ab 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -37,35 +37,35 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, reference2DTemplate = NodeTemplate(""" // 2D FP Conv HWC with ChannelOut parallelism (Name: ${nodeName}, Op: ${nodeOp}) -${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { PULP_Conv2d_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( - ref_${data_out}_${data_in}, + ref_${nodeName}_${data_in}, ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${dim_kernel_x}, ${stride_y}, ${stride_x}, - 
ref_${data_out}_${data_out}, + ref_${nodeName}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} ); - ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; - ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; + ref_${nodeName}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${nodeName}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; } """) reference2DIm2ColTemplate = PULP2DFloatConvIm2ColTemplate(""" // 2D FP Conv HWC with Im2Col and ChannelOout parallelism (Name: ${nodeName}, Op: ${nodeOp}) -${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { PULP_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( - ref_${data_out}_${data_in}, + ref_${nodeName}_${data_in}, ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, @@ -75,7 +75,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${dim_kernel_x}, ${stride_y}, ${stride_x}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, @@ -83,7 +83,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${ctxtBuffer} ); - ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; - ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; + ref_${nodeName}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${nodeName}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; } """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py 
b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index f4c22b2c22..fa03fd2b22 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -6,17 +6,17 @@ referenceTemplate = NodeTemplate(""" // GEMM (Name: ${nodeName}, Op: ${nodeOp}) -${A_type.typeName} ref_${data_out}_${A} = ${A}; -${B_type.typeName} ref_${data_out}_${B} = ${B}; -${C_type.typeName} ref_${data_out}_${C} = ${C}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${A_type.typeName} ref_${nodeName}_${A} = ${A}; +${B_type.typeName} ref_${nodeName}_${B} = ${B}; +${C_type.typeName} ref_${nodeName}_${C} = ${C}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0; i<${batch}; i++){ PULP_Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${C}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${C}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, @@ -24,9 +24,9 @@ ${transB} ); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${C} += ${M} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${C} += ${M} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py index 846aeae92d..9aa7168d58 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py @@ -7,19 +7,19 @@ referenceTemplate = NodeTemplate(""" // 2D Float MaxPool Channel Parallel (Name: ${nodeName}, Op: ${nodeOp}) 
-${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; -${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; +${data_in_type.typeName} ref_${nodeName}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for (uint32_t n=0; n<${batch}; ++n) { PULP_MaxPool2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( - ref_${data_out}_${data_in}, + ref_${nodeName}_${data_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} ); - ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; - ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; + ref_${nodeName}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; + ref_${nodeName}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; } """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py b/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py index 1f7149e1e8..1a74a4f2e1 100644 --- a/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py @@ -41,10 +41,10 @@ def alignToContext(self, ctxt: NetworkContext, else: signatureString += '_u8' %> -// PULP NN GEMM -int8_t* ref_${data_out}_${A} = ${A}; -int8_t* ref_${data_out}_${B} = ${B}; -int8_t* ref_${data_out}_${data_out} = ${data_out}; +// PULP NN GEMM (Name: ${nodeName}, Op: ${nodeOp}) +int8_t* ref_${nodeName}_${A} = ${A}; +int8_t* ref_${nodeName}_${B} = ${B}; +int8_t* ref_${nodeName}_${data_out} = ${data_out}; for(int i=0;i<${batch};i++){ for(int j=0;j<${M};j++){ // LMACAN: In some edge cases sporadic errors happen if this loop is not added. 
@@ -55,12 +55,12 @@ def alignToContext(self, ctxt: NetworkContext, for(int k=0;k<3;k++){ asm volatile("nop" ::); } -pulp_nn_linear${signatureString}(ref_${data_out}_${A}, NULL, ref_${data_out}_${data_out}, ref_${data_out}_${B}, ${mul}, ${C}, 1, ${log2D}, ${N}, ${O}, 1, 1); -ref_${data_out}_${A} += ${N}; -ref_${data_out}_${data_out} += ${O}; +pulp_nn_linear${signatureString}(ref_${nodeName}_${A}, NULL, ref_${nodeName}_${data_out}, ref_${nodeName}_${B}, ${mul}, ${C}, 1, ${log2D}, ${N}, ${O}, 1, 1); +ref_${nodeName}_${A} += ${N}; +ref_${nodeName}_${data_out} += ${O}; } % if W_batched: -ref_${data_out}_${B} += ${N} * ${O}; +ref_${nodeName}_${B} += ${N} * ${O}; % endif } """) @@ -82,11 +82,11 @@ def alignToContext(self, ctxt: NetworkContext, operatorRepresentation['B_offset'] = 0 operatorRepresentation['C_offset'] = 0 - if hasattr(A, "nLevels"): + if A.nLevels is not None: operatorRepresentation['A_offset'] = (A._type.referencedType.typeMin == 0) * int(A.nLevels / 2) - if hasattr(B, "nLevels"): + if B.nLevels is not None: operatorRepresentation['B_offset'] = (B._type.referencedType.typeMin == 0) * int(B.nLevels / 2) - if hasattr(C, "nLevels"): + if C.nLevels is not None: operatorRepresentation['C_offset'] = -(C._type.referencedType.typeMin == 0) * int(C.nLevels / 2) return ctxt, operatorRepresentation, [] @@ -95,24 +95,24 @@ def alignToContext(self, ctxt: NetworkContext, PULPMM_8_Template = _MatMulTemplate(""" // MatMul (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${A_type.typeName} ref_${data_out}_${A} = ${A}; - ${B_type.typeName} ref_${data_out}_${B} = ${B}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${A_type.typeName} ref_${nodeName}_${A} = ${A}; + ${B_type.typeName} ref_${nodeName}_${B} = ${B}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0;i<${batch};i++){ MatMul_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( - 
ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, 0, 0, ${C_offset} ); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${A} += ${M} * ${N}; + ref_${nodeName}_${B} += ${N} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index c69760df59..156271417a 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -54,12 +54,15 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + addBuffer = ctxt.lookup(addBufferName) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = len(addBuffer.shape) - 1) + mulBuffer = ctxt.lookup(mulBufferName) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = len(mulBuffer.shape) - 1) # Map output dims to inputs dims tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + tilerModel.addConstraint(inputChannelVar == weightInChannelVar) # Input channel tilerModel.addConstraint(outputChannelVar == addChannelVar) tilerModel.addConstraint(outputChannelVar == mulChannelVar) @@ -88,10 +91,8 @@ def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: 
NetworkCo outputChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2) - weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3) strides = parseDict["strides"] - padding = parseDict["pads"] # VIC: Force at least one row of A and one col of B in the GEMM (since it's a im2col Conv) to avoid partial results tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) @@ -101,7 +102,6 @@ def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkCo tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y']) - tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) # VIC: Constraint the minimum tile size such that we can apply at least one kernel on it tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) @@ -174,6 +174,8 @@ def serializeTilingSolution( weightH = ctxt.lookup(varWeight).shape[1] weightW = ctxt.lookup(varWeight).shape[2] weightC = ctxt.lookup(varWeight).shape[3] + shapeMul = ctxt.lookup(operatorRepresentation["mul"]).shape + shapeAdd = ctxt.lookup(operatorRepresentation["add"]).shape pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] @@ -200,12 +202,13 @@ def serializeTilingSolution( inputInCubes.append(InCube) - RequantCube = HyperRectangle((COffset,), (CSize,)) + MulCube = HyperRectangle((0,) * (len(shapeMul) - 1) + (COffset,), (1,) * (len(shapeMul) - 1) + (CSize,)) + AddCube = HyperRectangle((0,) * (len(shapeAdd) - 1) + (COffset,), (1,) * (len(shapeAdd) - 1) + (CSize,)) WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, weightC)) inputWeightCubes.append(WeightCube) - inputAddCubes.append(RequantCube) - 
inputMulCubes.append(RequantCube) + inputMulCubes.append(MulCube) + inputAddCubes.append(AddCube) inputLoadSchedule = [] outputLoadSchedule = [] diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py index 8d54eea437..2d6ea07a0f 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py @@ -60,8 +60,10 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + addBuffer = ctxt.lookup(addBufferName) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = len(addBuffer.shape) - 1) + mulBuffer = ctxt.lookup(mulBufferName) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = len(mulBuffer.shape) - 1) # map output dims to inputs dims tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch @@ -183,6 +185,8 @@ def serializeTilingSolution( weightH = ctxt.lookup(varWeight).shape[1] weightW = ctxt.lookup(varWeight).shape[2] + shapeMul = ctxt.lookup(operatorRepresentation["mul"]).shape + shapeAdd = ctxt.lookup(operatorRepresentation["add"]).shape pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] @@ -200,7 +204,8 @@ def serializeTilingSolution( NCHWInCube = HyperRectangle((NHWCInCube.offset[0], COffset, NHWCInCube.offset[1], NHWCInCube.offset[2]), (NHWCInCube.dims[0], CSize, NHWCInCube.dims[1], NHWCInCube.dims[2])) - RequantCube = HyperRectangle((COffset,), (CSize,)) + MulCube = HyperRectangle((0,) * (len(shapeMul) - 1) + (COffset,), (1,) 
* (len(shapeMul) - 1) + (CSize,)) + AddCube = HyperRectangle((0,) * (len(shapeAdd) - 1) + (COffset,), (1,) * (len(shapeAdd) - 1) + (CSize,)) WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, 1)) replacements['dim_im_in_x'].append(NCHWInCube.dims[2]) @@ -216,8 +221,8 @@ def serializeTilingSolution( replacements['padding_x_right'].append(padding_right) inputInCubes.append(NCHWInCube) - inputAddCubes.append(RequantCube) - inputMulCubes.append(RequantCube) + inputMulCubes.append(MulCube) + inputAddCubes.append(AddCube) inputWeightCubes.append(WeightCube) inputLoadSchedule = [] diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py index 8b795be88e..91c2cc2888 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py @@ -17,39 +17,49 @@ class MatMulTileConstraint(TileConstraint): + @staticmethod + def _getIdxMapping(rank: int, isTrans: bool) -> Tuple[int, int]: + if isTrans: + idxSecondDim, idxFirstDim = rank - 2, rank - 1 + else: + idxFirstDim, idxSecondDim = rank - 2, rank - 1 + return idxFirstDim, idxSecondDim + @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: # Get to-be-tiled tensor's buffers bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) - outputBuffer = ctxt.lookup(name = parseDict['data_out']) + bufferOut = ctxt.lookup(name = parseDict['data_out']) # Add I/O dimensions to the model as variables - for _buffer in [bufferA, bufferB, outputBuffer]: + for _buffer in [bufferA, bufferB, bufferOut]: tilerModel.addTensorDimToModel(ctxt, _buffer.name) - tensorsShapeLen = len(bufferA.shape) - - AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) - ASecondDimVar = 
tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) - BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) - BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) - outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2)) - outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1)) - - # Map output dims to inputs dims - for idx in range(tensorsShapeLen - 2): - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = bufferA.name, dimIdx = idx)) - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = bufferB.name, dimIdx = idx)) + idxFirstDimA, idxSecondDimA = MatMulTileConstraint._getIdxMapping(len(bufferA.shape), parseDict['transA']) + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idxFirstDimA) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idxSecondDimA) + + idxFirstDimB, idxSecondDimB = MatMulTileConstraint._getIdxMapping(len(bufferB.shape), parseDict['transB']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idxFirstDimB) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idxSecondDimB) + + rankOut = len(bufferOut.shape) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferOut.name, dimIdx = rankOut - 2) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferOut.name, dimIdx = rankOut - 1) + + # Map input A's batch dims to output batch dims if present + for idx in range(len(bufferA.shape) - 2): + varA = 
tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idx) + varOut = tilerModel.getTensorDimVar(tensorName = bufferOut.name, dimIdx = idx) + tilerModel.addConstraint(varA == varOut) + + # Map input B's batch dims to output batch dims if present + for idx in range(len(bufferB.shape) - 2): + varB = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idx) + varOut = tilerModel.getTensorDimVar(tensorName = bufferOut.name, dimIdx = idx) + tilerModel.addConstraint(varB == varOut) tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) @@ -61,18 +71,14 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw @staticmethod def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) - tensorsShapeLen = len(bufferA.shape) + _, idxSecondDimA = MatMulTileConstraint._getIdxMapping(len(bufferA.shape), parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idxSecondDimA) - ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) - BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) - BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + idxFirstDimB, _ = MatMulTileConstraint._getIdxMapping(len(bufferB.shape), parseDict['transB']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idxFirstDimB) # VIC: We don't want to deal with intermediate results between kernel calls tilerModel.addConstraint(ASecondDimVar == parseDict['N']) diff --git a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py 
b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py index 43d490e80b..95987839e6 100644 --- a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import copy +import math from collections import OrderedDict import numpy as np @@ -164,23 +165,41 @@ def __init__(self): def _merge_conv_rq_fun(graph: gs.Graph, match: Match, name: str): - matched_nodes = [m for k, m in match.nodes_map.items()] - conv = matched_nodes[0] - rqs = matched_nodes[1] - - totalShift = int(np.log2(rqs.attrs['div'].values)) + conv, rqs = list(match.nodes_map.values()) - # Artifically add half the shift division value to implement rounding - rounding = 2**(totalShift - 1) if totalShift > 0 else 0 + mul, add = rqs.inputs[1:] - rqs.inputs[-1].values = copy.deepcopy(rqs.inputs[-1].values) + rounding + div_attr = rqs.attrs['div'] - _inputs = list(conv.inputs) + list(rqs.inputs[1:]) + if isinstance(div_attr, gs.Constant): + assert div_attr.values.size == 1 + div_attr = div_attr.values.item() - _outputs = rqs.outputs - - rqsConv = gs.Node(op = 'RequantizedConv', name = name, attrs = {**conv.attrs, **rqs.attrs, "shift": totalShift}) - graph.replaceInsertNode(_inputs, _outputs, rqsConv) + if isinstance(div_attr, int): + div = div_attr + elif isinstance(div_attr, float) and div_attr.is_integer(): + div = int(div_attr) + else: + raise ValueError(f"Cannot convert div to integer. Received {div_attr}") + + assert div > 0, f"Shift calculation (log2(div)) requires div to be a positive number. Received non-positive div {div}" + assert div.bit_count() == 1, f"Div is expected to be a power of 2 number. 
Received div {div}" + + shift = int(math.log2(div)) + # Artificially add half the division value as rounding + if shift > 0: + add.values += 2**(shift - 1) + + rqsConv = gs.Node( + op = 'RequantizedConv', + name = name, + attrs = { + **conv.attrs, + **rqs.attrs, + "shift": shift, + }, + ) + graph.replaceInsertNode(list(conv.inputs) + [mul, add], rqs.outputs, rqsConv) return graph diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py index e9be18a535..1d1af32a36 100644 --- a/Deeploy/Targets/Snitch/Bindings.py +++ b/Deeploy/Targets/Snitch/Bindings.py @@ -11,15 +11,16 @@ from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import iNoNormTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \ SnitchProfileExecutionBlockPass, SnitchSynchCoresPass from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma -from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iNoNormTemplate, \ + iSoftmaxTemplate from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template +from Deeploy.Targets.Snitch.TypeCheckers import iNoNormChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement
import TilingVariableReplacement, \ TilingVariableReplacementUpdate diff --git a/Deeploy/Targets/Snitch/Layers.py b/Deeploy/Targets/Snitch/Layers.py new file mode 100644 index 0000000000..99bd3c14b6 --- /dev/null +++ b/Deeploy/Targets/Snitch/Layers.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NodeMapper, ONNXLayer, OperatorRepresentation, Shape + + +class iNoNormLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] * 4 # 2 mul, 1 add, 1 right shift + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation: OperatorRepresentation, + channels_first: bool) -> Tuple[Shape, Shape]: + # JUNGVI: Broadcast the weights and bias to have as many dimensions as the inputs + shape = np.broadcast_shapes(*inputShapes) + return ([shape] * len(inputShapes), outputShapes) diff --git a/Deeploy/Targets/Snitch/Parsers.py b/Deeploy/Targets/Snitch/Parsers.py index 0051994686..03da563af0 100644 --- a/Deeploy/Targets/Snitch/Parsers.py +++ b/Deeploy/Targets/Snitch/Parsers.py @@ -2,11 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 +import math from typing import Tuple import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import NetworkContext +from Deeploy.DeeployTypes import NetworkContext, NodeParser from Deeploy.Targets.Generic.Parsers import GEMMParser, RQGEMMParser @@ -72,3 +73,35 @@ def parseNodeCtxt(self, return ctxt, False return newCtxt, True + + +class iNoNormParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if not all([ + 'D' in node.attrs, + 'mul' in node.attrs, + 'n_levels' in node.attrs, + len(node.inputs) == 3, + len(node.outputs) == 1, + ]): + return False + + 
self.operatorRepresentation['log2D'] = int(math.log2(self._unpack_const(node.attrs['D']))) + self.operatorRepresentation['n_levels'] = int(self._unpack_const(node.attrs['n_levels'])) + self.operatorRepresentation['mul'] = int(self._unpack_const(node.attrs['mul'])) + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + for tensor, symName in zip(node.inputs, ["data_in", "weights", "bias"], strict = True): + self.operatorRepresentation[symName] = ctxt.lookup(tensor.name).name + for tensor, symName in zip(node.outputs, ["data_out"], strict = True): + self.operatorRepresentation[symName] = ctxt.lookup(tensor.name).name + self.operatorRepresentation['size'] = math.prod(ctxt.lookup(node.inputs[0].name).shape) + return ctxt, True diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py index d62d1c3802..bb570b588f 100644 --- a/Deeploy/Targets/Snitch/Platform.py +++ b/Deeploy/Targets/Snitch/Platform.py @@ -11,15 +11,16 @@ from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicLayerNormBindings, BasicMatMulBindings, \ BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \ - ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer + ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ - RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser + RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import 
AddRequantMergePass, GEMMRequantMergePass, \ IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Platform import RQAddMapper -from Deeploy.Targets.Snitch.Parsers import SnitchGEMMParser, SnitchRQGEMMParser +from Deeploy.Targets.Snitch.Layers import iNoNormLayer +from Deeploy.Targets.Snitch.Parsers import SnitchGEMMParser, SnitchRQGEMMParser, iNoNormParser from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchGemmTilingReadyBindings, \ SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, SnitchRQAddTilingReadyBindings, \ diff --git a/Deeploy/Targets/Generic/Templates/iNoNormTemplate.py b/Deeploy/Targets/Snitch/Templates/iNoNormTemplate.py similarity index 70% rename from Deeploy/Targets/Generic/Templates/iNoNormTemplate.py rename to Deeploy/Targets/Snitch/Templates/iNoNormTemplate.py index 562b3168a9..29b84cad8e 100644 --- a/Deeploy/Targets/Generic/Templates/iNoNormTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/iNoNormTemplate.py @@ -4,14 +4,7 @@ from Deeploy.DeeployTypes import NodeTemplate - -class _iNoNormTemplate(NodeTemplate): - - def __init__(self, templateStr): - super().__init__(templateStr) - - -referenceTemplate = _iNoNormTemplate(""" +referenceTemplate = NodeTemplate(""" // iNoNorm (Name: ${nodeName}, Op: ${nodeOp}) SnitchiNoNorm_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${weights}, ${bias}, ${size}, ${mul}, ${log2D}); """) diff --git a/Deeploy/Targets/Snitch/TypeCheckers.py b/Deeploy/Targets/Snitch/TypeCheckers.py new file mode 100644 index 0000000000..09ef3bc3c5 --- /dev/null +++ b/Deeploy/Targets/Snitch/TypeCheckers.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University 
of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Sequence, Type + +from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker +from Deeploy.DeeployTypes import OperatorRepresentation, Pointer, VariableBuffer + + +class iNoNormChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(4 * self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + if inputs[0]._signed: + return [True] + else: + return [False] diff --git a/DeeployTest/Platforms/Generic/CMakeLists.txt b/DeeployTest/Platforms/Generic/CMakeLists.txt index f97f1cdf1b..b2e68b257e 100644 --- a/DeeployTest/Platforms/Generic/CMakeLists.txt +++ b/DeeployTest/Platforms/Generic/CMakeLists.txt @@ -8,7 +8,7 @@ file(GLOB_RECURSE SOURCES main.c ) -link_directories(${ProjectId}/../../${GENERATED_SOURCE}) +link_directories(${GENERATED_SOURCE}) add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES} ) target_link_libraries(${ProjectId} PRIVATE network deeploylib) diff --git a/DeeployTest/testRunner_siracusa_l3dma.py b/DeeployTest/testRunner_siracusa_l3dma.py index b70d8dda22..937f7e9b29 100644 --- a/DeeployTest/testRunner_siracusa_l3dma.py +++ b/DeeployTest/testRunner_siracusa_l3dma.py @@ -6,15 +6,16 @@ import numpy as np from testUtils.codeGenerate import generateTestNetwork -from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, MemcpyTypeChecker, generate_graph, \ - memcpyTemplate, prepare_deployer_with_custom_tiling, setup_pulp_deployer +from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, generate_graph, memcpyTemplate, \ + 
prepare_deployer_with_custom_tiling, setup_pulp_deployer from testUtils.testRunner import TestRunner, TestRunnerArgumentParser from testUtils.typeMapping import baseTypeFromName, dtypeFromDeeployType from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration -from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, _NoVerbosity +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, NodeTypeChecker, _NoVerbosity from Deeploy.Targets.PULPOpen.Bindings import L3MemoryAwareFunctionCallClosure, TilingCallClosure from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack @@ -74,8 +75,11 @@ MemoryManagementGeneration(), ]) -binding = NodeBinding(MemcpyTypeChecker(), memcpyTemplate, transformer) -tilingReadyBindings = TilingReadyNodeBindings([binding], MemcpyTileConstraint()) +bindings = [ + NodeBinding(NodeTypeChecker([PointerClass(ty)], [PointerClass(ty)]), memcpyTemplate, transformer) + for ty in IntegerDataTypes + FloatDataTypes +] +tilingReadyBindings = TilingReadyNodeBindings(bindings, MemcpyTileConstraint()) memcpyMapper = NodeMapper(MemcpyParser(), tilingReadyBindings) memcpyMapping = {"Memcpy": MemcpyLayer([memcpyMapper])} deployer.Platform.engines[0].Mapping.update(memcpyMapping) diff --git a/DeeployTest/testRunner_siracusa_mchandma.py b/DeeployTest/testRunner_siracusa_mchandma.py index 56ed6f5a14..aeb407d7e6 100644 --- a/DeeployTest/testRunner_siracusa_mchandma.py +++ b/DeeployTest/testRunner_siracusa_mchandma.py @@ -6,15 +6,16 @@ import numpy as np from testUtils.codeGenerate import generateTestNetwork -from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, MemcpyTypeChecker, generate_graph, \ - 
memcpyTemplate, prepare_deployer_with_custom_tiling, setup_pulp_deployer +from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, generate_graph, memcpyTemplate, \ + prepare_deployer_with_custom_tiling, setup_pulp_deployer from testUtils.testRunner import TestRunner, TestRunnerArgumentParser from testUtils.typeMapping import baseTypeFromName, dtypeFromDeeployType from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration -from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, _NoVerbosity +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, NodeTypeChecker, _NoVerbosity from Deeploy.Targets.PULPOpen.Bindings import MemoryAwareFunctionCallClosure, TilingCallClosure from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma @@ -75,8 +76,11 @@ MemoryManagementGeneration(), ]) -binding = NodeBinding(MemcpyTypeChecker(), memcpyTemplate, transformer) -tilingReadyBindings = TilingReadyNodeBindings([binding], MemcpyTileConstraint()) +bindings = [ + NodeBinding(NodeTypeChecker([PointerClass(ty)], [PointerClass(ty)]), memcpyTemplate, transformer) + for ty in IntegerDataTypes + FloatDataTypes +] +tilingReadyBindings = TilingReadyNodeBindings(bindings, MemcpyTileConstraint()) memcpyMapper = NodeMapper(MemcpyParser(), tilingReadyBindings) memcpyMapping = {"Memcpy": MemcpyLayer([memcpyMapper])} deployer.Platform.engines[0].Mapping.update(memcpyMapping) diff --git a/DeeployTest/testRunner_snitch_dma.py b/DeeployTest/testRunner_snitch_dma.py index 80073ac5ed..ba42b433fe 100644 --- a/DeeployTest/testRunner_snitch_dma.py +++ b/DeeployTest/testRunner_snitch_dma.py @@ -6,15 +6,16 @@ import numpy as np from 
testUtils.codeGenerate import generateTestNetwork -from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, MemcpyTypeChecker, generate_graph, \ - memcpyTemplate, prepare_deployer_with_custom_tiling, setup_snitch_deployer +from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, generate_graph, memcpyTemplate, \ + prepare_deployer_with_custom_tiling, setup_snitch_deployer from testUtils.testRunner import TestRunner, TestRunnerArgumentParser from testUtils.typeMapping import baseTypeFromName, dtypeFromDeeployType from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration -from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, _NoVerbosity +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, NodeTypeChecker, _NoVerbosity from Deeploy.Targets.Snitch.Bindings import MemoryAwareFunctionCallClosure, TilingCallClosure from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterSynch import SnitchSynchCoresPass @@ -80,8 +81,11 @@ MemoryManagementGeneration(), ]) -binding = NodeBinding(MemcpyTypeChecker(), memcpyTemplate, transformer) -tilingReadyBindings = TilingReadyNodeBindings([binding], MemcpyTileConstraint()) +bindings = [ + NodeBinding(NodeTypeChecker([PointerClass(ty)], [PointerClass(ty)]), memcpyTemplate, transformer) + for ty in IntegerDataTypes + FloatDataTypes +] +tilingReadyBindings = TilingReadyNodeBindings(bindings, MemcpyTileConstraint()) memcpyMapper = NodeMapper(MemcpyParser(), tilingReadyBindings) memcpyMapping = {"Memcpy": MemcpyLayer([memcpyMapper])} deployer.Platform.engines[0].Mapping.update(memcpyMapping) diff --git a/DeeployTest/testUtils/codeGenerate.py 
b/DeeployTest/testUtils/codeGenerate.py index 878bc42014..5a4774a447 100644 --- a/DeeployTest/testUtils/codeGenerate.py +++ b/DeeployTest/testUtils/codeGenerate.py @@ -2,11 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 +import math import os from typing import List, Tuple import numpy as np +from Deeploy.AbstractDataTypes import FloatImmediate, IntegerImmediate from Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, NetworkDeployer, VariableBuffer from Deeploy.Targets.MemPool.Platform import MemPoolPlatform from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPPlatform @@ -30,6 +32,46 @@ def _shapeBroadcast(ctxt, value, name): return broadcastNum +def generateArray(name: str, buffer: VariableBuffer, values: np.ndarray) -> str: + assert math.prod(buffer.shape) == math.prod(values.shape), \ + f"Buffer size ({math.prod(buffer.shape)}) and values size ({math.prod(values.shape)}) are not equal." + refTy = buffer._type.referencedType + + values = values.flatten() + + if issubclass(refTy, FloatImmediate): + if refTy.typeWidth == 32: + suffix = "f" + elif refTy.typeWidth == 64: + suffix = "" + else: + raise RuntimeError( + f"Unimplemented floating-poing literal suffix for type {refTy.typeName} of typeWidth {refTy.typeWidth}") + + def formatFloat(x: float, suffix: str = "") -> str: + if np.isinf(x) or np.isnan(x): + return str(x) + else: + return str(x) + suffix + + list_str = ",".join(formatFloat(x) for x in values) + elif issubclass(refTy, IntegerImmediate): + suffix = "u" if refTy.typeMin >= 0 else "" + suffix += "l" if refTy.typeWidth >= 64 else "" + list_str = ",".join(str(int(x)) + suffix for x in values) + else: + list_str = ",".join(str(x) for x in values) + + # WIESEP: Arrays have to be 4 byte aligned (at least in banshee) + total_bytes = (values.size * refTy.typeWidth) // 8 + pad_bytes = (-total_bytes) % 4 + if pad_bytes: + paddingElements = (pad_bytes * 8 + refTy.typeWidth - 1) // refTy.typeWidth + list_str 
+= ", " + (", ").join("0" for _ in range(paddingElements)) + + return f"{refTy.typeName} {name}[] = {{ {list_str} }};\n" + + def generateTestInputsHeader(deployer: NetworkDeployer, test_inputs: List) -> str: vectors = [] retStr = "" @@ -44,69 +86,44 @@ def generateTestInputsHeader(deployer: NetworkDeployer, test_inputs: List) -> st if not deployer.ctxt.is_buffer(bufferName): continue - values = _shapeBroadcast(deployer.ctxt, values, bufferName) - buffer = deployer.ctxt.lookup(bufferName) - typeName = buffer._type.referencedType.typeName - typeWidth = buffer._type.referencedType.typeWidth + assert isinstance(buffer, VariableBuffer) + + bufferSize = math.prod(buffer.shape) + valuesSize = math.prod(values.shape) + assert bufferSize % valuesSize == 0, \ + f"Values shape {values.shape} of size {valuesSize} cannot be repeated into buffer of shape {buffer.shape} and size {bufferSize}." + repeat = bufferSize // valuesSize + values = np.tile(values, repeat) vectorName = f"testInputVector{index}" + retStr += generateArray(vectorName, buffer, values) vectors.append(vectorName) - retStr += f"{typeName} {vectorName}[] =" - retStr += "{" - if typeName == 'float32_t': - list_str = (", ").join([f'{x}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values]) - else: - list_str = (", ").join([str(x) for x in values]) - - # WIESEP: Arrays have to be 4 byte aligned (at least in banshee) - total_bytes = (values.size * typeWidth) // 8 - pad_bytes = (-total_bytes) % 4 - if pad_bytes: - paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth - list_str += ", " + (", ").join("0" for _ in range(paddingElements)) - - retStr += list_str - retStr += "};\n" - retStr += f"void* testInputVector[{len(vectors)}] = {{" - retStr += ", ".join(vectors) + retStr += ",".join(vectors) retStr += "};\n" return retStr def generateTestOutputsHeader(deployer: NetworkDeployer, test_outputs: List[np.ndarray]) -> str: + vectors = [] retStr = "" for index, values in enumerate(test_outputs): - 
typeName = deployer.ctxt.lookup(f'output_{index}')._type.referencedType.typeName - typeWidth = deployer.ctxt.lookup(f'output_{index}')._type.referencedType.typeWidth + buffer = deployer.ctxt.lookup(f"output_{index}") + assert isinstance(buffer, VariableBuffer) + refTy = buffer._type.referencedType - retStr += f"#define OUTPUTTYPE {typeName}\n" - retStr += f"#define ISOUTPUTFLOAT {int(typeName == 'float32_t')}\n" - retStr += f"{typeName} testOutputVector{index}[] =" - retStr += "{" + retStr += f"#define OUTPUTTYPE {refTy.typeName}\n" + retStr += f"#define ISOUTPUTFLOAT {int(refTy.typeName == 'float32_t')}\n" - values = values.flatten() - - if typeName == "float32_t": - list_str = (", ").join([f'{x}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values]) - else: - list_str = (", ").join([str(x) for x in values]) - - # WIESEP: Arrays have to be 4 byte aligned (at least in banshee) - total_bytes = (len(values) * typeWidth) // 8 - pad_bytes = (-total_bytes) % 4 - if pad_bytes: - paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth - list_str += ", " + (", ").join("0" for _ in range(paddingElements)) - - retStr += list_str - retStr += "};\n" + vectorName = f"testOutputVector{index}" + retStr += generateArray(vectorName, buffer, values) + vectors.append(vectorName) - retStr += f"void* testOutputVector[{len(test_outputs)}] = " + "{" - retStr += ", ".join([f"testOutputVector{idx}" for idx, _ in enumerate(test_outputs)]) + retStr += f"void* testOutputVector[{len(vectors)}] = {{" + retStr += ",".join(vectors) retStr += "};\n" return retStr diff --git a/DeeployTest/testUtils/dmaUtils.py b/DeeployTest/testUtils/dmaUtils.py index 3266ce5129..ec317c3b1c 100644 --- a/DeeployTest/testUtils/dmaUtils.py +++ b/DeeployTest/testUtils/dmaUtils.py @@ -10,8 +10,8 @@ from Deeploy.AbstractDataTypes import BaseType, Pointer, PointerClass from Deeploy.CommonExtensions.DataTypes import minimalIntegerType -from Deeploy.DeeployTypes import NetworkContext, NetworkDeployer, 
NodeParser, NodeTemplate, NodeTypeChecker, \ - ONNXLayer, OperatorRepresentation, VariableBuffer +from Deeploy.DeeployTypes import NetworkContext, NetworkDeployer, NodeParser, NodeTemplate, ONNXLayer, \ + OperatorRepresentation, VariableBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper, \ MemoryPlatformWrapper @@ -35,28 +35,6 @@ """) -# Same interface as NodeTypeChecker but allow any input type and the -# output type matches the input type. -class MemcpyTypeChecker(NodeTypeChecker): - - def __init__(self): - super().__init__([], []) - - def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node, - operatorRepresentation: OperatorRepresentation) -> NetworkContext: - assert len(node.inputs) == 1 and len(node.outputs) == 1 - buffer_in = ctxt.lookup(node.inputs[0].name) - ctxt.annotateType(node.outputs[0].name, buffer_in._type) - return ctxt - - def typeCheckNodeInputs(self, ctxt: NetworkContext, node: gs.Node) -> bool: - return True - - def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: - # Whatever it has already annotated, it's good - return ctxt - - class MemcpyTileConstraint(TileConstraint): @classmethod