if (type == OpType::MyCustomOperator) {
    // Instantiate the backend-specific operator, copy its parameters over
    // from the serialized graph node, and register it with the network.
    auto customOp = Backend::createMyCustomOperator(name, workspace);
    customOp->setParam1(node.param1());
    customOp->setParam2(node.param2());
    network->addOperator(customOp);
}
//Calculate the size of each tile (assuming the maximum tile size //is the local memory capacity of the hardware). int maxTileSize = std::min(ReferenceBackend::SpadSize() / inputs0->getDataTypeSize(), inputs0->getShape().storageSize()); TensorShape tileShape( { 1, maxTileSize }, DataLayout::NC, ReferenceBackend::Alignment); // Assuming the data is 1D.
namespace ref {
// This is all existing code...
// Unique accelerator IDs: gem5-Aladdin uses these to dispatch each kernel
// invocation to the right simulated accelerator.
const unsigned kConvolutionHw = 0x0001;
const unsigned kInnerProductHw = 0x0002;
const unsigned kEltwiseOpHw = 0x0003;
const unsigned kBatchNormHw = 0x0004;
const unsigned kPoolingHw = 0x0005;

// Define our new scratchpads here. kSpadSize is set by
// ReferenceBackend::initGlobals(), which also allocates spad0/spad1.
int kSpadSize;
float* spad0;
float* spad1;

// Add a unique ID for our accelerator HW. This will be used to invoke the
// accelerator during simulation.
const unsigned kMyCustomOperatorHw = 0x0006;
}  // namespace ref
在头文件 core/backend.h 中用 extern 声明这些全局变量:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
namespace ref {
// This is all existing code...
// Declarations of the accelerator IDs defined in the backend .cpp file.
extern const unsigned kConvolutionHw;
extern const unsigned kInnerProductHw;
extern const unsigned kEltwiseOpHw;
extern const unsigned kBatchNormHw;
extern const unsigned kPoolingHw;

// Declare our two new global arrays and accelerator IDs here.
extern int kSpadSize;
extern float* spad0;
extern float* spad1;
extern const unsigned kMyCustomOperatorHw;
}  // namespace ref
并且在头文件中修改 init 函数, 对我们使用的 scratchpad (spad0/spad1) 进行初始化:
1 2 3 4 5 6 7 8 9 10 11 12
classReferenceBackend { staticintSpadSize(){ return ref::kSpadSize; } staticvoidinitGlobals(){ ref::kSpadSize = 32*1024; // Replace with your actual value. ref::spad0 = (float*) malloc_aligned(ref::kSpadSize); ref::spad1 = (float*) malloc_aligned(ref::kSpadSize); } staticvoidfreeGlobals(){ free(ref::spad0); free(ref::spad1); } }
for (int i = 0; i < input0.size(); i++) { Tensor* input0Tile = input0.getTileWithData(i); Tensor* input1Tile = input1.getTileWithData(i); Tensor* outputTile = output.getTileWithData(i);
// Get handles to the actual underlying data storage. This performs // a dynamic_cast to the specified data type, which we verified is // safe inside validate(). float* input0Data = input0Tile->data<float>(); float* input1Data = input1Tile->data<float>(); float* outputData = outputTile->data<float>(); int size = outputTile->getShape().size();
// Set up the TLB mappings. mapArrayToAccelerator( ref::kMyCustomOperatorHw, // The accelerator ID this TLB // mapping is for. "host_input0", // The name of the function argument in the // kernel function. input0Data, // The pointer to the data. size * sizeof(float) // The size of the TLB mapping ); mapArrayToAccelerator( ref::kMyCustomOperatorHw, "host_input1", input1Data, size * sizeof(float) ); mapArrayToAccelerator( ref::kMyCustomOperatorHw, "host_output", outputData, size * sizeof(float) );
// Wrap the call to elementwise_add with invokeKernel. invokeKernel( ref::kMyCustomOperatorHw, // our accelerator ID elementwise_add, // if not simulating, the function to call // All of the function call arguments. input0Data, input1Data, outputData, ref::spad0, ref::spad1, outputTile->getShape().size()); } // The results of the elementwise_add are stored in the tiled tensor. We // need to merge the data from the individual tiles back into a single // contiguous Tensor. flattenTiledTensor(tiledTensors[2], dynamic_cast<smaug::Tensor*>(outputs.at(kOutput))); }
voidfillTensorWithSequentialFloat32Data(Tensor* tensor){ float* data = tensor->data<float>(); for (int i = 0; i < tensor->getShape().size(); i++) { data[i] = i; } }
TEST_CASE_METHOD(SmaugTest, "MyCustomOperatorWithTiling", "[tiling]") {
    // With float32 elements, this will occupy 128KB, which should create four
    // tiles per tensor.
    TensorShape shape({ 8, 4096 }, DataLayout::NC);
    Tensor* input0 = new Tensor("tensor0", shape);
    Tensor* input1 = new Tensor("tensor1", shape);
    workspace()->addTensor(input0);
    workspace()->addTensor(input1);

    // Create the operator and fill it with our tensors.
    using TestOp = MyCustomOperator<ReferenceBackend>;
    auto op = new TestOp("eltwise_add", workspace());
    op->setInput(input0, TestOp::kInput0);
    op->setInput(input1, TestOp::kInput1);
    // This will handle creating/allocating storage/filling data into all the
    // input tensors.
    createAndFillTensorsWithData<float>(op,
                                        &fillTensorWithSequentialFloat32Data);

    // Compute the expected output: both inputs hold i at index i, so the
    // elementwise sum is 2 * i.
    std::vector<float> expected_output(8 * 4096, 0);
    for (size_t i = 0; i < expected_output.size(); i++) {
        expected_output[i] = 2 * i;
    }
from smaug.core import node_pb2, types_pb2 from smaug.python.ops import common
def my_custom_operator(tensor_a, tensor_b, name):
    """Validate inputs for an elementwise-add operator node.

    Args:
        tensor_a: First input tensor.
        tensor_b: Second input tensor; must have the same shape as tensor_a.
        name: Name of the new operator node in the graph.

    Raises:
        ValueError: If the two input tensors do not have the same shape.
    """
    if tensor_a.shape.dims != tensor_b.shape.dims:
        raise ValueError("The input tensors to MyCustomOperator must be of the same shape")
但是, 我们发现 tracer 完全无法采样到我们 custom 的算子, 为什么呢? 前文提到过, 我们使用 C style 的 elementwise_add 正是为了让函数签名保持稳定, 但是 tracer 并不认识这个函数签名。因此我们必须在 make/kernel_functions.txt 中添加一行 elementwise_add, 并且重新编译 tracer。
这样就可以发现正确的输出:
1 2 3 4 5 6
Scheduling element (CustomOperator). dynamic_trace_acc0.gz: Starting to log at inst=0. dynamic_trace_acc0.gz: Stopping logging at inst10257. Scheduling element_1 (CustomOperator). dynamic_trace_acc0.gz: Starting to log at inst=0. dynamic_trace_acc0.gz: Stopping logging at inst10257.