
Commit 7990205

hsharma35 authored and meta-codesync[bot] committed
Migrate more generic cadence operators to oss.
Summary:
* op_fully_connected
* op_idma_copy
* op_idma_wait
* op_requantize
* op_transposed_convolution
* op_transposed_im2row

Differential Revision: D88084935
1 parent cef5af1 commit 7990205

13 files changed: +1,399 −22 lines
backends/cadence/generic/operators/op_fully_connected.cpp

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/generic/operators/op_fully_connected.h>

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::optional;
using ::executorch::aten::Tensor;
using ::executorch::runtime::getLeadingDims;
using ::executorch::runtime::KernelRuntimeContext;

void linear(
    const Tensor& input,
    const Tensor& weight,
    const optional<Tensor>& bias,
    Tensor& output) {
  const float* __restrict__ input_data = input.const_data_ptr<float>();
  const float* __restrict__ weight_data = weight.const_data_ptr<float>();
  // Note: bias is assumed present; bias.value() is not checked here.
  const float* __restrict__ bias_data = bias.value().const_data_ptr<float>();
  float* __restrict__ output_data = output.mutable_data_ptr<float>();

  // input comes in shape [batch_size, in_dim]
  // weight comes in shape [out_dim, in_dim]
  // output comes in empty with shape [batch_size, out_dim]
  // Perform matrix multiply (M x N) x (N x P) => M x P
  int64_t M = weight.size(0); // = out_dim
  int64_t N = weight.size(1); // = in_dim

  // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the
  // leading dimension is d0 * d1 * ... * d_{N-2}
  int64_t leading_dims = getLeadingDims(input, input.dim() - 1);

  for (int i = 0; i < leading_dims; ++i) {
    for (int j = 0; j < M; ++j) {
      float sum = bias_data[j];
      for (int k = 0; k < N; ++k) {
        sum += input_data[i * N + k] * weight_data[j * N + k];
      }
      output_data[i * M + j] = sum;
    }
  }
}

Tensor& fully_connected_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& input,
    const Tensor& weight,
    const optional<Tensor>& bias,
    Tensor& output) {
  linear(input, weight, bias, output);
  return output;
}

} // namespace native
} // namespace generic
} // namespace impl
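
For reference, the loop nest in linear computes output[i][j] = bias[j] + dot(input_row_i, weight_row_j); because weight is stored row-major as [out_dim, in_dim], the kernel needs no transpose. A minimal standalone sketch of the same indexing on plain arrays (hypothetical values, no ExecuTorch types):

// Standalone sketch of the fully-connected loop nest above.
// Shapes (hypothetical): input [2, 3], weight [4, 3], bias [4], output [2, 4].
#include <cstdio>

int main() {
  const int batch = 2, in_dim = 3, out_dim = 4;
  float input[batch * in_dim] = {1, 2, 3, 4, 5, 6};
  float weight[out_dim * in_dim] = {
      1, 0, 0, // weight row j holds the weights of output feature j
      0, 1, 0,
      0, 0, 1,
      1, 1, 1};
  float bias[out_dim] = {0.5f, 0.5f, 0.5f, 0.5f};
  float output[batch * out_dim];

  for (int i = 0; i < batch; ++i) {
    for (int j = 0; j < out_dim; ++j) {
      float sum = bias[j];
      for (int k = 0; k < in_dim; ++k) {
        sum += input[i * in_dim + k] * weight[j * in_dim + k];
      }
      output[i * out_dim + j] = sum;
    }
  }
  // Prints: 1.5 2.5 3.5 6.5 (first input row dotted with each weight row, plus bias)
  std::printf("%g %g %g %g\n", output[0], output[1], output[2], output[3]);
  return 0;
}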
backends/cadence/generic/operators/op_fully_connected.h

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::optional;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

Tensor& fully_connected_out(
    KernelRuntimeContext& ctx,
    const Tensor& input,
    const Tensor& weight,
    const optional<Tensor>& bias,
    Tensor& output);

} // namespace native
} // namespace generic
} // namespace impl
backends/cadence/generic/operators/op_idma_copy.cpp

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/generic/operators/op_idma_copy.h>

#include <cstdint>
#include <cstring> // For std::memcpy

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// CPU implementation of idma_copy_out using std::memcpy.
// This function performs a direct memory copy between tensors.
Tensor& idma_copy_out(
    KernelRuntimeContext& ctx,
    const Tensor& src,
    const int64_t
        task_num, // Unused in CPU implementation but kept for API compatibility
    const int64_t
        channel, // Unused in CPU implementation but kept for API compatibility
    Tensor& out) {
  ET_KERNEL_CHECK(
      ctx,
      src.dtype() == out.dtype() && src.numel() == out.numel(),
      InvalidArgument,
      out);

  // Use std::memcpy for direct memory copy
  std::memcpy(
      out.mutable_data_ptr<uint8_t>(),
      src.const_data_ptr<uint8_t>(),
      out.nbytes());

  return out;
}

} // namespace native
} // namespace generic
} // namespace impl
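
A note on the check above: because dtype and element count are verified to match, src.nbytes() equals out.nbytes(), so one memcpy of out.nbytes() bytes covers the whole tensor. A minimal standalone sketch of the same check-then-copy pattern (hypothetical helper on plain buffers, not the ExecuTorch API):

// Hypothetical standalone analogue of idma_copy_out: validate that both
// buffers describe the same number of bytes, then copy them in one shot.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

void copy_bytes(const uint8_t* src, size_t src_nbytes,
                uint8_t* dst, size_t dst_nbytes) {
  assert(src_nbytes == dst_nbytes); // mirrors the ET_KERNEL_CHECK
  std::memcpy(dst, src, dst_nbytes);
}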
backends/cadence/generic/operators/op_idma_copy.h

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& idma_copy_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& src,
    const int64_t task_num,
    const int64_t channel,
    ::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl
backends/cadence/generic/operators/op_idma_wait.cpp

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "executorch/backends/cadence/generic/operators/op_idma_wait.h"

#include <cstdint>

#include "executorch/runtime/core/exec_aten/exec_aten.h"
#include "executorch/runtime/core/exec_aten/util/tensor_util.h"
#include "executorch/runtime/kernel/kernel_runtime_context.h"

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// CPU implementation of idma_wait_out.
// Since there is no actual DMA operation in the CPU implementation, this is
// essentially a no-op: it only verifies that the output tensor aliases the
// input tensor (same element count, dtype, and data pointer), which already
// guarantees identical contents.
Tensor& idma_wait_out(
    KernelRuntimeContext& ctx,
    const Tensor& src,
    const int64_t
        task_num, // Unused in CPU implementation but kept for API compatibility
    Tensor& out) {
  ET_KERNEL_CHECK(ctx, src.numel() == out.numel(), InvalidArgument, out);
  ET_KERNEL_CHECK(ctx, src.dtype() == out.dtype(), InvalidArgument, out);
  ET_KERNEL_CHECK(
      ctx,
      src.const_data_ptr<uint8_t>() == out.const_data_ptr<uint8_t>(),
      InvalidArgument,
      out);

  return out;
}

} // namespace native
} // namespace generic
} // namespace impl
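
As a usage sketch (hypothetical tensors ctx, src, and dst), the intended pairing is copy-then-wait; in this CPU build the copy in idma_copy_out is already synchronous, so idma_wait_out only validates that its output aliases the copied tensor:

// Hypothetical usage sketch: on device, idma_copy_out would enqueue an async
// iDMA transfer and idma_wait_out would block on task_num; on CPU the copy
// happens eagerly and the wait is a pure validation no-op.
Tensor& staged = idma_copy_out(ctx, src, /*task_num=*/0, /*channel=*/0, dst);
Tensor& done = idma_wait_out(ctx, staged, /*task_num=*/0, staged);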
backends/cadence/generic/operators/op_idma_wait.h

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& idma_wait_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& src,
    const int64_t task_num,
    ::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl

backends/cadence/generic/operators/op_requantize_out.cpp renamed to backends/cadence/generic/operators/op_requantize.cpp

Lines changed: 27 additions & 20 deletions
@@ -6,16 +6,26 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/backends/cadence/generic/kernels/kernels.h>
+#include <executorch/backends/cadence/generic/operators/op_requantize.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
+#include <executorch/backends/cadence/generic/kernels/kernels.h>
+#include <cstdint>
+#include <cstdlib>
+
 namespace impl {
 namespace generic {
 namespace native {
 
-using executorch::aten::ScalarType;
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::generic::kernels::dequantize;
+using ::impl::generic::kernels::quantize;
 
 // Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor.
 // The scale and zero_point for requantization are in the args.

@@ -86,15 +96,14 @@ Tensor& requantize_out(
       torch::executor::toString(out.scalar_type()),
       torch::executor::toString(out_dtype));
 
-#define typed_requantize(ctype, dtype)                                       \
-  const ctype* input_data = input.const_data_ptr<ctype>();                   \
-  dtype* out_data = out.mutable_data_ptr<dtype>();                           \
-  for (size_t i = 0; i < numel; ++i) {                                       \
-    float dequant =                                                          \
-        kernels::dequantize<ctype>(input_data[i], in_scale, in_zero_point);  \
-    out_data[i] =                                                            \
-        kernels::quantize<dtype>(dequant, 1 / out_scale, out_zero_point);    \
+#define typed_requantize(ctype, dtype)                                         \
+  const ctype* input_data = input.const_data_ptr<ctype>();                     \
+  dtype* out_data = out.mutable_data_ptr<dtype>();                             \
+  for (size_t i = 0; i < numel; ++i) {                                         \
+    float dequant = dequantize<ctype>(input_data[i], in_scale, in_zero_point); \
+    out_data[i] = quantize<dtype>(dequant, 1 / out_scale, out_zero_point);     \
   };
+
 #define typed_requantize_in(ctype) \
   switch (out_dtype) {             \
     case ScalarType::Byte: {       \

@@ -187,14 +196,12 @@ Tensor& requantize_per_tensor_out(
       torch::executor::toString(out.scalar_type()),
       torch::executor::toString(out_dtype));
 
-#define typed_requantize(ctype, dtype)                                       \
-  const ctype* input_data = input.const_data_ptr<ctype>();                   \
-  dtype* out_data = out.mutable_data_ptr<dtype>();                           \
-  for (size_t i = 0; i < numel; ++i) {                                       \
-    float dequant =                                                          \
-        kernels::dequantize<ctype>(input_data[i], in_scale, in_zero_point);  \
-    out_data[i] =                                                            \
-        kernels::quantize<dtype>(dequant, 1 / out_scale, out_zero_point);    \
+#define typed_requantize(ctype, dtype)                                         \
+  const ctype* input_data = input.const_data_ptr<ctype>();                     \
+  dtype* out_data = out.mutable_data_ptr<dtype>();                             \
+  for (size_t i = 0; i < numel; ++i) {                                         \
+    float dequant = dequantize<ctype>(input_data[i], in_scale, in_zero_point); \
+    out_data[i] = quantize<dtype>(dequant, 1 / out_scale, out_zero_point);     \
   };
 
 #define typed_requantize_in(ctype) \
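
On the arithmetic in the macro above: dequantize maps a quantized value back to real space and quantize maps it into the output domain, with 1 / out_scale passed in, presumably so the kernel can multiply rather than divide per element. A scalar sketch of that math follows; the dequant/quant helpers below are assumed reconstructions, not the actual kernels.h implementations, which may round or clamp differently:

// Scalar sketch of the requantize arithmetic (assumed helper definitions).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

float dequant(int8_t q, float scale, int32_t zero_point) {
  return (q - zero_point) * scale;
}

uint8_t quant(float x, float inv_scale, int32_t zero_point) {
  float q = std::round(x * inv_scale) + zero_point;
  return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
}

int main() {
  // Requantize int8 value 40 (scale 0.1, zero point 0) into the uint8
  // domain (scale 0.05, zero point 128):
  //   x = (40 - 0) * 0.1 = 4.0
  //   q = round(4.0 * (1 / 0.05)) + 128 = 80 + 128 = 208
  uint8_t out = quant(dequant(40, 0.1f, 0), 1.0f / 0.05f, 128);
  std::printf("%d\n", out); // prints 208
  return 0;
}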
backends/cadence/generic/operators/op_requantize.h

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& requantize_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& input,
    const ::executorch::aten::Tensor& in_scale_t,
    const ::executorch::aten::Tensor& in_zero_point_t,
    const ::executorch::aten::Tensor& out_scale_t,
    const ::executorch::aten::Tensor& out_zero_point_t,
    const ::executorch::aten::ScalarType out_dtype,
    ::executorch::aten::Tensor& out);

::executorch::aten::Tensor& requantize_per_tensor_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& input,
    double in_scale,
    int64_t in_zero_point,
    double out_scale,
    int64_t out_zero_point,
    const ::executorch::aten::ScalarType out_dtype,
    ::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl
