Commit 7fd66d2

hsharma35 authored and meta-codesync[bot] committed
Migrate generic cadence operators to oss. (#16023)
Summary:
Pull Request resolved: #16023

* op_avg_pool2d
* op_softmax
* op_conv*d

Reviewed By: mcremon-meta
Differential Revision: D88081853
1 parent 907a468 commit 7fd66d2

File tree

11 files changed: +896, -12 lines

backends/cadence/generic/operators/op_avg_pool2d.cpp: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/generic/operators/op_avg_pool2d.h>

#include <algorithm>
#include <cmath>

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::optional;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::getLeadingDims;
using ::executorch::runtime::KernelRuntimeContext;

// Compute the avg_pool2d for in_data in NCHW layout. IT is the input datatype,
// and AT is the accumulation datatype. 'quantized' is true when the input is
// quantized tensor.
template <typename IT, typename AT = IT, bool quantized = false>
void avg_pool2d_nchw(
    const IT* __restrict__ in_data,
    const int32_t in_zero_point,
    IT* __restrict__ out_data,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    bool count_include_pad,
    int64_t divisor,
    int leading_dims,
    int ih,
    int iw,
    int oh,
    int ow) {
  int kh = kernel_size[0];
  int kw = kernel_size[1];
  int s0 = stride[0];
  int s1 = stride[1];
  int p0 = padding[0];
  int p1 = padding[1];

  for (int _n = 0; _n < leading_dims; ++_n) {
    for (int _ih = 0, _oh = 0; _oh < oh; ++_oh, _ih += s0) {
      int input_offset = _n * ih * iw;
      int output_offset = _n * oh * ow + _oh * ow;
      for (int _iw = 0, _ow = 0; _ow < ow; ++_ow, _iw += s1) {
        int kh_lo = std::max(0, _ih - p0);
        int kh_hi = std::min(ih, _ih + kh - p0);
        int kw_lo = std::max(0, _iw - p1);
        int kw_hi = std::min(iw, _iw + kw - p1);
        // Count the number of contributions sans padding
        int count = (kh_hi - kh_lo) * (kw_hi - kw_lo);
        // Set the accumulator
        AT acc = count_include_pad ? in_zero_point * (kh * kw - count) : 0;
        // Accumulate values
        for (int _kh = kh_lo; _kh < kh_hi; ++_kh) {
          for (int _kw = kw_lo; _kw < kw_hi; ++_kw) {
            int input_addr = input_offset + _kh * iw + _kw;
            acc += in_data[input_addr];
          }
        }
        // The divisor changes depending on whether the count includes
        // padded cells or not.
        float inv_divisor = 1. / (count_include_pad ? divisor : count);
        float val = acc * inv_divisor;
        if (quantized) {
          int32_t min_val =
              static_cast<int32_t>(std::numeric_limits<IT>::min());
          int32_t max_val =
              static_cast<int32_t>(std::numeric_limits<IT>::max());
          out_data[output_offset + _ow] = std::min(
              std::max(int32_t(std::nearbyint(val)), min_val), max_val);
        } else {
          out_data[output_offset + _ow] = val;
        }
      }
    }
  }
}

Tensor& avg_pool2d_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& input,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    bool ceil_mode,
    bool count_include_pad,
    optional<int64_t> divisor_override,
    const optional<Tensor>& in_zero_point_t,
    bool channel_last,
    Tensor& out) {
  ET_DCHECK_MSG(!channel_last, "NHWC layout for avg_pool2d not yet supported");
  const int32_t in_zero_point = in_zero_point_t.has_value()
      ? in_zero_point_t.value().const_data_ptr<int32_t>()[0]
      : 0;
  const int64_t divisor = divisor_override.has_value()
      ? divisor_override.value()
      : kernel_size[0] * kernel_size[1];

  const int odim = out.dim();
  const int on = getLeadingDims(out, odim - 2);
  const int oh = out.size(odim - 2);
  const int ow = out.size(odim - 1);
  const int ih = input.size(odim - 2);
  const int iw = input.size(odim - 1);

  // We generate the kernel for float and uint8_t types. The operator also
  // works for double, but does not support other dtypes.
#define typed_avg_pool2d(btype, ctype, quantized, dtype) \
  case ScalarType::dtype: {                              \
    avg_pool2d_nchw<btype, ctype, quantized>(            \
        input.const_data_ptr<btype>(),                   \
        in_zero_point,                                   \
        out.mutable_data_ptr<btype>(),                   \
        kernel_size,                                     \
        stride,                                          \
        padding,                                         \
        count_include_pad,                               \
        divisor,                                         \
        on,                                              \
        ih,                                              \
        iw,                                              \
        oh,                                              \
        ow);                                             \
    break;                                               \
  }

  ScalarType dtype = input.scalar_type();
  switch (dtype) {
    typed_avg_pool2d(float, float, false, Float);
    typed_avg_pool2d(uint8_t, int32_t, true, Byte);
    default:
      ET_DCHECK_MSG(
          false,
          "avg_pool2d not implemented for dtype %s",
          torch::executor::toString(dtype));
  }

  return out;
}

} // namespace native
} // namespace generic
} // namespace impl
backends/cadence/generic/operators/op_avg_pool2d.h: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& avg_pool2d_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& input,
    ::executorch::aten::IntArrayRef kernel_size,
    ::executorch::aten::IntArrayRef stride,
    ::executorch::aten::IntArrayRef padding,
    bool ceil_mode,
    bool count_include_pad,
    ::executorch::aten::optional<int64_t> divisor_override,
    const ::executorch::aten::optional<::executorch::aten::Tensor>&
        in_zero_point_t,
    bool channel_last,
    ::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl
backends/cadence/generic/operators/op_conv1d.cpp: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/generic/operators/op_conv1d.h>

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// This implements a generic 1D float32 convolution kernel.
// The input is of shape [n x c x w] (batch x channels x width)
// The weight is of shape [oc x wc x ww] (out_channels x weight_channels x
// weight_width) The output is of shape [n x oc x ow] (batch x out_channels x
// out_width) The bias is of shape [oc]

Tensor& conv1d_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& input,
    const Tensor& weight,
    const Tensor& bias,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    int64_t groups,
    Tensor& out) {
  // Extract dimensions
  const int n = input.size(0);
  const int c = input.size(1);
  const int w = input.size(2);
  const int oc = weight.size(0);
  const int wc = weight.size(1);
  const int ww = weight.size(2);
  const int ow = out.size(2);

  const int16_t s = static_cast<int16_t>(stride[0]);
  const int16_t p = static_cast<int16_t>(padding[0]);
  const int16_t d = static_cast<int16_t>(dilation[0]);
  const int16_t g = static_cast<int16_t>(groups);

  const float* p_in = input.const_data_ptr<float>();
  const float* p_weight = weight.const_data_ptr<float>();
  const float* p_bias = bias.const_data_ptr<float>();
  float* p_out = out.mutable_data_ptr<float>();

  const bool zero_pad_unit_dilation = d == 1 && p == 0;
  const int ocpg = oc / g;
  const int icpg = c / g;

  for (int _n = 0; _n < n; ++_n) {
    const float* in_batch = p_in + _n * c * w;
    float* out_batch = p_out + _n * oc * ow;
    for (int _g = 0; _g < g; ++_g) {
      int sic = _g * icpg;
      int soc = _g * ocpg;
      for (int _oc = soc; _oc < soc + ocpg; ++_oc) {
        float* out_plane = out_batch + _oc * ow;
        const float* weight_batch = p_weight + _oc * wc * ww;
        for (int _w = 0, _ow = 0; _ow < ow; _w += s, ++_ow) {
          float acc = p_bias[_oc];
          if (zero_pad_unit_dilation) {
            for (int _ic = sic; _ic < sic + icpg; ++_ic) {
              const float* in_plane = in_batch + _ic * w;
              const float* weight_plane = weight_batch + (_ic - sic) * ww;
              for (int _ww = 0; _ww < ww; ++_ww) {
                int ioff = _w + _ww;
                acc += in_plane[ioff] * weight_plane[_ww];
              }
            }
          } else {
            for (int _ic = sic; _ic < sic + icpg; ++_ic) {
              const float* in_plane = in_batch + _ic * w;
              const float* weight_plane = weight_batch + (_ic - sic) * ww;
              for (int _ww = 0; _ww < ww; ++_ww) {
                int w_pos = _w + d * _ww - p;
                if (w_pos >= 0 && w_pos < w) {
                  acc += in_plane[w_pos] * weight_plane[_ww];
                }
              }
            }
          }
          out_plane[_ow] = acc;
        }
      }
    }
  }

  return out;
}

} // namespace native
} // namespace generic
} // namespace impl
backends/cadence/generic/operators/op_conv1d.h: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

Tensor& conv1d_out(
    KernelRuntimeContext& ctx,
    const Tensor& input,
    const Tensor& weight,
    const Tensor& bias,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    int64_t groups,
    Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl
