Skip to content

Commit 481b70e

Browse files
authored
Gpu type 3 (#517)
* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft * Removed the CPU fseries computation (only used for benchmarking; no longer needed). * Added complex arithmetic support for cuda_complex type * Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic * integrated flipwind on type 1-2 on GPU * Minor fixes on the GPU code: - removed memory leaks in case of errors - renamed maxbatchsize to batchsize - renamed the fseries and nuft to match CPU code
1 parent e77225d commit 481b70e

38 files changed

+1764
-733
lines changed

CHANGELOG

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,15 @@ List of features / changes made / release notes, in reverse chronological order.
22
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
33

44
Master (9/10/24)
5-
65
* reduced roundoff error in a[n] phase calc in CPU onedim_fseries_kernel().
76
#534 (Barnett).
7+
* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517).
8+
- Removed the CPU fseries computation (only used for benchmarking; no longer needed).
9+
- Added complex arithmetic support for cuda_complex type
10+
- Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic
11+
- Minor fixes on the GPU code:
12+
a) removed memory leaks in case of errors
13+
b) renamed maxbatchsize to batchsize
814

915
V 2.3.0 (9/5/24)
1016

docs/devnotes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ Developer notes
5454

5555
* CMake compiling on linux at Flatiron Institute (Rusty cluster): We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3`` otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.
5656

57+
* Note to nvcc developers: compiling with debug symbols causes a stack overflow that is detected neither at compile time nor at runtime. The problem only appears for ns>=10 with dim=3; for ns<10 or dim<3 one can use ``-G`` and debug the code with cuda-gdb. The way to avoid it is not to use debug symbols; using ``--generate-line-info`` instead might work (not tested). As a side note, the compute-sanitizer tools do not detect the issue.
58+
5759
* Testing cufinufft (for FI, mostly):
5860

5961
.. code-block:: sh

include/cufinufft/common.h

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,31 +7,37 @@
77
#include <finufft_errors.h>
88
#include <finufft_spread_opts.h>
99

10-
#include <complex.h>
10+
#include <complex>
1111

1212
namespace cufinufft {
1313
namespace common {
1414
template<typename T>
15-
__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f,
16-
cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2,
15+
__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a,
16+
T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3,
17+
int ns);
18+
template<typename T>
19+
__global__ void cu_nuft_kernel_compute(int nf1, int nf2, int nf3, T *f, T *z, T *kx,
20+
T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2,
1721
T *fwkerhalf3, int ns);
1822
template<typename T>
19-
int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f,
20-
cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2,
21-
T *d_fwkerhalf3, int ns, cudaStream_t stream);
23+
int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_phase,
24+
T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns,
25+
cudaStream_t stream);
26+
template<typename T>
27+
int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_z, T *d_kx,
28+
T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2,
29+
T *d_fwkerhalf3, int ns, cudaStream_t stream);
2230
template<typename T>
2331
int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts);
2432

2533
void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts,
2634
CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b);
35+
2736
template<typename T>
28-
void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts);
29-
template<typename T>
30-
void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
37+
void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
3138
finufft_spread_opts opts);
3239
template<typename T>
33-
void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
34-
T *fwkerhalf, finufft_spread_opts opts);
40+
void onedim_nuft_kernel_precomp(T *f, T *zout, finufft_spread_opts opts);
3541

3642
template<typename T>
3743
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
#ifndef FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
2+
#define FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
3+
4+
#include <cuComplex.h>
5+
6+
// This header provides some helper functions for cuComplex types.
7+
// It mainly wraps existing CUDA implementations to provide operator overloads
8+
// e.g. cuAdd, cuSub, cuMul, cuDiv, cuCreal, cuCimag, cuCabs, cuCarg, cuConj are all
9+
// provided by CUDA
10+
11+
// Addition for cuDoubleComplex (double) with cuDoubleComplex (double)
12+
__host__ __device__ __forceinline__ cuDoubleComplex operator+(
13+
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
14+
return cuCadd(a, b);
15+
}
16+
17+
// Subtraction for cuDoubleComplex (double) with cuDoubleComplex (double)
18+
__host__ __device__ __forceinline__ cuDoubleComplex operator-(
19+
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
20+
return cuCsub(a, b);
21+
}
22+
23+
// Multiplication for cuDoubleComplex (double) with cuDoubleComplex (double)
24+
__host__ __device__ __forceinline__ cuDoubleComplex operator*(
25+
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
26+
return cuCmul(a, b);
27+
}
28+
29+
// Division for cuDoubleComplex (double) with cuDoubleComplex (double)
30+
__host__ __device__ __forceinline__ cuDoubleComplex operator/(
31+
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
32+
return cuCdiv(a, b);
33+
}
34+
35+
// Equality for cuDoubleComplex (double) with cuDoubleComplex (double)
36+
__host__ __device__ __forceinline__ bool operator==(const cuDoubleComplex &a,
37+
const cuDoubleComplex &b) noexcept {
38+
return cuCreal(a) == cuCreal(b) && cuCimag(a) == cuCimag(b);
39+
}
40+
41+
// Inequality for cuDoubleComplex (double) with cuDoubleComplex (double)
42+
__host__ __device__ __forceinline__ bool operator!=(const cuDoubleComplex &a,
43+
const cuDoubleComplex &b) noexcept {
44+
return !(a == b);
45+
}
46+
47+
// Addition for cuDoubleComplex (double) with double
48+
__host__ __device__ __forceinline__ cuDoubleComplex operator+(const cuDoubleComplex &a,
49+
double b) noexcept {
50+
return make_cuDoubleComplex(cuCreal(a) + b, cuCimag(a));
51+
}
52+
53+
__host__ __device__ __forceinline__ cuDoubleComplex operator+(
54+
double a, const cuDoubleComplex &b) noexcept {
55+
return make_cuDoubleComplex(a + cuCreal(b), cuCimag(b));
56+
}
57+
58+
// Subtraction for cuDoubleComplex (double) with double
59+
__host__ __device__ __forceinline__ cuDoubleComplex operator-(const cuDoubleComplex &a,
60+
double b) noexcept {
61+
return make_cuDoubleComplex(cuCreal(a) - b, cuCimag(a));
62+
}
63+
64+
__host__ __device__ __forceinline__ cuDoubleComplex operator-(
65+
double a, const cuDoubleComplex &b) noexcept {
66+
return make_cuDoubleComplex(a - cuCreal(b), -cuCimag(b));
67+
}
68+
69+
// Multiplication for cuDoubleComplex (double) with double
70+
__host__ __device__ __forceinline__ cuDoubleComplex operator*(const cuDoubleComplex &a,
71+
double b) noexcept {
72+
return make_cuDoubleComplex(cuCreal(a) * b, cuCimag(a) * b);
73+
}
74+
75+
__host__ __device__ __forceinline__ cuDoubleComplex operator*(
76+
double a, const cuDoubleComplex &b) noexcept {
77+
return make_cuDoubleComplex(a * cuCreal(b), a * cuCimag(b));
78+
}
79+
80+
// Division for cuDoubleComplex (double) with double
81+
__host__ __device__ __forceinline__ cuDoubleComplex operator/(const cuDoubleComplex &a,
82+
double b) noexcept {
83+
return make_cuDoubleComplex(cuCreal(a) / b, cuCimag(a) / b);
84+
}
85+
86+
__host__ __device__ __forceinline__ cuDoubleComplex operator/(
87+
double a, const cuDoubleComplex &b) noexcept {
88+
double denom = cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b);
89+
return make_cuDoubleComplex((a * cuCreal(b)) / denom, (-a * cuCimag(b)) / denom);
90+
}
91+
92+
// Addition for cuFloatComplex (float) with cuFloatComplex (float)
93+
__host__ __device__ __forceinline__ cuFloatComplex operator+(
94+
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
95+
return cuCaddf(a, b);
96+
}
97+
98+
// Subtraction for cuFloatComplex (float) with cuFloatComplex (float)
99+
__host__ __device__ __forceinline__ cuFloatComplex operator-(
100+
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
101+
return cuCsubf(a, b);
102+
}
103+
104+
// Multiplication for cuFloatComplex (float) with cuFloatComplex (float)
105+
__host__ __device__ __forceinline__ cuFloatComplex operator*(
106+
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
107+
return cuCmulf(a, b);
108+
}
109+
110+
// Division for cuFloatComplex (float) with cuFloatComplex (float)
111+
__host__ __device__ __forceinline__ cuFloatComplex operator/(
112+
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
113+
return cuCdivf(a, b);
114+
}
115+
116+
// Equality for cuFloatComplex (float) with cuFloatComplex (float)
117+
__host__ __device__ __forceinline__ bool operator==(const cuFloatComplex &a,
118+
const cuFloatComplex &b) noexcept {
119+
return cuCrealf(a) == cuCrealf(b) && cuCimagf(a) == cuCimagf(b);
120+
}
121+
122+
// Inequality for cuFloatComplex (float) with cuFloatComplex (float)
123+
__host__ __device__ __forceinline__ bool operator!=(const cuFloatComplex &a,
124+
const cuFloatComplex &b) noexcept {
125+
return !(a == b);
126+
}
127+
128+
// Addition for cuFloatComplex (float) with float
129+
__host__ __device__ __forceinline__ cuFloatComplex operator+(const cuFloatComplex &a,
130+
float b) noexcept {
131+
return make_cuFloatComplex(cuCrealf(a) + b, cuCimagf(a));
132+
}
133+
134+
__host__ __device__ __forceinline__ cuFloatComplex operator+(
135+
float a, const cuFloatComplex &b) noexcept {
136+
return make_cuFloatComplex(a + cuCrealf(b), cuCimagf(b));
137+
}
138+
139+
// Subtraction for cuFloatComplex (float) with float
140+
__host__ __device__ __forceinline__ cuFloatComplex operator-(const cuFloatComplex &a,
141+
float b) noexcept {
142+
return make_cuFloatComplex(cuCrealf(a) - b, cuCimagf(a));
143+
}
144+
145+
__host__ __device__ __forceinline__ cuFloatComplex operator-(
146+
float a, const cuFloatComplex &b) noexcept {
147+
return make_cuFloatComplex(a - cuCrealf(b), -cuCimagf(b));
148+
}
149+
150+
// Multiplication for cuFloatComplex (float) with float
151+
__host__ __device__ __forceinline__ cuFloatComplex operator*(const cuFloatComplex &a,
152+
float b) noexcept {
153+
return make_cuFloatComplex(cuCrealf(a) * b, cuCimagf(a) * b);
154+
}
155+
156+
__host__ __device__ __forceinline__ cuFloatComplex operator*(
157+
float a, const cuFloatComplex &b) noexcept {
158+
return make_cuFloatComplex(a * cuCrealf(b), a * cuCimagf(b));
159+
}
160+
161+
// Division for cuFloatComplex (float) with float
162+
__host__ __device__ __forceinline__ cuFloatComplex operator/(const cuFloatComplex &a,
163+
float b) noexcept {
164+
return make_cuFloatComplex(cuCrealf(a) / b, cuCimagf(a) / b);
165+
}
166+
167+
__host__ __device__ __forceinline__ cuFloatComplex operator/(
168+
float a, const cuFloatComplex &b) noexcept {
169+
float denom = cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b);
170+
return make_cuFloatComplex((a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom);
171+
}
172+
173+
#endif // FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H

include/cufinufft/defs.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@
22
#define CUFINUFFT_DEFS_H
33

44
#include <limits>
5-
65
// constants needed within common
76
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
87
// common
9-
#define MAX_NSPREAD 16
8+
#define MAX_NSPREAD 16
109

1110
// max number of positive quadr nodes
12-
#define MAX_NQUAD 100
11+
#define MAX_NQUAD 100
12+
13+
// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3
14+
#define ARRAYWIDCEN_GROWFRAC 0.1
1315

1416
// FIXME: If cufft ever takes N > INT_MAX...
1517
constexpr int32_t MAX_NF = std::numeric_limits<int32_t>::max();

0 commit comments

Comments
 (0)