From e17fb59e541fa9fffc7860bf6eea9e36d147ef2f Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 6 Jul 2022 09:30:50 +0200
Subject: [PATCH 01/69] LOBPCG with GPU support (CUDA). Does not yet support
 preconditioning

---
 src/DFTK.jl                    |  2 ++
 src/eigen/lobpcg_hyper_impl.jl | 51 +++++++++++++++++++++-------------
 2 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 83e7b01686..1f600ec529 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -13,6 +13,8 @@ using spglib_jll
 using Unitful
 using UnitfulAtomic
 using ForwardDiff
+using GPUArrays
+using CUDA
 
 export Vec3
 export Mat3
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 205943017d..0e65424467 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -44,19 +44,24 @@ vprintln(args...) = nothing
 
 using LinearAlgebra
 using BlockArrays # used for the `mortar` command which makes block matrices
-
+using CUDA
+using GPUArrays
 # when X or Y are BlockArrays, this makes the return value be a proper array (not a BlockArray)
-function array_mul(X::AbstractArray{T}, Y) where T
-    Z = Array{T}(undef, size(X, 1), size(Y, 2))
+function array_mul(X::AbstractArray, Y::AbstractArray)
+    Z = similar(X, size(X, 1), size(Y, 2))
     mul!(Z, X, Y)
 end
 
 # Perform a Rayleigh-Ritz for the N first eigenvectors.
-@timing function rayleigh_ritz(X, AX, N)
+@timing function rayleigh_ritz(X::AbstractArray, AX::AbstractArray, N)
     F = eigen(Hermitian(array_mul(X', AX)))
     F.vectors[:,1:N], F.values[1:N]
 end
 
+@timing function rayleigh_ritz(X::CuArray, AX::CuArray, N)
+    vals, vects = CUDA.CUSOLVER.syevd!('V','U',X'AX)
+    vects[:,1:N], vals[1:N]
+end
 # B-orthogonalize X (in place) using only one B apply.
 # This uses an unstable method which is only OK if X is already
 # orthogonal (not B-orthogonal) and B is relatively well-conditioned
@@ -178,7 +183,9 @@ end
         # as can happen in extreme cases in the ortho!(cP, cX)
         dropped = drop!(X)
         if dropped != []
-            @views mul!(X[:, dropped], Y, BY' * (X[:, dropped]), -one(T), one(T)) # X -= Y*BY'X
+            Z = similar(X[:, dropped])
+            mul!(Z, Y, BY' * (X[:, dropped]), -one(T), one(T)) # X -= Y*BY'X
+            X[:, dropped] = Z# X -= Y*BY'X
         end
         if norm(BYX) < tol && niter > 1
             push!(ninners, 0)
@@ -213,13 +220,12 @@ end
 function final_retval(X, AX, resid_history, niter, n_matvec)
     λ = real(diag(X' * AX))
     residuals = AX .- X*Diagonal(λ)
-    (λ=λ, X=X,
+    (λ=Array(λ), X=Array(X),
      residual_norms=[norm(residuals[:, i]) for i in 1:size(residuals, 2)],
      residual_history=resid_history[:, 1:niter+1],
      n_matvec=n_matvec)
 end
 
-
 ### The algorithm is Xn+1 = rayleigh_ritz(hcat(Xn, A*Xn, Xn-Xn-1))
 ### We follow the strategy of Hetmaniuk and Lehoucq, and maintain a B-orthonormal basis Y = (X,R,P)
 ### After each rayleigh_ritz step, the B-orthonormal X and P are deduced by an orthogonal rotation from Y
@@ -230,12 +236,14 @@ end
                         miniter=1, ortho_tol=2eps(real(eltype(X))),
                         n_conv_check=nothing, display_progress=false)
     N, M = size(X)
+    typearray = typeof(X)
+
     # If N is too small, we will likely get in trouble
     error_message(verb) = "The eigenproblem is too small, and the iterative " *
-                           "eigensolver $verb fail; increase the number of " *
-                           "degrees of freedom, or use a dense eigensolver."
-    N > 3M    || error(error_message("will"))
-    N >= 3M+5 || @warn error_message("might")
+                            "eigensolver $verb fail; increase the number of " *
+                            "degrees of freedom, or use a dense eigensolver."
+     N > 3M    || error(error_message("will"))
+     N >= 3M+5 || @warn error_message("might")
 
     n_conv_check === nothing && (n_conv_check = M)
     resid_history = zeros(real(eltype(X)), M, maxiter+1)
@@ -271,6 +279,7 @@ end
     nlocked = 0
     niter = 0  # the first iteration is fake
     λs = @views [(X[:,n]'*AX[:,n]) / (X[:,n]'BX[:,n]) for n=1:M]
+    λs = oftype(X[:,1], λs) #Offload to GPU if needed
     new_X = X
     new_AX = AX
     new_BX = BX
@@ -286,13 +295,13 @@ end
 
             # Form Rayleigh-Ritz subspace
             if niter > 1
-                Y = mortar((X, R, P))
-                AY = mortar((AX, AR, AP))
-                BY = mortar((BX, BR, BP))  # data shared with (X, R, P) in non-general case
+                Y = hcat(X, R, P)
+                AY = hcat(AX, AR, AP)
+                BY = hcat(BX, BR, BP)  # data shared with (X, R, P) in non-general case
             else
-                Y  = mortar((X, R))
-                AY = mortar((AX, AR))
-                BY = mortar((BX, BR))  # data shared with (X, R) in non-general case
+                Y  = hcat(X, R)
+                AY = hcat(AX, AR)
+                BY = hcat(BX, BR)  # data shared with (X, R) in non-general case
             end
             cX, λs = rayleigh_ritz(Y, AY, M-nlocked)
 
@@ -360,6 +369,8 @@ end
             for i in 1:length(Xn_indices)
                 e[Xn_indices[i], i] = 1
             end
+            e = convert(typearray,e)
+
             cP = cX .- e
             cP = cP[:, Xn_indices]
             # orthogonalize against all Xn (including newly locked)
@@ -414,8 +425,8 @@ end
 
         # Orthogonalize R wrt all X, newly active P
         if niter > 0
-            Z  = mortar((full_X, P))
-            BZ = mortar((full_BX, BP))  # data shared with (full_X, P) in non-general case
+            Z  = hcat(full_X, P)
+            BZ = hcat(full_BX, BP) # data shared with (full_X, P) in non-general case
         else
             Z  = full_X
             BZ = full_BX
@@ -431,4 +442,4 @@ end
     end
 
     final_retval(full_X, full_AX, resid_history, maxiter, n_matvec)
-end
+end
\ No newline at end of file

From ed15b324a6f104fc1fa3498846d9939917689af3 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 18 Jul 2022 14:11:58 +0200
Subject: [PATCH 02/69] MWE for self_consistent_field with GPU support (CUDA).
 Only works with no SCF solver (solver=scf_damping_solver(1.0)) and just one
 Kinetic term.

---
 Project.toml                   |  3 +++
 src/DFTK.jl                    |  2 ++
 src/PlaneWaveBasis.jl          | 27 ++++++++++++++++-----------
 src/common/ortho.jl            |  3 ++-
 src/densities.jl               | 20 +++++++++++++++-----
 src/eigen/lobpcg_hyper_impl.jl | 13 +++++++++----
 src/eigen/preconditioners.jl   | 11 ++++++++++-
 src/fft.jl                     | 14 +++++++++-----
 src/guess_density.jl           | 25 +++++++++++++------------
 src/interpolation_transfer.jl  |  1 +
 src/orbitals.jl                |  7 +++++--
 src/terms/Hamiltonian.jl       |  5 ++---
 src/terms/kinetic.jl           | 11 ++++++-----
 13 files changed, 93 insertions(+), 49 deletions(-)

diff --git a/Project.toml b/Project.toml
index 49c74b6a93..4acbd9b6c9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,10 +8,12 @@ AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
 Brillouin = "23470ee3-d0df-4052-8b1a-8cbd6363e7f0"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 InteratomicPotentials = "a9efe35a-c65d-452d-b8a8-82646cd5cb04"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -31,6 +33,7 @@ Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
diff --git a/src/DFTK.jl b/src/DFTK.jl
index 1f600ec529..58f24b152f 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -13,8 +13,10 @@ using spglib_jll
 using Unitful
 using UnitfulAtomic
 using ForwardDiff
+using AbstractFFTs
 using GPUArrays
 using CUDA
+using Random
 
 export Vec3
 export Mat3
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 219f2315bf..ccfbc9ee32 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -67,8 +67,8 @@ struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
     G_to_r_normalization::T  # G_to_r = G_to_r_normalization * BFFT
 
     # "cubic" basis in reciprocal and real space, on which potentials and densities are stored
-    G_vectors::Array{Vec3{Int}, 3}
-    r_vectors::Array{Vec3{VT }, 3}
+    G_vectors::AbstractArray{Vec3{Int}, 3}
+    r_vectors::AbstractArray{Vec3{VT }, 3}
 
     ## MPI-local information of the kpoints this processor treats
     # Irreducible kpoints. In the case of collinear spin,
@@ -148,7 +148,7 @@ end
 # and are stored in PlaneWaveBasis for easy reconstruction.
 function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         kcoords, kweights, kgrid, kshift,
-                        symmetries_respect_rgrid, comm_kpts) where {T <: Real}
+                        symmetries_respect_rgrid, comm_kpts, array_type = Array) where {T <: Real}
     # Validate fft_size
     if variational
         max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
@@ -191,7 +191,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(T, fft_size)
+    G_vects = G_vectors(fft_size, array_type)
+    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(similar(G_vects,T), fft_size)
 
     # Normalization constants
     # r_to_G = r_to_G_normalization * FFT
@@ -255,15 +256,14 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
         r_to_G_normalization, G_to_r_normalization,
-        G_vectors(fft_size), r_vectors,
+        G_vects, r_vectors,
         kpoints, kweights_thisproc, kgrid, kshift,
         kcoords_global, kweights_global, comm_kpts, krange_thisproc, krange_allprocs,
         symmetries, symmetries_respect_rgrid, terms)
-
     # Instantiate the terms with the basis
     for (it, t) in enumerate(model.term_types)
         term_name = string(nameof(typeof(t)))
-        @timing "Instantiation $term_name" basis.terms[it] = t(basis)
+        @timing "Instantiation $term_name" basis.terms[it] = t(basis, array_type = array_type)
     end
     basis
 end
@@ -277,7 +277,7 @@ end
                                 variational=true, fft_size=nothing,
                                 kgrid=nothing, kshift=nothing,
                                 symmetries_respect_rgrid=isnothing(fft_size),
-                                comm_kpts=MPI.COMM_WORLD) where {T <: Real}
+                                comm_kpts=MPI.COMM_WORLD, array_type = Array) where {T <: Real}
     if isnothing(fft_size)
         @assert variational
         if symmetries_respect_rgrid
@@ -295,7 +295,7 @@ end
         fft_size = compute_fft_size(model, Ecut, kcoords; factors=factors)
     end
     PlaneWaveBasis(model, Ecut, fft_size, variational, kcoords, kweights,
-                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts)
+                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts, array_type)
 end
 
 @doc raw"""
@@ -317,12 +317,12 @@ end
 Creates a new basis identical to `basis`, but with a custom set of kpoints
 """
 @timing function PlaneWaveBasis(basis::PlaneWaveBasis, kcoords::AbstractVector,
-                                kweights::AbstractVector)
+                                kweights::AbstractVector; array_type = Array)
     kgrid = kshift = nothing
     PlaneWaveBasis(basis.model, basis.Ecut,
                    basis.fft_size, basis.variational,
                    kcoords, kweights, kgrid, kshift,
-                   basis.symmetries_respect_rgrid, basis.comm_kpts)
+                   basis.symmetries_respect_rgrid, basis.comm_kpts, array_type)
 end
 
 """
@@ -331,6 +331,11 @@ end
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
 """
+function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type::UnionAll)
+    # Build the G vectors with the CPU method, then convert them to `array_type` (e.g. a GPU array type).
+    return convert(array_type, G_vectors(fft_size))
+end
+
 function G_vectors(fft_size::Union{Tuple,AbstractVector})
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index a0f7508339..3936c5bb25 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,2 +1,3 @@
 # Orthonormalize
-ortho_qr(φk) = Matrix(qr(φk).Q)
+ortho_qr(φk::AbstractArray) = Matrix(qr(φk).Q) #LinearAlgebra.QRCompactWYQ -> Matrix
+ortho_qr(φk::CuArray) = CuArray(qr(φk).Q) #CUDA.CUSOLVER.CuQRPackedQ -> CuArray
diff --git a/src/densities.jl b/src/densities.jl
index ac2db6e835..66c0789ce2 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -33,17 +33,27 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
 
     @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length))
         Threads.@spawn for (ik, n) in chunk  # spawn a task per chunk
-            ψnk_real = ψnk_real_chunklocal[ichunk]
-            ρ_loc = ρ_chunklocal[ichunk]
-
             kpt = basis.kpoints[ik]
-            G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
-            ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real)
+            #TODO: is this the right way to got? Probably rewrite compute_density for GPUArrays
+            if typeof(basis.G_vectors) <:AbstractGPUArray
+                ψnk_real = similar(basis.G_vectors, complex(T), basis.fft_size)
+                G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
+                ρ_loc = ρ_chunklocal[ichunk]
+                ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* Array(abs2.(ψnk_real))
+            else
+                ψnk_real = ψnk_real_chunklocal[ichunk]
+                ρ_loc = ρ_chunklocal[ichunk]
+
+                G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
+                ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real)
+            end
         end
     end
 
     ρ = sum(ρ_chunklocal)
     mpi_sum!(ρ, basis.comm_kpts)
+    array_type = typeof(similar(basis.G_vectors,complex(T), size(ρ)))
+    ρ = convert(array_type, ρ)
     ρ = symmetrize_ρ(basis, ρ; do_lowpass=false)
 
     _check_positive(ρ)
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 0e65424467..9199c490b1 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -59,7 +59,12 @@ end
 end
 
 @timing function rayleigh_ritz(X::CuArray, AX::CuArray, N)
-    vals, vects = CUDA.CUSOLVER.syevd!('V','U',X'AX)
+    #TODO: this is wacky and should be changed
+    if eltype(X) == ComplexF32 || eltype(X) == ComplexF64
+        vals, vects = CUDA.CUSOLVER.heevd!('V','U',X'AX)
+    else
+        vals, vects = CUDA.CUSOLVER.syevd!('V','U',X'AX)
+    end
     vects[:,1:N], vals[1:N]
 end
 # B-orthogonalize X (in place) using only one B apply.
@@ -220,10 +225,10 @@ end
 function final_retval(X, AX, resid_history, niter, n_matvec)
     λ = real(diag(X' * AX))
     residuals = AX .- X*Diagonal(λ)
-    (λ=Array(λ), X=Array(X),
+    (λ=Array(λ), X=X,
      residual_norms=[norm(residuals[:, i]) for i in 1:size(residuals, 2)],
-     residual_history=resid_history[:, 1:niter+1],
-     n_matvec=n_matvec)
+     residual_history=resid_history[:, 1:niter+1], n_matvec=n_matvec)
+    #λ doesn't have to be on the GPU, but X does (ψ should always be on GPU throughout the code).
 end
 
 ### The algorithm is Xn+1 = rayleigh_ritz(hcat(Xn, A*Xn, Xn-Xn-1))
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index 4c26f39618..b35c4653eb 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -27,7 +27,7 @@ PreconditionerNone(basis, kpt) = I
 mutable struct PreconditionerTPA{T <: Real}
     basis::PlaneWaveBasis
     kpt::Kpoint
-    kin::Vector{T}  # kinetic energy of every G
+    kin::AbstractVector{T}  # kinetic energy of every G
     mean_kin::Union{Nothing, Vector{T}}  # mean kinetic energy of every band
     default_shift::T # if mean_kin is not set by `precondprep!`, this will be used for the shift
 end
@@ -37,6 +37,8 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     isempty(kinetic_term) && error("Preconditioner should be disabled when no Kinetic term is used.")
     scaling = only(kinetic_term).scaling_factor
     kin = Vector{T}([scaling * sum(abs2, q) for q in Gplusk_vectors_cart(basis, kpt)] ./ 2)
+    array_type = typeof(similar(basis.G_vectors, T, size(kin)))
+    kin = convert(array_type, kin)
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
 
@@ -68,4 +70,11 @@ end
 
 function precondprep!(P::PreconditionerTPA, X)
     P.mean_kin = [real(dot(x, Diagonal(P.kin), x)) for x in eachcol(X)]
+    array_type = typeof(similar(X, real(eltype(X)), size(P.mean_kin)))  # same backend as X, but real: mean_kin holds real(...) values
+    P.mean_kin = convert(array_type, P.mean_kin)  # NOTE(review): the field is declared Union{Nothing, Vector{T}}, so setproperty! presumably converts this back to a CPU Vector — confirm
 end
+
+#TODO: remove this once it is implemented in GPUArrays
+import LinearAlgebra.dot
+using GPUArrays
+LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal,y::AbstractGPUArray) = x'*(D*y)
\ No newline at end of file
diff --git a/src/fft.jl b/src/fft.jl
index 8d2b73105d..7b9a5c289a 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -1,5 +1,7 @@
 import FFTW
-
+import CUDA
+import GPUArrays
+import AbstractFFTs
 #
 # Perform (i)FFTs.
 #
@@ -253,14 +255,16 @@ _fftw_flags(::Type{Float64}) = FFTW.MEASURE
 Plan a FFT of type `T` and size `fft_size`, spending some time on finding an
 optimal algorithm. (Inplace, out-of-place) x (forward, backward) FFT plans are returned.
 """
-function build_fft_plans(T::Union{Type{Float32}, Type{Float64}}, fft_size)
-    tmp = Array{Complex{T}}(undef, fft_size...)
-    ipFFT = FFTW.plan_fft!(tmp, flags=_fftw_flags(T))
-    opFFT = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
+# The FFTW planner flags were removed as CUDA's plan_fft doesn't need flags; `array_type` is an array *instance* used only to pick the array backend and the real type `T`. If this is a performance issue, we should check array_type's type then call either FFTW.plan_fft(tmp, flags = ...) or CUDA.plan_fft(tmp)
+function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{Float32,Float64}}
+    tmp = similar(array_type, Complex{T}, fft_size...)  # buffer eltype follows T (was hard-coded to Complex{Float64}, ignoring Float32)
+    ipFFT = AbstractFFTs.plan_fft!(tmp)
+    opFFT = AbstractFFTs.plan_fft(tmp)
     # backward by inverting and stripping off normalizations
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
 
+LinearAlgebra.mul!(Y::AbstractGPUArray, p::AbstractFFTs.Plan, X::AbstractGPUArray) = Y .= p *X
 
 # TODO Some grid sizes are broken in the generic FFT implementation
 # in FourierTransforms, for more details see workarounds/fft_generic.jl
diff --git a/src/guess_density.jl b/src/guess_density.jl
index d203b473c9..3a2d271c5a 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -67,7 +67,8 @@ function _guess_spin_density(basis::PlaneWaveBasis{T}, atoms, positions, magneti
         @warn("Returning zero spin density guess, because no initial magnetization has " *
               "been specified in any of the given elements / atoms. Your SCF will likely " *
               "not converge to a spin-broken solution.")
-        return zeros(T, basis.fft_size)
+        array_type = typeof(similar(basis.G_vectors, T, basis.fft_size))
+        return convert(array_type,zeros(T, basis.fft_size))
     end
 
     @assert length(magnetic_moments) == length(atoms) == length(positions)
@@ -93,23 +94,23 @@ which follow the functional form
 and are placed at `position` (in fractional coordinates).
 """
 function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
-    ρ = zeros(complex(T), basis.fft_size)
-    isempty(gaussians) && return G_to_r(basis, ρ)
-
-    # Fill ρ with the (unnormalized) Fourier transform, i.e. ∫ e^{-iGx} f(x) dx,
-    # where f(x) is a weighted gaussian
-    #
-    # is formed from a superposition of atomic densities, each scaled by a prefactor
-    for (iG, G) in enumerate(G_vectors(basis))
-        Gsq = sum(abs2, basis.model.recip_lattice * G)
+    ρ = deepcopy(basis.G_vectors)
+    #These copies are required so that recip_lattice and gaussians are isbits (GPU compatibility)
+    recip_lattice = basis.model.recip_lattice
+    gaussians = SVector{size(gaussians)[1]}(gaussians)
+    function build_ρ(G)
+        Gsq = sum(abs2, recip_lattice * G)
+        res = zero(complex(T))
         for (coeff, decay_length, r) in gaussians
             form_factor::T = exp(-Gsq * T(decay_length)^2)
-            ρ[iG] += T(coeff) * form_factor * cis2pi(-dot(G, r))
+            res += T(coeff) * form_factor* cis2pi(-dot(G, r))
         end
+        res
     end
+    ρ = map(build_ρ, ρ)/ sqrt(basis.model.unit_cell_volume) #Can't use map! as we are converting an array of Vec3 to an array of complex
 
     # projection in the normalized plane wave basis
-    G_to_r(basis, ρ / sqrt(basis.model.unit_cell_volume))
+    G_to_r(basis, ρ)
 end
 
 
diff --git a/src/interpolation_transfer.jl b/src/interpolation_transfer.jl
index 91aea2b884..7643952d5d 100644
--- a/src/interpolation_transfer.jl
+++ b/src/interpolation_transfer.jl
@@ -81,6 +81,7 @@ function interpolate_kpoint(data_in::AbstractVecOrMat,
     n_bands  = size(data_in, 2)
     n_Gk_out = length(G_vectors(basis_out, kpoint_out))
     data_out = similar(data_in, n_Gk_out, n_bands) .= 0
+    #TODO: use a map, or this will not be GPU compatible (scalar indexing)
     for iin in 1:size(data_in, 1)
         idx_fft = kpoint_in.mapping[iin]
         idx_fft in keys(kpoint_out.mapping_inv) || continue
diff --git a/src/orbitals.jl b/src/orbitals.jl
index 42ab9967c6..8219d6392c 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -66,6 +66,9 @@ function unsafe_unpack_ψ(x, sizes_ψ)
 end
 unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 
+using Random
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
-    ortho_qr(randn(Complex{T}, length(G_vectors(basis, kpt)), howmany))
-end
+    orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
+    randn!(TaskLocalRNG(), orbitals) #Force the use of GPUArrays.jl's random function if using the GPU
+    ortho_qr(orbitals)
+end
\ No newline at end of file
diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl
index 4784bf1f9e..b1914bc9b3 100644
--- a/src/terms/Hamiltonian.jl
+++ b/src/terms/Hamiltonian.jl
@@ -89,9 +89,8 @@ Base.:*(H::Hamiltonian, ψ) = mul!(deepcopy(ψ), H, ψ)
     T = eltype(H.basis)
     n_bands = size(ψ, 2)
     Hψ_fourier = similar(Hψ[:, 1])
-    ψ_real  = zeros(complex(T), H.basis.fft_size...)
-    Hψ_real = zeros(complex(T), H.basis.fft_size...)
-
+    ψ_real  = similar(ψ, complex(T), H.basis.fft_size...)
+    Hψ_real = similar(Hψ, complex(T), H.basis.fft_size...)
     # take ψi, IFFT it to ψ_real, apply each term to Hψ_fourier and Hψ_real, and add it to Hψ
     for iband = 1:n_bands
         Hψ_real .= 0
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index b5d757f24a..f3f2047dba 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -5,7 +5,7 @@ struct Kinetic
     scaling_factor::Real
 end
 Kinetic(; scaling_factor=1) = Kinetic(scaling_factor)
-(kin::Kinetic)(basis) = TermKinetic(basis, kin.scaling_factor)
+(kin::Kinetic)(basis; array_type = Array) = TermKinetic(basis, kin.scaling_factor, array_type)
 function Base.show(io::IO, kin::Kinetic)
     fac = isone(kin.scaling_factor) ? "" : ", scaling_factor=$scaling_factor"
     print(io, "Kinetic($fac)")
@@ -15,10 +15,11 @@ struct TermKinetic <: Term
     scaling_factor::Real  # scaling factor, absorbed into kinetic_energies
     kinetic_energies::Vector{<:AbstractVector}  # kinetic energy 1/2|G+k|^2 for every kpoint
 end
-function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
-    kinetic_energies = [[T(scaling_factor) * sum(abs2, Gk) / 2
-                         for Gk in Gplusk_vectors_cart(basis, kpt)]
-                        for kpt in basis.kpoints]
+function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor, array_type = Array) where {T}
+    kinetic_energies = [convert(array_type,
+                            [T(scaling_factor) * sum(abs2, Gk) / 2
+                            for Gk in Gplusk_vectors_cart(basis, kpt)])
+                            for kpt in basis.kpoints]
     TermKinetic(T(scaling_factor), kinetic_energies)
 end
 

From f4748ac401c774683daf48da92046ce462879ad2 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 20 Jul 2022 10:56:48 +0200
Subject: [PATCH 03/69] Stop using BlockArrays and use a custom BlockVector for
 GPU compatibility in LOBPCG

---
 src/eigen/lobpcg_hyper_impl.jl | 126 +++++++++++++++++++++++++--------
 1 file changed, 96 insertions(+), 30 deletions(-)

diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 9199c490b1..f93121c69d 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -43,27 +43,94 @@
 vprintln(args...) = nothing
 
 using LinearAlgebra
-using BlockArrays # used for the `mortar` command which makes block matrices
 using CUDA
 using GPUArrays
-# when X or Y are BlockArrays, this makes the return value be a proper array (not a BlockArray)
-function array_mul(X::AbstractArray, Y::AbstractArray)
-    Z = similar(X, size(X, 1), size(Y, 2))
-    mul!(Z, X, Y)
+
+
+# A BlockVector keeps several arrays side by side as separate blocks. The blocks may have different types (for example, a view and a Matrix); the tuple type is a type parameter so that the `blocks` field is concretely typed.
+
+struct BlockVector{B<:Tuple}
+    blocks::B                 # the arrays, from left to right
+    size::Tuple{Int64,Int64}  # (common number of rows, total number of columns)
+end
 
+"""
+Build a BlockVector containing the given arrays, from left to right.
+All arrays must have the same "height" (`size(array, 1)`), otherwise an error is thrown;
+a plain vector counts as a single column. Calling it without any array is an error as well.
+"""
+function make_block_vector(arrays::AbstractArray...)
+    isempty(arrays) && error("Empty BlockVector is not currently implemented")
+    n_ref = size(first(arrays), 1)
+    m = 0  # total number of columns over all blocks
+    for array in arrays
+        # size(array, 2) is 1 for a vector, where destructuring size(array) into two values would throw
+        n_ref != size(array, 1) && error("The given arrays do not have matching 'height': cannot build a BlockVector out of them.")
+        m += size(array, 2)
+    end
+    BlockVector(arrays, (n_ref,m))
+end
+
+
+"""
+Compute the dense matrix A'B (an ordinary array, not a BlockVector) for two
+BlockVectors A = [A1, A2, A3] and B = [B1, B2, B3], one pair of blocks at a time.
+Other methods accept plain arrays; every block_overlap method returns A'*B (note the adjoint).
+"""
+@views function block_overlap(A::BlockVector, B::BlockVector)
+    overlap = similar(A.blocks[1], A.size[2], B.size[2])
+
+    row_start = 0  # rows of `overlap` already filled
+    for blockA in A.blocks
+        row_range = row_start .+ (1:size(blockA, 2))
+        col_start = 0  # columns already filled within this band of rows
+        for blockB in B.blocks
+            col_range = col_start .+ (1:size(blockB, 2))
+            overlap[row_range, col_range] = blockA' * blockB
+            col_start = last(col_range)
+        end
+        row_start = last(row_range)
+    end
+    overlap
+end
+
+block_overlap(blocksA::BlockVector, B) = block_overlap(blocksA, make_block_vector(B))
+block_overlap(A, B) = A' * B #Default fallback method. Note the adjoint.
+
+"""Given A as a BlockVector [A1, A2, A3], form the matrix-matrix product A * B
+block by block, avoiding a concatenation of the blocks to a dense array. A fallback method handles two Arrays.
+Unlike block_overlap, block_mul always computes the plain product A*B (no adjoint).
+"""
+@views function block_mul(Ablock::BlockVector, B)
+    rows_used = size(Ablock.blocks[1], 2)  # rows of B consumed so far
+    product = Ablock.blocks[1] * B[1:rows_used, :]  # the first block allocates the result
+    for blk in Ablock.blocks[2:end]
+        mul!(product, blk, B[rows_used .+ (1:size(blk, 2)), :], 1, 1)  # product += blk * B[rows, :]
+        rows_used += size(blk, 2)
+    end
+    product
+end
+
+block_mul(A, Bblock::BlockVector) = error("Not implemented")
+block_mul(A::Tuple, B::Tuple) = error("not implemented")
+block_mul(A, B) = A * B #Default fallback method.
+
 # Perform a Rayleigh-Ritz for the N first eigenvectors.
-@timing function rayleigh_ritz(X::AbstractArray, AX::AbstractArray, N)
-    F = eigen(Hermitian(array_mul(X', AX)))
+@timing function rayleigh_ritz(X::BlockVector, AX::BlockVector, N)
+    rayleigh_ritz(block_overlap(X, AX), N) #block_overlap(X,AX) is an AbstractArray, not a BlockVector
+end
+
+@timing function rayleigh_ritz(XAX::AbstractArray, N)
+    F = eigen(Hermitian(XAX))
     F.vectors[:,1:N], F.values[1:N]
 end
 
-@timing function rayleigh_ritz(X::CuArray, AX::CuArray, N)
+@timing function rayleigh_ritz(XAX::CuArray, N)
     #TODO: this is wacky and should be changed
-    if eltype(X) == ComplexF32 || eltype(X) == ComplexF64
-        vals, vects = CUDA.CUSOLVER.heevd!('V','U',X'AX)
+    if eltype(XAX) == ComplexF32 || eltype(XAX) == ComplexF64
+        vals, vects = CUDA.CUSOLVER.heevd!('V','U',XAX)
     else
-        vals, vects = CUDA.CUSOLVER.syevd!('V','U',X'AX)
+        vals, vects = CUDA.CUSOLVER.syevd!('V','U',XAX)
     end
     vects[:,1:N], vals[1:N]
 end
@@ -180,17 +247,16 @@ end
     niter = 1
     ninners = zeros(Int,0)
     while true
-        BYX = BY'X
-        # XXX the one(T) instead of plain old 1 is because of https://github.com/JuliaArrays/BlockArrays.jl/issues/176
-        mul!(X, Y, BYX, -one(T), one(T)) # X -= Y*BY'X
+        BYX = block_overlap(BY,X) # = BY' X
+        X .-= block_mul(Y, BYX) #X = X -Y * BYX
         # If the orthogonalization has produced results below 2eps, we drop them
         # This is to be able to orthogonalize eg [1;0] against [e^iθ;0],
         # as can happen in extreme cases in the ortho!(cP, cX)
         dropped = drop!(X)
         if dropped != []
             Z = similar(X[:, dropped])
-            mul!(Z, Y, BY' * (X[:, dropped]), -one(T), one(T)) # X -= Y*BY'X
-            X[:, dropped] = Z# X -= Y*BY'X
+            Z = X[:, dropped] .- block_mul(Y, block_overlap(BY,X[:, dropped]))
+            X[:, dropped] = Z
         end
         if norm(BYX) < tol && niter > 1
             push!(ninners, 0)
@@ -300,13 +366,13 @@ end
 
             # Form Rayleigh-Ritz subspace
             if niter > 1
-                Y = hcat(X, R, P)
-                AY = hcat(AX, AR, AP)
-                BY = hcat(BX, BR, BP)  # data shared with (X, R, P) in non-general case
+                Y = make_block_vector(X, R, P)
+                AY = make_block_vector(AX, AR, AP)
+                BY = make_block_vector(BX, BR, BP)  # data shared with (X, R, P) in non-general case
             else
-                Y  = hcat(X, R)
-                AY = hcat(AX, AR)
-                BY = hcat(BX, BR)  # data shared with (X, R) in non-general case
+                Y  = make_block_vector(X, R)
+                AY = make_block_vector(AX, AR)
+                BY = make_block_vector(BX, BR)  # data shared with (X, R) in non-general case
             end
             cX, λs = rayleigh_ritz(Y, AY, M-nlocked)
 
@@ -314,9 +380,9 @@ end
             # wait on updating P because we have to know which vectors
             # to lock (and therefore the residuals) before computing P
             # only for the unlocked vectors. This results in better convergence.
-            new_X  = array_mul(Y, cX)
-            new_AX = array_mul(AY, cX)  # no accuracy loss, since cX orthogonal
-            new_BX = (B == I) ? new_X : array_mul(BY, cX)
+            new_X  = block_mul(Y, cX)
+            new_AX = block_mul(AY, cX)  # no accuracy loss, since cX orthogonal
+            new_BX = (B == I) ? new_X : block_mul(BY, cX)
         end
 
         ### Compute new residuals
@@ -382,10 +448,10 @@ end
             ortho!(cP, cX, cX, tol=ortho_tol)
 
             # Get new P
-            new_P  = array_mul( Y, cP)
-            new_AP = array_mul(AY, cP)
+            new_P  = block_mul( Y, cP)
+            new_AP = block_mul(AY, cP)
             if B != I
-                new_BP = array_mul(BY, cP)
+                new_BP = block_mul(BY, cP)
             else
                 new_BP = new_P
             end
@@ -430,8 +496,8 @@ end
 
         # Orthogonalize R wrt all X, newly active P
         if niter > 0
-            Z  = hcat(full_X, P)
-            BZ = hcat(full_BX, BP) # data shared with (full_X, P) in non-general case
+            Z  = make_block_vector(full_X, P)
+            BZ = make_block_vector(full_BX, BP) # data shared with (full_X, P) in non-general case
         else
             Z  = full_X
             BZ = full_BX

From 94f1d2ab1bb77e20972a44e17fef41cfcb0abe6c Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Fri, 22 Jul 2022 14:00:45 +0200
Subject: [PATCH 04/69] GPU support for AtomicLocal term

---
 src/terms/Hamiltonian.jl | 3 ++-
 src/terms/local.jl       | 9 +++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl
index b1914bc9b3..f7b8595fc1 100644
--- a/src/terms/Hamiltonian.jl
+++ b/src/terms/Hamiltonian.jl
@@ -51,7 +51,8 @@ function HamiltonianBlock(basis, kpoint, operators, scratch=ham_allocate_scratch
     end
 end
 function ham_allocate_scratch_(basis::PlaneWaveBasis{T}) where {T}
-    (ψ_reals=[zeros(complex(T), basis.fft_size...) for _ = 1:Threads.nthreads()], )
+    array_type = typeof(similar(basis.G_vectors,complex(T), basis.fft_size...))
+    (ψ_reals=[convert(array_type, zeros(complex(T), basis.fft_size...)) for _ = 1:Threads.nthreads()], )
 end
 
 Base.:*(H::HamiltonianBlock, ψ) = mul!(similar(ψ), H, ψ)
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 678adf2197..4eb3c61220 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -66,7 +66,7 @@ end
 Atomic local potential defined by `model.atoms`.
 """
 struct AtomicLocal end
-function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
+function (::AtomicLocal)(basis::PlaneWaveBasis{T}; array_type = Array) where {T}
     model = basis.model
 
     # pot_fourier is <e_G|V|e_G'> expanded in a basis of e_{G-G'}
@@ -74,7 +74,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     # positions, this involves a form factor (`local_potential_fourier`)
     # and a structure factor e^{-i G·r}
 
-    pot_fourier = map(G_vectors(basis)) do G
+    #This operation needs to be done only once, so let's try to make it happen on CPU (else we need to isbitsify the pseudopotentials)
+    pot_fourier = map(Array(G_vectors(basis))) do G
         pot = sum(model.atom_groups) do group
             element = model.atoms[first(group)]
             form_factor::T = local_potential_fourier(element, norm(model.recip_lattice * G))
@@ -82,8 +83,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
         end
         pot / sqrt(model.unit_cell_volume)
     end
-
-    pot_real = G_to_r(basis, pot_fourier)
+    #If needed, send to the GPU the atomic local term.
+    pot_real = G_to_r(basis, convert(array_type,pot_fourier))
     TermAtomicLocal(pot_real)
 end
 

From 60d80418801e071b2e11ffccefd4fa6ac02d78d6 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 25 Jul 2022 13:03:35 +0200
Subject: [PATCH 05/69] First GPU implementation of the non local term + LOBPCG
 enhancement

---
 src/eigen/lobpcg_hyper_impl.jl |  8 +++-----
 src/terms/nonlocal.jl          | 17 ++++++++++-------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index f93121c69d..fd221a68a8 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -126,8 +126,7 @@ end
 end
 
 @timing function rayleigh_ritz(XAX::CuArray, N)
-    #TODO: this is wacky and should be changed
-    if eltype(XAX) == ComplexF32 || eltype(XAX) == ComplexF64
+    if eltype(XAX) <: Complex
         vals, vects = CUDA.CUSOLVER.heevd!('V','U',XAX)
     else
         vals, vects = CUDA.CUSOLVER.syevd!('V','U',XAX)
@@ -254,10 +253,9 @@ end
         # as can happen in extreme cases in the ortho!(cP, cX)
         dropped = drop!(X)
         if dropped != []
-            Z = similar(X[:, dropped])
-            Z = X[:, dropped] .- block_mul(Y, block_overlap(BY,X[:, dropped]))
-            X[:, dropped] = Z
+            X[:, dropped] .-= block_mul(Y, block_overlap(BY,X[:, dropped])) #X = X - Y*BY'*X
         end
+
         if norm(BYX) < tol && niter > 1
             push!(ninners, 0)
             break
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 8f6cbc96fa..683cf878be 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -3,7 +3,7 @@ Nonlocal term coming from norm-conserving pseudopotentials in Kleinmann-Bylander
 ``\text{Energy} = \sum_a \sum_{ij} \sum_{n} f_n <ψ_n|p_{ai}> D_{ij} <p_{aj}|ψ_n>.``
 """
 struct AtomicNonlocal end
-function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
+function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}; array_type = Array) where {T}
     model = basis.model
 
     # keep only pseudopotential atoms and positions
@@ -14,8 +14,8 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
 
     isempty(psp_groups) && return TermNoop()
     ops = map(basis.kpoints) do kpt
-        P = build_projection_vectors_(basis, kpt, psps, psp_positions)
-        D = build_projection_coefficients_(T, psps, psp_positions)
+        P = build_projection_vectors_(basis, kpt, psps, psp_positions, array_type=array_type)
+        D = build_projection_coefficients_(T, psps, psp_positions, array_type = array_type)
         NonlocalOperator(basis, kpt, P, D)
     end
     TermAtomicNonlocal(ops)
@@ -31,6 +31,9 @@ end
     isnothing(ψ) && return (E=T(Inf), ops=term.ops)
 
     E = zero(T)
+    array_type = typeof(similar(G_vectors(basis), T, length(occ)))
+    occ = [convert(array_type, oc) for oc in occ]
+
     for (ik, kpt) in enumerate(basis.kpoints)
         Pψ = term.ops[ik].P' * ψ[ik]  # nproj x nband
         band_enes = dropdims(sum(real.(conj.(Pψ) .* (term.ops[ik].D * Pψ)), dims=1), dims=1)
@@ -90,7 +93,7 @@ end
 # The ordering of the projector indices is (A,l,m,i), where A is running over all
 # atoms, l, m are AM quantum numbers and i is running over all projectors for a
 # given l. The matrix is block-diagonal with non-zeros only if A, l and m agree.
-function build_projection_coefficients_(T, psps, psp_positions)
+function build_projection_coefficients_(T, psps, psp_positions; array_type = array_type)
     # TODO In the current version the proj_coeffs still has a lot of zeros.
     #      One could improve this by storing the blocks as a list or in a
     #      BlockDiagonal data structure
@@ -106,7 +109,7 @@ function build_projection_coefficients_(T, psps, psp_positions)
     end # psp, r
     @assert count == n_proj
 
-    proj_coeffs
+    convert(array_type,proj_coeffs)
 end
 
 # Builds the projection coefficient matrix for a single atom
@@ -142,7 +145,7 @@ where pihat(q) = ∫_R^3 pi(r) e^{-iqr} dr
 We store 1/√Ω pihat(k+G) in proj_vectors.
 """
 function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
-                                   psps, psp_positions) where {T}
+                                   psps, psp_positions; array_type = Array) where {T}
     unit_cell_volume = basis.model.unit_cell_volume
     n_proj = count_n_proj(psps, psp_positions)
     n_G    = length(G_vectors(basis, kpt))
@@ -170,7 +173,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
         end
     end
     @assert offset == n_proj
-    proj_vectors
+    convert(array_type, proj_vectors)
 end
 
 """

From cf1dc3c1a0ca82f92d7f4a1f6d5115b34a1a4288 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Mon, 25 Jul 2022 21:06:09 +0200
Subject: [PATCH 06/69] add timed examples

---
 examples/gpu.jl | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 examples/gpu.jl

diff --git a/examples/gpu.jl b/examples/gpu.jl
new file mode 100644
index 0000000000..f59c12d12b
--- /dev/null
+++ b/examples/gpu.jl
@@ -0,0 +1,26 @@
+using DFTK
+using CUDA
+using MKL
+setup_threading(n_blas=1)
+
+a = 10.263141334305942  # Lattice constant in Bohr
+lattice = a / 2 .* [[0 1 1.]; [1 0 1.]; [1 1 0.]]
+Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+atoms     = [Si, Si]
+positions = [ones(3)/8, -ones(3)/8];
+terms_LDA = [Kinetic(), AtomicLocal(), AtomicNonlocal()]
+
+# Setup an LDA model and discretize using
+# a single k-point and an `Ecut` of 30 Hartree.
+mod = Model(lattice, atoms, positions; terms=terms_LDA,symmetries=false)
+basis = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1))
+basis_gpu = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1), array_type = CuArray)
+
+
+DFTK.reset_timer!(DFTK.timer)
+scfres = self_consistent_field(basis; solver=scf_damping_solver(1.0), is_converged=DFTK.ScfConvergenceDensity(1e-3))
+println(DFTK.timer)
+
+DFTK.reset_timer!(DFTK.timer)
+scfres_gpu = self_consistent_field(basis_gpu; solver=scf_damping_solver(1.0), is_converged=DFTK.ScfConvergenceDensity(1e-3))
+println(DFTK.timer)

From 11b85f0acc304b4e276937ff9a5bee98470dd25c Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 28 Jul 2022 08:51:45 +0200
Subject: [PATCH 07/69] Change some code organisation after PR's feedback

---
 src/PlaneWaveBasis.jl               |  8 ++++----
 src/eigen/lobpcg_hyper_impl.jl      | 12 ++++++++----
 src/eigen/preconditioners.jl        |  5 -----
 src/guess_density.jl                |  5 +++++
 src/workarounds/gpu_computations.jl | 15 +++++++++++++++
 5 files changed, 32 insertions(+), 13 deletions(-)
 create mode 100644 src/workarounds/gpu_computations.jl

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index ccfbc9ee32..8429046e48 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -191,8 +191,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    G_vects = G_vectors(fft_size, array_type)
-    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(similar(G_vects,T), fft_size)
+    G_vec = G_vectors(fft_size, array_type)
+    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(similar(G_vec,T), fft_size)
 
     # Normalization constants
     # r_to_G = r_to_G_normalization * FFT
@@ -256,7 +256,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
         r_to_G_normalization, G_to_r_normalization,
-        G_vects, r_vectors,
+        G_vec, r_vectors,
         kpoints, kweights_thisproc, kgrid, kshift,
         kcoords_global, kweights_global, comm_kpts, krange_thisproc, krange_allprocs,
         symmetries, symmetries_respect_rgrid, terms)
@@ -331,7 +331,7 @@ end
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
 """
-function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type::UnionAll)
+function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type::Type)
     #This functions allows to convert the G_vectors (currently being built on the CPU) to a GPU Array.
     convert(array_type, G_vectors(fft_size))
 end
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index fd221a68a8..b8741f5dd2 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -45,7 +45,7 @@ vprintln(args...) = nothing
 using LinearAlgebra
 using CUDA
 using GPUArrays
-
+include("../workarounds/gpu_computations.jl")
 
 # For now, BlockVector can store arrays of different types (for example, an element of type views and one of type Matrix). Maybe for performance issues it should only store arrays of the same type?
 
@@ -115,6 +115,10 @@ block_mul(A, Bblock::BlockVector) = error("Not implemented")
 block_mul(A::Tuple, B::Tuple) = error("not implemented")
 block_mul(A, B) = A * B #Default fallback method.
 
+function LinearAlgebra.mul!(res,A::BlockVector,B::AbstractArray,alpha,beta)
+    mul!(res, block_mul(A, B), I, alpha, beta)
+end
+
 # Perform a Rayleigh-Ritz for the N first eigenvectors.
 @timing function rayleigh_ritz(X::BlockVector, AX::BlockVector, N)
     rayleigh_ritz(block_overlap(X, AX), N) #block_overlap(X,AX) is an AbstractArray, not a BlockVector
@@ -247,7 +251,7 @@ end
     ninners = zeros(Int,0)
     while true
         BYX = block_overlap(BY,X) # = BY' X
-        X .-= block_mul(Y, BYX) #X = X -Y * BYX
+        mul!(X, Y, BYX, -one(T), one(T)) # X -= Y*BY'X
         # If the orthogonalization has produced results below 2eps, we drop them
         # This is to be able to orthogonalize eg [1;0] against [e^iθ;0],
         # as can happen in extreme cases in the ortho!(cP, cX)
@@ -289,7 +293,7 @@ end
 function final_retval(X, AX, resid_history, niter, n_matvec)
     λ = real(diag(X' * AX))
     residuals = AX .- X*Diagonal(λ)
-    (λ=Array(λ), X=X,
+    (λ=λ, X=X,
      residual_norms=[norm(residuals[:, i]) for i in 1:size(residuals, 2)],
      residual_history=resid_history[:, 1:niter+1], n_matvec=n_matvec)
     #λ doesn't have to be on the GPU, but X does (ψ should always be on GPU throughout the code).
@@ -511,4 +515,4 @@ end
     end
 
     final_retval(full_X, full_AX, resid_history, maxiter, n_matvec)
-end
\ No newline at end of file
+end
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index b35c4653eb..a3a401b22e 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -73,8 +73,3 @@ function precondprep!(P::PreconditionerTPA, X)
     array_type = typeof(similar(X,eltype(X),size(P.mean_kin)))
     P.mean_kin = convert(array_type, P.mean_kin)
 end
-
-#TODO: remove this if it implemented in GPUArrays
-import LinearAlgebra.dot
-using GPUArrays
-LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal,y::AbstractGPUArray) = x'*(D*y)
\ No newline at end of file
diff --git a/src/guess_density.jl b/src/guess_density.jl
index 3a9c09a303..a52cb24742 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -94,10 +94,15 @@ which follow the functional form
 and are placed at `position` (in fractional coordinates).
 """
 function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
+    isempty(gaussians) && return G_to_r(basis, similar(G_vectors(basis),complex(T), basis.fft_size))
+
     ρ = deepcopy(basis.G_vectors)
     #These copies are required so that recip_lattice and gaussians are isbits (GPU compatibility)
     recip_lattice = basis.model.recip_lattice
     gaussians = SVector{size(gaussians)[1]}(gaussians)
+
+    # Fill ρ with the (unnormalized) Fourier transform, i.e. ∫ e^{-iGx} f(x) dx,
+    # where f(x) is a weighted gaussian
     function build_ρ(G)
         Gsq = sum(abs2, recip_lattice * G)
         res = zero(complex(T))
diff --git a/src/workarounds/gpu_computations.jl b/src/workarounds/gpu_computations.jl
new file mode 100644
index 0000000000..d864a0c0c5
--- /dev/null
+++ b/src/workarounds/gpu_computations.jl
@@ -0,0 +1,15 @@
+#TODO: remove this when it is implemented in GPUArrays
+import LinearAlgebra.dot
+using LinearAlgebra
+using GPUArrays
+import Base.iszero, Base.isone
+
+LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal,y::AbstractGPUArray) = x'*(D*y)
+
+Base.iszero(x::AbstractGPUMatrix{T}) where {T} = all(iszero, x)
+
+function Base.isone(x::AbstractGPUMatrix{T}) where {T}
+    n,m = size(x)
+    m != n && return false
+    all(iszero, x-I)
+end

From abb99f433ddd559459d6b8f4f0d806c9f642b5ac Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 28 Jul 2022 08:59:33 +0200
Subject: [PATCH 08/69] Code organisation and performance optimisation after
 PR's feedback

---
 src/densities.jl               | 27 ++++++++++-----------------
 src/eigen/diag_lobpcg_hyper.jl |  4 +++-
 src/fft.jl                     |  5 ++---
 3 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/densities.jl b/src/densities.jl
index 5e42798396..df489b6119 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -19,6 +19,7 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
 `ψ` should be one coefficient matrix per ``k``-point. 
 """
 @views @timing function compute_density(basis, ψ, occupation)
+    Threads.nthreads() !=1 && G_vectors(basis) isa AbstractGPUArray && error("Can't mix multi-threading and GPU computations yet.") #We assume there is only 1 thread
     T = promote_type(eltype(basis), real(eltype(ψ[1])))
 
     # we split the total iteration range (ik, n) in chunks, and parallelize over them
@@ -26,34 +27,26 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
     chunk_length = cld(length(ik_n), Threads.nthreads())
 
     # chunk-local variables
-    ρ_chunklocal = Array{T,4}[zeros(T, basis.fft_size..., basis.model.n_spin_components)
+    array_type = typeof(similar(G_vectors(basis),T, basis.fft_size..., basis.model.n_spin_components))
+    ρ_chunklocal = [convert(array_type, zeros(T, basis.fft_size..., basis.model.n_spin_components))
                                for _ = 1:Threads.nthreads()]
-    ψnk_real_chunklocal = Array{complex(T),3}[zeros(complex(T), basis.fft_size)
-                                               for _ = 1:Threads.nthreads()]
+    array_type = typeof(similar(G_vectors(basis),complex(T), basis.fft_size))
+    ψnk_real_chunklocal = [convert(array_type, zeros(complex(T), basis.fft_size)) 
+                                for _ = 1:Threads.nthreads()]
 
     @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length))
         Threads.@spawn for (ik, n) in chunk  # spawn a task per chunk
             kpt = basis.kpoints[ik]
-            #TODO: is this the right way to got? Probably rewrite compute_density for GPUArrays
-            if typeof(basis.G_vectors) <:AbstractGPUArray
-                ψnk_real = similar(basis.G_vectors, complex(T), basis.fft_size)
-                G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
-                ρ_loc = ρ_chunklocal[ichunk]
-                ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* Array(abs2.(ψnk_real))
-            else
-                ψnk_real = ψnk_real_chunklocal[ichunk]
-                ρ_loc = ρ_chunklocal[ichunk]
+            ψnk_real = ψnk_real_chunklocal[ichunk]
+            ρ_loc = ρ_chunklocal[ichunk]
 
-                G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
-                ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real)
-            end
+            G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])            
+            ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real)
         end
     end
 
     ρ = sum(ρ_chunklocal)
     mpi_sum!(ρ, basis.comm_kpts)
-    array_type = typeof(similar(basis.G_vectors,complex(T), size(ρ)))
-    ρ = convert(array_type, ρ)
     ρ = symmetrize_ρ(basis, ρ; do_lowpass=false)
 
     _check_positive(ρ)
diff --git a/src/eigen/diag_lobpcg_hyper.jl b/src/eigen/diag_lobpcg_hyper.jl
index 832eaf2cee..77da77052b 100644
--- a/src/eigen/diag_lobpcg_hyper.jl
+++ b/src/eigen/diag_lobpcg_hyper.jl
@@ -9,8 +9,10 @@ function lobpcg_hyper(A, X0; maxiter=100, prec=nothing,
     result = LOBPCG(A, X0, I, prec, tol, maxiter; n_conv_check=n_conv_check, kwargs...)
 
     n_conv_check === nothing && (n_conv_check = size(X0, 2))
+
     converged = maximum(result.residual_norms[1:n_conv_check]) < tol
     iterations = size(result.residual_history, 2) - 1
+    λ = Array(result.λ) #TODO: offload this to gpu? Careful then, as self_consistent_field's eigenvalues will be a CuArray -> due to the Smearing.occupation function, occupation will also be a CuArray, so no scalar indexing (in ene_ops, in compute_density...)
 
-    merge(result, (iterations=iterations, converged=converged))
+    merge(result, (iterations=iterations, converged=converged, λ=λ))
 end
diff --git a/src/fft.jl b/src/fft.jl
index 7b9a5c289a..09f1c16bd3 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -2,6 +2,7 @@ import FFTW
 import CUDA
 import GPUArrays
 import AbstractFFTs
+
 #
 # Perform (i)FFTs.
 #
@@ -257,15 +258,13 @@ optimal algorithm. (Inplace, out-of-place) x (forward, backward) FFT plans are r
 """
 #Removed the flags as CUDA's plan_fft doesn't need flags. If this is a performance issue, we should check array_type's type then call either FFTW.plan_fft(tmp, flags = ...) or CUDA.plan_fft(tmp)
 function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{Float32,Float64}}
-    tmp = similar(array_type, Complex{Float64}, fft_size...)
+    tmp = similar(array_type, Complex{T}, fft_size...)
     ipFFT = AbstractFFTs.plan_fft!(tmp)
     opFFT = AbstractFFTs.plan_fft(tmp)
     # backward by inverting and stripping off normalizations
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
 
-LinearAlgebra.mul!(Y::AbstractGPUArray, p::AbstractFFTs.Plan, X::AbstractGPUArray) = Y .= p *X
-
 # TODO Some grid sizes are broken in the generic FFT implementation
 # in FourierTransforms, for more details see workarounds/fft_generic.jl
 default_primes(::Type{Float32}) = (2, 3, 5)

From a89171a275379fbac3e835cb4c65ae854c1c5e5b Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 2 Aug 2022 07:19:32 +0200
Subject: [PATCH 09/69] Code refactoring following PR's feedback

---
 src/PlaneWaveBasis.jl                            | 16 +++++++---------
 src/densities.jl                                 |  5 ++---
 src/eigen/diag.jl                                |  5 ++++-
 src/eigen/diag_lobpcg_hyper.jl                   |  3 +--
 src/eigen/lobpcg_hyper_impl.jl                   | 11 ++++++-----
 src/guess_density.jl                             |  3 +--
 .../{gpu_computations.jl => gpu_arrays.jl}       |  0
 7 files changed, 21 insertions(+), 22 deletions(-)
 rename src/workarounds/{gpu_computations.jl => gpu_arrays.jl} (100%)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 8429046e48..afeb51eb73 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -191,8 +191,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    G_vec = G_vectors(fft_size, array_type)
-    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(similar(G_vec,T), fft_size)
+    Gs = G_vectors(fft_size, array_type)
+    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(similar(Gs,T), fft_size)
 
     # Normalization constants
     # r_to_G = r_to_G_normalization * FFT
@@ -245,6 +245,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     end
     @assert mpi_sum(sum(kweights_thisproc), comm_kpts) ≈ model.n_spin_components
     @assert length(kpoints) == length(kweights_thisproc)
+    Threads.nthreads() != 1 && Gs isa AbstractGPUArray && error("Can't mix multi-threading and GPU computations yet.")
 
     VT = value_type(T)
     dvol  = model.unit_cell_volume ./ prod(fft_size)
@@ -256,7 +257,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
         r_to_G_normalization, G_to_r_normalization,
-        G_vec, r_vectors,
+        Gs, r_vectors,
         kpoints, kweights_thisproc, kgrid, kshift,
         kcoords_global, kweights_global, comm_kpts, krange_thisproc, krange_allprocs,
         symmetries, symmetries_respect_rgrid, terms)
@@ -331,18 +332,15 @@ end
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
 """
-function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type::Type)
-    #This functions allows to convert the G_vectors (currently being built on the CPU) to a GPU Array.
-    convert(array_type, G_vectors(fft_size))
-end
 
-function G_vectors(fft_size::Union{Tuple,AbstractVector})
+function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type = Array)
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
     start = .- cld.(fft_size .- 1, 2)
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
-    [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
+    Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
+    convert(array_type, Gs) #Offload to GPU if needed.
 end
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and
diff --git a/src/densities.jl b/src/densities.jl
index df489b6119..54ac4606d6 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -19,7 +19,6 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
 `ψ` should be one coefficient matrix per ``k``-point. 
 """
 @views @timing function compute_density(basis, ψ, occupation)
-    Threads.nthreads() !=1 && G_vectors(basis) isa AbstractGPUArray && error("Can't mix multi-threading and GPU computations yet.") #We assume there is only 1 thread
     T = promote_type(eltype(basis), real(eltype(ψ[1])))
 
     # we split the total iteration range (ik, n) in chunks, and parallelize over them
@@ -29,10 +28,10 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
     # chunk-local variables
     array_type = typeof(similar(G_vectors(basis),T, basis.fft_size..., basis.model.n_spin_components))
     ρ_chunklocal = [convert(array_type, zeros(T, basis.fft_size..., basis.model.n_spin_components))
-                               for _ = 1:Threads.nthreads()]
+                    for _ = 1:Threads.nthreads()]
     array_type = typeof(similar(G_vectors(basis),complex(T), basis.fft_size))
     ψnk_real_chunklocal = [convert(array_type, zeros(complex(T), basis.fft_size)) 
-                                for _ = 1:Threads.nthreads()]
+                            for _ = 1:Threads.nthreads()]
 
     @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length))
         Threads.@spawn for (ik, n) in chunk  # spawn a task per chunk
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index 38a555f262..ee46fa1199 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -54,7 +54,10 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     end
 
     # Transform results into a nicer datastructure
-    (λ=[real.(res.λ) for res in results],
+    # TODO: keep λ on the gpu? Careful then, as self_consistent_field's eigenvalues
+    # will be a CuArray -> due to the Smearing.occupation function, occupation will also
+    # be a CuArray, so no scalar indexing (in ene_ops, in compute_density...)
+    (λ=[Array(real.(res.λ)) for res in results],
      X=[res.X for res in results],
      residual_norms=[res.residual_norms for res in results],
      iterations=[res.iterations for res in results],
diff --git a/src/eigen/diag_lobpcg_hyper.jl b/src/eigen/diag_lobpcg_hyper.jl
index 77da77052b..8548d79c58 100644
--- a/src/eigen/diag_lobpcg_hyper.jl
+++ b/src/eigen/diag_lobpcg_hyper.jl
@@ -12,7 +12,6 @@ function lobpcg_hyper(A, X0; maxiter=100, prec=nothing,
 
     converged = maximum(result.residual_norms[1:n_conv_check]) < tol
     iterations = size(result.residual_history, 2) - 1
-    λ = Array(result.λ) #TODO: offload this to gpu? Careful then, as self_consistent_field's eigenvalues will be a CuArray -> due to the Smearing.occupation function, occupation will also be a CuArray, so no scalar indexing (in ene_ops, in compute_density...)
 
-    merge(result, (iterations=iterations, converged=converged, λ=λ))
+    merge(result, (iterations=iterations, converged=converged))
 end
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index b8741f5dd2..2b51b01dcb 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -45,7 +45,7 @@ vprintln(args...) = nothing
 using LinearAlgebra
 using CUDA
 using GPUArrays
-include("../workarounds/gpu_computations.jl")
+include("../workarounds/gpu_arrays.jl")
 
 # For now, BlockVector can store arrays of different types (for example, an element of type views and one of type Matrix). Maybe for performance issues it should only store arrays of the same type?
 
@@ -113,15 +113,16 @@ end
 
 block_mul(A, Bblock::BlockVector) = error("Not implemented")
 block_mul(A::Tuple, B::Tuple) = error("not implemented")
-block_mul(A, B) = A * B #Default fallback method.
+block_mul(A, B) = A * B # Default fallback method.
 
-function LinearAlgebra.mul!(res,A::BlockVector,B::AbstractArray,alpha,beta)
-    mul!(res, block_mul(A, B), I, alpha, beta)
+function LinearAlgebra.mul!(res,A::BlockVector,B::AbstractArray,α,β)
+    # Has slightly better performance than a naive res = α*block_mul(A,B) + β*res
+    mul!(res, block_mul(A, B), I, α, β)
 end
 
 # Perform a Rayleigh-Ritz for the N first eigenvectors.
 @timing function rayleigh_ritz(X::BlockVector, AX::BlockVector, N)
-    rayleigh_ritz(block_overlap(X, AX), N) #block_overlap(X,AX) is an AbstractArray, not a BlockVector
+    rayleigh_ritz(block_overlap(X, AX), N) # block_overlap(X,AX) is an AbstractArray, not a BlockVector
 end
 
 @timing function rayleigh_ritz(XAX::AbstractArray, N)
diff --git a/src/guess_density.jl b/src/guess_density.jl
index a52cb24742..0e1cb4e2ef 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -96,7 +96,6 @@ and are placed at `position` (in fractional coordinates).
 function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
     isempty(gaussians) && return G_to_r(basis, similar(G_vectors(basis),complex(T), basis.fft_size))
 
-    ρ = deepcopy(basis.G_vectors)
     #These copies are required so that recip_lattice and gaussians are isbits (GPU compatibility)
     recip_lattice = basis.model.recip_lattice
     gaussians = SVector{size(gaussians)[1]}(gaussians)
@@ -112,7 +111,7 @@ function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
         end
         res
     end
-    ρ = map(build_ρ, ρ)/ sqrt(basis.model.unit_cell_volume) #Can't use map! as we are converting an array of Vec3 to an array of complex
+    ρ = map(build_ρ, basis.G_vectors)/ sqrt(basis.model.unit_cell_volume) #Can't use map! as we are converting an array of Vec3 to an array of complex
 
     # projection in the normalized plane wave basis
     G_to_r(basis, ρ)
diff --git a/src/workarounds/gpu_computations.jl b/src/workarounds/gpu_arrays.jl
similarity index 100%
rename from src/workarounds/gpu_computations.jl
rename to src/workarounds/gpu_arrays.jl

From 44bcb6183fd0b1bcd77bed7fa361ec12b8dcea95 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 9 Aug 2022 15:23:47 +0200
Subject: [PATCH 10/69] PWB is now parametric on the array type: this also
 fixes type issues

---
 src/PlaneWaveBasis.jl        | 23 ++++++++++++++++-------
 src/densities.jl             |  6 ++----
 src/eigen/preconditioners.jl |  6 ++----
 src/guess_density.jl         |  3 +--
 src/terms/Hamiltonian.jl     |  3 +--
 src/terms/kinetic.jl         |  6 +++---
 src/terms/local.jl           |  4 ++--
 src/terms/nonlocal.jl        | 15 +++++++--------
 8 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index afeb51eb73..f3e4c07795 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -39,7 +39,7 @@ Normalization conventions:
 
 `G_to_r` and `r_to_G` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
+struct PlaneWaveBasis{T, VT, AT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT <: AT, RT <: AT, AT <: AbstractArray}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 
@@ -67,8 +67,8 @@ struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
     G_to_r_normalization::T  # G_to_r = G_to_r_normalization * BFFT
 
     # "cubic" basis in reciprocal and real space, on which potentials and densities are stored
-    G_vectors::AbstractArray{Vec3{Int}, 3}
-    r_vectors::AbstractArray{Vec3{VT }, 3}
+    G_vectors::GT
+    r_vectors::RT
 
     ## MPI-local information of the kpoints this processor treats
     # Irreducible kpoints. In the case of collinear spin,
@@ -252,7 +252,10 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3]
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    basis = PlaneWaveBasis{T,value_type(T)}(
+    RT = array_type{Vec3{VT }, 3}
+    GT = array_type{Vec3{Int }, 3}
+
+    basis = PlaneWaveBasis{T,value_type(T), array_type, GT, RT}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
@@ -264,7 +267,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     # Instantiate the terms with the basis
     for (it, t) in enumerate(model.term_types)
         term_name = string(nameof(typeof(t)))
-        @timing "Instantiation $term_name" basis.terms[it] = t(basis, array_type = array_type)
+        @timing "Instantiation $term_name" basis.terms[it] = t(basis)
     end
     basis
 end
@@ -318,12 +321,12 @@ end
 Creates a new basis identical to `basis`, but with a custom set of kpoints
 """
 @timing function PlaneWaveBasis(basis::PlaneWaveBasis, kcoords::AbstractVector,
-                                kweights::AbstractVector; array_type = Array)
+                                kweights::AbstractVector)
     kgrid = kshift = nothing
     PlaneWaveBasis(basis.model, basis.Ecut,
                    basis.fft_size, basis.variational,
                    kcoords, kweights, kgrid, kshift,
-                   basis.symmetries_respect_rgrid, basis.comm_kpts, array_type)
+                   basis.symmetries_respect_rgrid, basis.comm_kpts, array_type = array_type(basis))
 end
 
 """
@@ -361,6 +364,12 @@ or a ``k``-point `kpt`.
 G_vectors(basis::PlaneWaveBasis) = basis.G_vectors
 G_vectors(::PlaneWaveBasis, kpt::Kpoint) = kpt.G_vectors
 
+"""
+Return the type of array used for computations (Array if on CPU, CuArray, 
+ROCArray... if on GPU).
+"""
+array_type(basis::PlaneWaveBasis{T,VT,AT}) where {T, VT, AT} = AT
+
 
 @doc raw"""
     G_vectors_cart(basis::PlaneWaveBasis)
diff --git a/src/densities.jl b/src/densities.jl
index 54ac4606d6..4855bce9c3 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -26,11 +26,9 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
     chunk_length = cld(length(ik_n), Threads.nthreads())
 
     # chunk-local variables
-    array_type = typeof(similar(G_vectors(basis),T, basis.fft_size..., basis.model.n_spin_components))
-    ρ_chunklocal = [convert(array_type, zeros(T, basis.fft_size..., basis.model.n_spin_components))
+    ρ_chunklocal = [convert(array_type(basis), zeros(T, basis.fft_size..., basis.model.n_spin_components))
                     for _ = 1:Threads.nthreads()]
-    array_type = typeof(similar(G_vectors(basis),complex(T), basis.fft_size))
-    ψnk_real_chunklocal = [convert(array_type, zeros(complex(T), basis.fft_size)) 
+    ψnk_real_chunklocal = [convert(array_type(basis), zeros(complex(T), basis.fft_size)) 
                             for _ = 1:Threads.nthreads()]
 
     @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length))
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index a3a401b22e..6e37d54ae4 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -37,8 +37,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     isempty(kinetic_term) && error("Preconditioner should be disabled when no Kinetic term is used.")
     scaling = only(kinetic_term).scaling_factor
     kin = Vector{T}([scaling * sum(abs2, q) for q in Gplusk_vectors_cart(basis, kpt)] ./ 2)
-    array_type = typeof(similar(basis.G_vectors, T, size(kin)))
-    kin = convert(array_type, kin)
+    kin = convert(array_type(basis), kin)
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
 
@@ -70,6 +69,5 @@ end
 
 function precondprep!(P::PreconditionerTPA, X)
     P.mean_kin = [real(dot(x, Diagonal(P.kin), x)) for x in eachcol(X)]
-    array_type = typeof(similar(X,eltype(X),size(P.mean_kin)))
-    P.mean_kin = convert(array_type, P.mean_kin)
+    P.mean_kin = convert(array_type(P.basis), P.mean_kin)
 end
diff --git a/src/guess_density.jl b/src/guess_density.jl
index 0e1cb4e2ef..796a7e82d4 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -66,8 +66,7 @@ function _guess_spin_density(basis::PlaneWaveBasis{T}, atoms, positions, magneti
         @warn("Returning zero spin density guess, because no initial magnetization has " *
               "been specified in any of the given elements / atoms. Your SCF will likely " *
               "not converge to a spin-broken solution.")
-        array_type = typeof(similar(basis.G_vectors, T, basis.fft_size))
-        return convert(array_type,zeros(T, basis.fft_size))
+        return convert(array_type(basis),zeros(T, basis.fft_size))
     end
 
     @assert length(magmoms) == length(atoms) == length(positions)
diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl
index f7b8595fc1..9f73ddc98e 100644
--- a/src/terms/Hamiltonian.jl
+++ b/src/terms/Hamiltonian.jl
@@ -51,8 +51,7 @@ function HamiltonianBlock(basis, kpoint, operators, scratch=ham_allocate_scratch
     end
 end
 function ham_allocate_scratch_(basis::PlaneWaveBasis{T}) where {T}
-    array_type = typeof(similar(basis.G_vectors,complex(T), basis.fft_size...))
-    (ψ_reals=[convert(array_type, zeros(complex(T), basis.fft_size...)) for _ = 1:Threads.nthreads()], )
+    (ψ_reals=[convert(array_type(basis), zeros(complex(T), basis.fft_size...)) for _ = 1:Threads.nthreads()], )
 end
 
 Base.:*(H::HamiltonianBlock, ψ) = mul!(similar(ψ), H, ψ)
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index f3f2047dba..0d200f78a5 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -5,7 +5,7 @@ struct Kinetic
     scaling_factor::Real
 end
 Kinetic(; scaling_factor=1) = Kinetic(scaling_factor)
-(kin::Kinetic)(basis; array_type = Array) = TermKinetic(basis, kin.scaling_factor, array_type)
+(kin::Kinetic)(basis) = TermKinetic(basis, kin.scaling_factor)
 function Base.show(io::IO, kin::Kinetic)
     fac = isone(kin.scaling_factor) ? "" : ", scaling_factor=$scaling_factor"
     print(io, "Kinetic($fac)")
@@ -15,8 +15,8 @@ struct TermKinetic <: Term
     scaling_factor::Real  # scaling factor, absorbed into kinetic_energies
     kinetic_energies::Vector{<:AbstractVector}  # kinetic energy 1/2|G+k|^2 for every kpoint
 end
-function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor, array_type = Array) where {T}
-    kinetic_energies = [convert(array_type,
+function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
+    kinetic_energies = [convert(array_type(basis),
                             [T(scaling_factor) * sum(abs2, Gk) / 2
                             for Gk in Gplusk_vectors_cart(basis, kpt)])
                             for kpt in basis.kpoints]
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 4eb3c61220..9867eeef1e 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -66,7 +66,7 @@ end
 Atomic local potential defined by `model.atoms`.
 """
 struct AtomicLocal end
-function (::AtomicLocal)(basis::PlaneWaveBasis{T}; array_type = Array) where {T}
+function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     model = basis.model
 
     # pot_fourier is <e_G|V|e_G'> expanded in a basis of e_{G-G'}
@@ -84,7 +84,7 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}; array_type = Array) where {T}
         pot / sqrt(model.unit_cell_volume)
     end
     #If needed, send to the GPU the atomic local term.
-    pot_real = G_to_r(basis, convert(array_type,pot_fourier))
+    pot_real = G_to_r(basis, convert(array_type(basis),pot_fourier))
     TermAtomicLocal(pot_real)
 end
 
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 683cf878be..47ab910b52 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -3,7 +3,7 @@ Nonlocal term coming from norm-conserving pseudopotentials in Kleinmann-Bylander
 ``\text{Energy} = \sum_a \sum_{ij} \sum_{n} f_n <ψ_n|p_{ai}> D_{ij} <p_{aj}|ψ_n>.``
 """
 struct AtomicNonlocal end
-function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}; array_type = Array) where {T}
+function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
     model = basis.model
 
     # keep only pseudopotential atoms and positions
@@ -14,8 +14,8 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}; array_type = Array) where
 
     isempty(psp_groups) && return TermNoop()
     ops = map(basis.kpoints) do kpt
-        P = build_projection_vectors_(basis, kpt, psps, psp_positions, array_type=array_type)
-        D = build_projection_coefficients_(T, psps, psp_positions, array_type = array_type)
+        P = build_projection_vectors_(basis, kpt, psps, psp_positions)
+        D = build_projection_coefficients_(T, psps, psp_positions, array_type = array_type(basis))
         NonlocalOperator(basis, kpt, P, D)
     end
     TermAtomicNonlocal(ops)
@@ -31,8 +31,7 @@ end
     isnothing(ψ) && return (E=T(Inf), ops=term.ops)
 
     E = zero(T)
-    array_type = typeof(similar(G_vectors(basis), T, length(occ)))
-    occ = [convert(array_type, oc) for oc in occ]
+    occ = [convert(array_type(basis), oc) for oc in occ]
 
     for (ik, kpt) in enumerate(basis.kpoints)
         Pψ = term.ops[ik].P' * ψ[ik]  # nproj x nband
@@ -93,7 +92,7 @@ end
 # The ordering of the projector indices is (A,l,m,i), where A is running over all
 # atoms, l, m are AM quantum numbers and i is running over all projectors for a
 # given l. The matrix is block-diagonal with non-zeros only if A, l and m agree.
-function build_projection_coefficients_(T, psps, psp_positions; array_type = array_type)
+function build_projection_coefficients_(T, psps, psp_positions; array_type = Array)
     # TODO In the current version the proj_coeffs still has a lot of zeros.
     #      One could improve this by storing the blocks as a list or in a
     #      BlockDiagonal data structure
@@ -145,7 +144,7 @@ where pihat(q) = ∫_R^3 pi(r) e^{-iqr} dr
 We store 1/√Ω pihat(k+G) in proj_vectors.
 """
 function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
-                                   psps, psp_positions; array_type = Array) where {T}
+                                   psps, psp_positions) where {T}
     unit_cell_volume = basis.model.unit_cell_volume
     n_proj = count_n_proj(psps, psp_positions)
     n_G    = length(G_vectors(basis, kpt))
@@ -173,7 +172,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
         end
     end
     @assert offset == n_proj
-    convert(array_type, proj_vectors)
+    convert(array_type(basis), proj_vectors)
 end
 
 """

From 646b44c5fbb78575b526b634d2a0cf26a9cba295 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 16 Aug 2022 10:05:07 +0200
Subject: [PATCH 11/69] Update workarounds: remove iszero and isone, add eigen

---
 src/eigen/lobpcg_hyper_impl.jl | 14 +-------------
 src/workarounds/gpu_arrays.jl  | 16 ++++++++--------
 2 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 2b51b01dcb..67178a81db 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -122,22 +122,10 @@ end
 
 # Perform a Rayleigh-Ritz for the N first eigenvectors.
 @timing function rayleigh_ritz(X::BlockVector, AX::BlockVector, N)
-    rayleigh_ritz(block_overlap(X, AX), N) # block_overlap(X,AX) is an AbstractArray, not a BlockVector
-end
-
-@timing function rayleigh_ritz(XAX::AbstractArray, N)
-    F = eigen(Hermitian(XAX))
+    F = eigen(Hermitian(block_overlap(X, AX))) # block_overlap(X,AX) is an AbstractArray, not a BlockVector
     F.vectors[:,1:N], F.values[1:N]
 end
 
-@timing function rayleigh_ritz(XAX::CuArray, N)
-    if eltype(XAX) <: Complex
-        vals, vects = CUDA.CUSOLVER.heevd!('V','U',XAX)
-    else
-        vals, vects = CUDA.CUSOLVER.syevd!('V','U',XAX)
-    end
-    vects[:,1:N], vals[1:N]
-end
 # B-orthogonalize X (in place) using only one B apply.
 # This uses an unstable method which is only OK if X is already
 # orthogonal (not B-orthogonal) and B is relatively well-conditioned
diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
index d864a0c0c5..3669d9171d 100644
--- a/src/workarounds/gpu_arrays.jl
+++ b/src/workarounds/gpu_arrays.jl
@@ -1,15 +1,15 @@
 #TODO: remove this when it is implemented in GPUArrays
-import LinearAlgebra.dot
+import LinearAlgebra.dot, LinearAlgebra.eigen, LinearAlgebra.RealHermSymComplexHerm
 using LinearAlgebra
 using GPUArrays
-import Base.iszero, Base.isone
 
 LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal,y::AbstractGPUArray) = x'*(D*y)
 
-Base.iszero(x::AbstractGPUMatrix{T}) where {T} = all(iszero, x)
-
-function Base.isone(x::AbstractGPUMatrix{T}) where {T}
-    n,m = size(x)
-    m != n && return false
-    all(iszero, x-I)
+function LinearAlgebra.eigen(A::RealHermSymComplexHerm{T,AT}) where {T,AT <: CuArray}
+    if eltype(A) <: Complex
+        vals, vects = CUDA.CUSOLVER.heevd!('V','U', A.data)
+    else
+        vals, vects = CUDA.CUSOLVER.syevd!('V','U',A.data)
+    end
+    (vectors = vects, values = vals)
 end

From 76c697d11b120debefad1c62b57165fe68c413b7 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 22 Aug 2022 15:01:06 +0200
Subject: [PATCH 12/69] Rename block_mul into * + build e on GPU

---
 src/eigen/lobpcg_hyper_impl.jl | 41 ++++++++++++++++------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 67178a81db..60b46c43e2 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -45,6 +45,7 @@ vprintln(args...) = nothing
 using LinearAlgebra
 using CUDA
 using GPUArrays
+import Base: *
 include("../workarounds/gpu_arrays.jl")
 
 # For now, BlockVector can store arrays of different types (for example, an element of type views and one of type Matrix). Maybe for performance issues it should only store arrays of the same type?
@@ -98,10 +99,10 @@ block_overlap(blocksA::BlockVector, B) = block_overlap(blocksA, make_block_vecto
 block_overlap(A, B) = A' * B #Default fallback method. Note the adjoint.
 
 """Given A as a BlockVector [A1, A2, A3] this forms the matrix-matrix product
-A * B avoiding a concatenation of the blocks to a dense array. block_mul has compatible versions with two Arrays.
-block_overlap always compute  product A*B (no adjoint).
+A * B avoiding a concatenation of the blocks to a dense array. 
+There is also a compatible versions with two Arrays.
 """
-@views function block_mul(Ablock::BlockVector, B)
+@views function *(Ablock::BlockVector, B)
     res = Ablock.blocks[1] * B[1:size(Ablock.blocks[1], 2), :]  # First multiplication
     offset = size(Ablock.blocks[1], 2)
     for block in Ablock.blocks[2:end]
@@ -111,13 +112,9 @@ block_overlap always compute  product A*B (no adjoint).
     res
 end
 
-block_mul(A, Bblock::BlockVector) = error("Not implemented")
-block_mul(A::Tuple, B::Tuple) = error("not implemented")
-block_mul(A, B) = A * B # Default fallback method.
-
 function LinearAlgebra.mul!(res,A::BlockVector,B::AbstractArray,α,β)
-    # Has slightly better performances than a naive res = α*block_mul(A,B) - β*res
-    mul!(res, block_mul(A, B), I, α, β)
+    # Has slightly better performances than a naive res = α*A*B - β*res
+    mul!(res, A*B, I, α, β)
 end
 
 # Perform a Rayleigh-Ritz for the N first eigenvectors.
@@ -246,7 +243,7 @@ end
         # as can happen in extreme cases in the ortho!(cP, cX)
         dropped = drop!(X)
         if dropped != []
-            X[:, dropped] .-= block_mul(Y, block_overlap(BY,X[:, dropped])) #X = X - Y'*BY*X
+            X[:, dropped] .-= Y * block_overlap(BY,X[:, dropped]) #X = X - Y'*BY*X
         end
 
         if norm(BYX) < tol && niter > 1
@@ -298,7 +295,6 @@ end
                         miniter=1, ortho_tol=2eps(real(eltype(X))),
                         n_conv_check=nothing, display_progress=false)
     N, M = size(X)
-    typearray = typeof(X)
 
     # If N is too small, we will likely get in trouble
     error_message(verb) = "The eigenproblem is too small, and the iterative " *
@@ -371,9 +367,9 @@ end
             # wait on updating P because we have to know which vectors
             # to lock (and therefore the residuals) before computing P
             # only for the unlocked vectors. This results in better convergence.
-            new_X  = block_mul(Y, cX)
-            new_AX = block_mul(AY, cX)  # no accuracy loss, since cX orthogonal
-            new_BX = (B == I) ? new_X : block_mul(BY, cX)
+            new_X  = Y * cX
+            new_AX = AY * cX  # no accuracy loss, since cX orthogonal
+            new_BX = (B == I) ? new_X : BY * cX
         end
 
         ### Compute new residuals
@@ -427,11 +423,12 @@ end
             # orthogonalization, see Hetmaniuk & Lehoucq, and Duersch et. al.
             # cP = copy(cX)
             # cP[Xn_indices,:] .= 0
-            e = zeros(eltype(X), size(cX, 1), M - prev_nlocked)
-            for i in 1:length(Xn_indices)
-                e[Xn_indices[i], i] = 1
-            end
-            e = convert(typearray,e)
+
+            lenXn = length(Xn_indices)
+            e = zero(similar(X, size(cX, 1), M - prev_nlocked))
+            lower_diag = one(similar(X, lenXn, lenXn))
+            #e has zeros everywhere except on one of its lower diagonal
+            e[Xn_indices[1] : last(Xn_indices), 1 : lenXn] = lower_diag
 
             cP = cX .- e
             cP = cP[:, Xn_indices]
@@ -439,10 +436,10 @@ end
             ortho!(cP, cX, cX, tol=ortho_tol)
 
             # Get new P
-            new_P  = block_mul( Y, cP)
-            new_AP = block_mul(AY, cP)
+            new_P  = Y * cP
+            new_AP = AY * cP
             if B != I
-                new_BP = block_mul(BY, cP)
+                new_BP = BY * cP
             else
                 new_BP = new_P
             end

From bd684d754ca06240805903c81834bfeab952fb4c Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 23 Aug 2022 09:07:40 +0200
Subject: [PATCH 13/69] Modify the change of basis functions to be GPU
 compatible

---
 src/Model.jl | 47 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/Model.jl b/src/Model.jl
index 6f9e0c90cd..17f16c8092 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -263,13 +263,48 @@ Examples of covectors are forces.
 Reciprocal vectors are a special case: they are covectors, but conventionally have an
 additional factor of 2π in their definition, so they transform rather with 2π times the
 inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red.
+
+The trans_mat functions return the transition matrices required to do such a change of basis.
 =#
-vector_red_to_cart(model::Model, rred)        = model.lattice * rred
-vector_cart_to_red(model::Model, rcart)       = model.inv_lattice * rcart
-covector_red_to_cart(model::Model, fred)      = model.inv_lattice' * fred
-covector_cart_to_red(model::Model, fcart)     = model.lattice' * fcart
-recip_vector_red_to_cart(model::Model, qred)  = model.recip_lattice * qred
-recip_vector_cart_to_red(model::Model, qcart) = model.inv_recip_lattice * qcart
+
+trans_mat_vector_red_to_cart(model::Model) = model.lattice
+trans_mat_vector_cart_to_red(model::Model)   = model.inv_lattice
+trans_mat_covector_red_to_cart(model::Model)      = model.inv_lattice'
+trans_mat_covector_cart_to_red(model::Model)     = model.lattice'
+trans_mat_recip_vector_red_to_cart(model::Model)  = model.recip_lattice 
+trans_mat_recip_vector_cart_to_red(model::Model) = model.inv_recip_lattice
+
+fun_mat_list =(:vector_red_to_cart,
+                :vector_cart_to_red,
+                :covector_red_to_cart,
+                :covector_cart_to_red,
+                :recip_vector_red_to_cart,
+                :recip_vector_cart_to_red
+)
+
+for fun1 in fun_mat_list
+    #=
+    The following functions compute the change of basis for a given vector. To do so,
+    they call the trans_mat functions to get the corresponding transition matrix.
+    These functions can be broadcasted over an Array of vectors: however, they are
+    not GPU compatible, as they require the model, which is not isbits.
+    =#
+    @eval $fun1(model::Model, vec) = $(Symbol("trans_mat_"*string(fun1)))(model::Model) * vec
+    #=
+    The following functions take an AbstractArray of vectors and compute the change of basis
+    for every vector in the AbstractArray: they return an AbstractArray of the same type
+    and size as the input, but containing the vectors in a new basis.
+    These functions are GPU compatible (ie the AbstractArray can be a GPUArray), since
+    they use a map and the transition matrices are static arrays.
+    =#
+    @eval function $(Symbol("map_"*string(fun1)))(model::Model, A::AbstractArray)
+        trans_matrix = $(Symbol("trans_mat_"*string(fun1)))(model)
+        in_new_basis = map(A) do Ai
+            trans_matrix  * Ai
+        end
+        in_new_basis
+    end
+end
 
 #=
 Transformations on vectors and covectors are matrices and comatrices.

From 15d1324cd7311727f7ba91cb0a2697406e5fb16c Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 23 Aug 2022 10:30:33 +0200
Subject: [PATCH 14/69] Keep this branch synced with LOBPCG_GPU

---
 src/eigen/lobpcg_hyper_impl.jl | 38 +++++++++++++++++-----------------
 src/workarounds/gpu_arrays.jl  |  1 +
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 60b46c43e2..702acbd2c5 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -43,42 +43,42 @@
 vprintln(args...) = nothing
 
 using LinearAlgebra
-using CUDA
-using GPUArrays
 import Base: *
 include("../workarounds/gpu_arrays.jl")
 
-# For now, BlockVector can store arrays of different types (for example, an element of type views and one of type Matrix). Maybe for performance issues it should only store arrays of the same type?
+# For now, BlockMatrix can store arrays of different types (for example, an element 
+# of type views and one of type Matrix). Maybe for performance issues it should only
+# store arrays of the same type?
 
-struct BlockVector
+struct BlockMatrix
     blocks::Tuple
     size::Tuple{Int64,Int64}
 end
 
 """
-Build a BlockVector containing the given arrays, from left to right.
+Build a BlockMatrix containing the given arrays, from left to right.
 This function will fail (for now) if:
     -the arrays do not all have the same "height" (ie size[1] must match).
 """
 function make_block_vector(arrays::AbstractArray...)
-    length(arrays) ==0 && error("Empty BlockVector is not currently implemented")
+    length(arrays) ==0 && error("Empty BlockMatrix is not currently implemented")
     n_ref= size(arrays[1])[1]
     m=0
     for array in arrays
         n_i, m_i = size(array)
-        n_ref != n_i && error("The given arrays do not have matching 'height': cannot build a BlockVector out of them.")
+        n_ref != n_i && error("The given arrays do not have matching 'height': cannot build a BlockMatrix out of them.")
         m += m_i
     end
-    BlockVector(arrays, (n_ref,m))
+    BlockMatrix(arrays, (n_ref,m))
 end
 
 
 """
-Given A and B as two BlockVectors [A1, A2, A3], [B1, B2, B3] form the matrix
-A'B (which is not a BlockVector). block_overlap also has compatible versions with two Arrays. 
+Given A and B as two BlockMatrixs [A1, A2, A3], [B1, B2, B3] form the matrix
+A'B (which is not a BlockMatrix). block_overlap also has compatible versions with two Arrays. 
 block_overlap always compute some form of adjoint, ie the product A'*B.
 """
-@views function block_overlap(A::BlockVector, B::BlockVector)
+@views function block_overlap(A::BlockMatrix, B::BlockMatrix)
     rows = A.size[2]
     cols = B.size[2]
     ret = similar(A.blocks[1], rows, cols)
@@ -95,14 +95,14 @@ block_overlap always compute some form of adjoint, ie the product A'*B.
     ret
 end
 
-block_overlap(blocksA::BlockVector, B) = block_overlap(blocksA, make_block_vector(B))
-block_overlap(A, B) = A' * B #Default fallback method. Note the adjoint.
+block_overlap(blocksA::BlockMatrix, B) = block_overlap(blocksA, make_block_vector(B))
+block_overlap(A, B) = A' * B # Default fallback method. Note the adjoint.
 
-"""Given A as a BlockVector [A1, A2, A3] this forms the matrix-matrix product
+"""
+Given A as a BlockMatrix [A1, A2, A3] and B a Matrix, compute the matrix-matrix product
 A * B avoiding a concatenation of the blocks to a dense array. 
-There is also a compatible versions with two Arrays.
 """
-@views function *(Ablock::BlockVector, B)
+@views function *(Ablock::BlockMatrix, B)
     res = Ablock.blocks[1] * B[1:size(Ablock.blocks[1], 2), :]  # First multiplication
     offset = size(Ablock.blocks[1], 2)
     for block in Ablock.blocks[2:end]
@@ -112,14 +112,14 @@ There is also a compatible versions with two Arrays.
     res
 end
 
-function LinearAlgebra.mul!(res,A::BlockVector,B::AbstractArray,α,β)
+function LinearAlgebra.mul!(res,A::BlockMatrix,B::AbstractArray,α,β)
     # Has slightly better performances than a naive res = α*A*B - β*res
     mul!(res, A*B, I, α, β)
 end
 
 # Perform a Rayleigh-Ritz for the N first eigenvectors.
-@timing function rayleigh_ritz(X::BlockVector, AX::BlockVector, N)
-    F = eigen(Hermitian(block_overlap(X, AX))) # block_overlap(X,AX) is an AbstractArray, not a BlockVector
+@timing function rayleigh_ritz(X::BlockMatrix, AX::BlockMatrix, N)
+    F = eigen(Hermitian(block_overlap(X, AX))) # block_overlap(X,AX) is an AbstractArray, not a BlockMatrix
     F.vectors[:,1:N], F.values[1:N]
 end
 
diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
index 3669d9171d..cb3e43fd09 100644
--- a/src/workarounds/gpu_arrays.jl
+++ b/src/workarounds/gpu_arrays.jl
@@ -2,6 +2,7 @@
 import LinearAlgebra.dot, LinearAlgebra.eigen, LinearAlgebra.RealHermSymComplexHerm
 using LinearAlgebra
 using GPUArrays
+using CUDA
 
 LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal,y::AbstractGPUArray) = x'*(D*y)
 

From 62d9f79aeeaba0a45e82c6b5e5eda00f618fa3c8 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 23 Aug 2022 11:13:28 +0200
Subject: [PATCH 15/69] Add the Hartree term

---
 src/Model.jl          | 2 +-
 src/PlaneWaveBasis.jl | 2 +-
 src/terms/hartree.jl  | 8 ++++++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/Model.jl b/src/Model.jl
index 17f16c8092..9878841c27 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -297,7 +297,7 @@ for fun1 in fun_mat_list
     These functions are GPU compatible (ie the AbstractArray can be a GPUArray), since
     they use a map and the transition matrices are static arrays.
     =#
-    @eval function $(Symbol("map_"*string(fun1)))(model::Model, A::AbstractArray)
+    @eval function $(Symbol("map_"*string(fun1)))(model::Model, A::AbstractArray{AT}) where {AT <: Vec3}
         trans_matrix = $(Symbol("trans_mat_"*string(fun1)))(model)
         in_new_basis = map(A) do Ai
             trans_matrix  * Ai
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 5456c07d94..0c7fe3bbca 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -377,7 +377,7 @@ array_type(basis::PlaneWaveBasis{T,VT,AT}) where {T, VT, AT} = AT
 
 The list of ``G`` vectors of a given `basis` or `kpt`, in cartesian coordinates.
 """
-G_vectors_cart(basis::PlaneWaveBasis) = recip_vector_red_to_cart.(basis.model, G_vectors(basis))
+G_vectors_cart(basis::PlaneWaveBasis) = map_recip_vector_red_to_cart(basis.model, G_vectors(basis))
 function G_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
     recip_vector_red_to_cart.(basis.model, G_vectors(basis, kpt))
 end
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 07dc61548a..9c72e2d58d 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -31,13 +31,17 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where T
 
     # Solving the Poisson equation ΔV = -4π ρ in Fourier space
     # is multiplying elementwise by 4π / |G|^2.
-    poisson_green_coeffs = 4T(π) ./ [sum(abs2, G) for G in G_vectors_cart(basis)]
+    poisson_green_coeffs = map(G_vectors_cart(basis)) do G
+        4T(π) /sum(abs2, G)
+    end
     if !isempty(model.atoms)
         # Assume positive charge from nuclei is exactly compensated by the electrons
         sum_charges = sum(charge_ionic, model.atoms)
         @assert sum_charges == model.n_electrons
     end
-    poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
+    poisson_green_coeffs[1:1,1:1,1:1] .= zero(similar(G_vectors(basis), T, 1,1,1))
+    #Hackish way to do the following
+    # poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end

From 19100cfdb500c4c074987d72d9e1f7fad92eb00b Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 23 Aug 2022 16:38:06 +0200
Subject: [PATCH 16/69] Remove CUDA dependency from ortho_qr

---
 src/common/ortho.jl | 5 ++---
 src/eigen/diag.jl   | 2 +-
 src/orbitals.jl     | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index 3936c5bb25..c7023091ed 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,3 +1,2 @@
-# Orthonormalize
-ortho_qr(φk::AbstractArray) = Matrix(qr(φk).Q) #LinearAlgebra.QRCompactWYQ -> Matrix
-ortho_qr(φk::CuArray) = CuArray(qr(φk).Q) #CUDA.CUSOLVER.CuQRPackedQ -> CuArray
+# Orthonormalize and return an array of the same type as the input.
+ortho_qr(φk::AbstractArray; array_type = Matrix) = array_type(qr(φk).Q)
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index ee46fa1199..baeb8d9db0 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -36,7 +36,7 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
                 # use information from previous k-point
                 X0 = interpolate_kpoint(results[ik - 1].X, ham.basis, kpoints[ik - 1],
                                         ham.basis, kpoints[ik])
-                ψguessk = ortho_qr(X0)  # Re-orthogonalize and renormalize
+                ψguessk = ortho_qr(X0; array_type = array_type(basis))  # Re-orthogonalize and renormalize
             else
                 ψguessk = random_orbitals(ham.basis, kpt, nev_per_kpoint)
             end
diff --git a/src/orbitals.jl b/src/orbitals.jl
index 22ce9c2e51..966e9e5173 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -55,5 +55,5 @@ using Random
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
     orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
     randn!(TaskLocalRNG(), orbitals) #Force the use of GPUArrays.jl's random function if using the GPU
-    ortho_qr(orbitals)
-end
\ No newline at end of file
+    ortho_qr(orbitals; array_type = array_type(basis))
+end

From 1184ec1e017a506b1711ccab58370f72aaab22e0 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 29 Aug 2022 16:21:40 +0200
Subject: [PATCH 17/69] Bugfix when plotting bandstructure + typo fixes

---
 src/PlaneWaveBasis.jl | 4 ++--
 src/eigen/diag.jl     | 2 +-
 src/terms/hartree.jl  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 0c7fe3bbca..bd883c5ad0 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -326,7 +326,7 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints
     PlaneWaveBasis(basis.model, basis.Ecut,
                    basis.fft_size, basis.variational,
                    kcoords, kweights, kgrid, kshift,
-                   basis.symmetries_respect_rgrid, basis.comm_kpts, array_type = array_type(basis))
+                   basis.symmetries_respect_rgrid, basis.comm_kpts, array_type(basis))
 end
 
 """
@@ -412,7 +412,7 @@ r_vectors(basis::PlaneWaveBasis) = basis.r_vectors
 
 The list of ``r`` vectors, in cartesian coordinates.
 """
-r_vectors_cart(basis::PlaneWaveBasis) = vector_red_to_cart.(basis.model, r_vectors(basis))
+r_vectors_cart(basis::PlaneWaveBasis) = map_recip_vector_red_to_cart(basis.model, r_vectors(basis))
 
 
 """
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index baeb8d9db0..da4f007d8b 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -36,7 +36,7 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
                 # use information from previous k-point
                 X0 = interpolate_kpoint(results[ik - 1].X, ham.basis, kpoints[ik - 1],
                                         ham.basis, kpoints[ik])
-                ψguessk = ortho_qr(X0; array_type = array_type(basis))  # Re-orthogonalize and renormalize
+                ψguessk = ortho_qr(X0; array_type = array_type(ham.basis))  # Re-orthogonalize and renormalize
             else
                 ψguessk = random_orbitals(ham.basis, kpt, nev_per_kpoint)
             end
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 9c72e2d58d..2d6f1820e4 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -40,7 +40,7 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where T
         @assert sum_charges == model.n_electrons
     end
     poisson_green_coeffs[1:1,1:1,1:1] .= zero(similar(G_vectors(basis), T, 1,1,1))
-    #Hackish way to do the following
+    ## Hackish way to do the following
     # poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)

From 833928b2c76ee71cfafdc5866daf24fe74788ec4 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 30 Aug 2022 18:09:24 +0200
Subject: [PATCH 18/69] Make all mixings except Chi0 mixing GPU compatible

---
 src/scf/mixing.jl | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index 38f70e5ef8..da15beb976 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -50,7 +50,9 @@ end
 @timing "KerkerMixing" function mix_density(mixing::KerkerMixing, basis::PlaneWaveBasis,
                                             δF; kwargs...)
     T      = eltype(δF)
-    G²     = [sum(abs2, G) for G in G_vectors_cart(basis)]
+    G²     = map(G_vectors_cart(basis)) do G
+                sum(abs2, G)
+            end
     kTF    = T.(mixing.kTF)
     ΔDOS_Ω = T.(mixing.ΔDOS_Ω)
 
@@ -72,7 +74,6 @@ end
     δF_fourier     = r_to_G(basis, δF)
     δFtot_fourier  = total_density(δF_fourier)
     δFspin_fourier = spin_density(δF_fourier)
-
     δρtot_fourier = δFtot_fourier .* G² ./ (kTF.^2 .+ G²)
     δρtot = G_to_r(basis, δρtot_fourier)
 
@@ -135,7 +136,9 @@ end
     εr > 1 / sqrt(eps(T)) && return mix_density(KerkerMixing(kTF=kTF), basis, δF)
 
     C0 = 1 - εr
-    Gsq = [sum(abs2, G) for G in G_vectors_cart(basis)]
+    Gsq = map(G_vectors_cart(basis)) do G
+        sum(abs2, G)
+    end
     δF_fourier = r_to_G(basis, δF)
     δρ = @. δF_fourier * (kTF^2 - C0 * Gsq) / (εr * kTF^2 - C0 * Gsq)
     δρ = G_to_r(basis, δρ)

From e12f35be94c0ef904c2b918db7454351884d75ac Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 1 Sep 2022 20:06:31 +0200
Subject: [PATCH 19/69] Prettier way to overload eigen for CuArrays

---
 src/workarounds/gpu_arrays.jl | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
index cb3e43fd09..91b5873054 100644
--- a/src/workarounds/gpu_arrays.jl
+++ b/src/workarounds/gpu_arrays.jl
@@ -1,16 +1,17 @@
 #TODO: remove this when it is implemented in GPUArrays
-import LinearAlgebra.dot, LinearAlgebra.eigen, LinearAlgebra.RealHermSymComplexHerm
+import LinearAlgebra.dot, LinearAlgebra.eigen
 using LinearAlgebra
 using GPUArrays
 using CUDA
 
 LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal,y::AbstractGPUArray) = x'*(D*y)
 
-function LinearAlgebra.eigen(A::RealHermSymComplexHerm{T,AT}) where {T,AT <: CuArray}
-    if eltype(A) <: Complex
-        vals, vects = CUDA.CUSOLVER.heevd!('V','U', A.data)
-    else
-        vals, vects = CUDA.CUSOLVER.syevd!('V','U',A.data)
-    end
+function LinearAlgebra.eigen(A::Hermitian{T,AT}) where {T <: Complex,AT <: CuArray}
+    vals, vects = CUDA.CUSOLVER.heevd!('V','U', A.data)
+    (vectors = vects, values = vals)
+end
+
+function LinearAlgebra.eigen(A::Hermitian{T,AT}) where {T <: Real,AT <: CuArray}
+    vals, vects = CUDA.CUSOLVER.syevd!('V','U', A.data)
     (vectors = vects, values = vals)
 end

From a0c40669bffc8e6685cdfbd5be51982c7724c65f Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 6 Sep 2022 10:33:47 +0200
Subject: [PATCH 20/69] Update comments + remove unnecessary code

---
 src/PlaneWaveBasis.jl        | 2 +-
 src/common/ortho.jl          | 2 +-
 src/eigen/diag.jl            | 5 +----
 src/eigen/preconditioners.jl | 3 +--
 src/terms/kinetic.jl         | 1 +
 src/terms/local.jl           | 9 ++++++---
 src/terms/nonlocal.jl        | 2 ++
 7 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index bd883c5ad0..65e26c9d30 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -343,7 +343,7 @@ function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type = Array)
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
     Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
-    convert(array_type, Gs) #Offload to GPU if needed.
+    convert(array_type, Gs)  # GPU computation only : offload the Gs to the GPU.
 end
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index c7023091ed..ffdf1659fe 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,2 +1,2 @@
-# Orthonormalize and return an array of the same type as the input.
+# Orthonormalize and convert to an array of the type array_type.
 ortho_qr(φk::AbstractArray; array_type = Matrix) = array_type(qr(φk).Q)
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index da4f007d8b..e56157bd79 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -54,10 +54,7 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     end
 
     # Transform results into a nicer datastructure
-    # TODO: keep λ on the gpu? Careful then, as self_consistent_field's eigenvalues
-    # will be a CuArray -> due to the Smearing.occupation function, occupation will also
-    # be a CuArray, so no scalar indexing (in ene_ops, in compute_density...)
-    (λ=[Array(real.(res.λ)) for res in results],
+    (λ=[Array(real.(res.λ)) for res in results],  # GPU computation only : get λ back on the CPU
      X=[res.X for res in results],
      residual_norms=[res.residual_norms for res in results],
      iterations=[res.iterations for res in results],
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index 6e37d54ae4..fa9182cb54 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -37,7 +37,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     isempty(kinetic_term) && error("Preconditioner should be disabled when no Kinetic term is used.")
     scaling = only(kinetic_term).scaling_factor
     kin = Vector{T}([scaling * sum(abs2, q) for q in Gplusk_vectors_cart(basis, kpt)] ./ 2)
-    kin = convert(array_type(basis), kin)
+    kin = convert(array_type(basis), kin)  # GPU computation only : offload kinetic energies to the GPU
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
 
@@ -69,5 +69,4 @@ end
 
 function precondprep!(P::PreconditionerTPA, X)
     P.mean_kin = [real(dot(x, Diagonal(P.kin), x)) for x in eachcol(X)]
-    P.mean_kin = convert(array_type(P.basis), P.mean_kin)
 end
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index 0d200f78a5..a382f97a6b 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -16,6 +16,7 @@ struct TermKinetic <: Term
     kinetic_energies::Vector{<:AbstractVector}  # kinetic energy 1/2|G+k|^2 for every kpoint
 end
 function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
+    # GPU computation only : build the kinetic energies on CPU then offload them to GPU
     kinetic_energies = [convert(array_type(basis),
                             [T(scaling_factor) * sum(abs2, Gk) / 2
                             for Gk in Gplusk_vectors_cart(basis, kpt)])
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 9867eeef1e..384c76a24d 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -74,8 +74,11 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     # positions, this involves a form factor (`local_potential_fourier`)
     # and a structure factor e^{-i G·r}
 
-    #This operation needs to be done only once, so let's try to make it happen on CPU (else we needs to isbitsify the pseudopotentials)
-    pot_fourier = map(Array(G_vectors(basis))) do G
+    # GPU computation only : put the Gs on CPU for compatibility with the
+    # pseudopotentials which are not isbits
+    Gs = Array(G_vectors(basis))
+
+    pot_fourier = map(Gs) do G
         pot = sum(model.atom_groups) do group
             element = model.atoms[first(group)]
             form_factor::T = local_potential_fourier(element, norm(model.recip_lattice * G))
@@ -83,7 +86,7 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
         end
         pot / sqrt(model.unit_cell_volume)
     end
-    #If needed, send to the GPU the atomic local term.
+    # GPU computation only : build the potential values on CPU then offload them to GPU
     pot_real = G_to_r(basis, convert(array_type(basis),pot_fourier))
     TermAtomicLocal(pot_real)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 47ab910b52..ac7791da87 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -108,6 +108,7 @@ function build_projection_coefficients_(T, psps, psp_positions; array_type = Arr
     end # psp, r
     @assert count == n_proj
 
+    # GPU computation only : build the coefficients on CPU then offload them to the GPU
     convert(array_type,proj_coeffs)
 end
 
@@ -172,6 +173,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
         end
     end
     @assert offset == n_proj
+    # GPU computation only : build the vectors on CPU then offload them to the GPU
     convert(array_type(basis), proj_vectors)
 end
 

From 9cdff931d80d3fd6d543c340dfc33b2a3909e67c Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 6 Sep 2022 18:31:48 +0200
Subject: [PATCH 21/69] Update the GPU example

---
 examples/gpu.jl | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/examples/gpu.jl b/examples/gpu.jl
index f59c12d12b..dcf8842385 100644
--- a/examples/gpu.jl
+++ b/examples/gpu.jl
@@ -1,26 +1,28 @@
 using DFTK
 using CUDA
-using MKL
-setup_threading(n_blas=1)
 
 a = 10.263141334305942  # Lattice constant in Bohr
 lattice = a / 2 .* [[0 1 1.]; [1 0 1.]; [1 1 0.]]
 Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
 atoms     = [Si, Si]
-positions = [ones(3)/8, -ones(3)/8];
-terms_LDA = [Kinetic(), AtomicLocal(), AtomicNonlocal()]
+positions = [ones(3)/8, -ones(3)/8]
+terms = [Kinetic(),
+            AtomicLocal(),
+            AtomicNonlocal(),
+            Ewald(),
+            PspCorrection(),
+            Entropy(),
+            Hartree()]
+# Now, build a supercell to have a larger system
+pystruct = pymatgen_structure(lattice, atoms, positions)
+pystruct.make_supercell([4,2,2])
+lattice   = load_lattice(pystruct)
+positions = load_positions(pystruct)
+atoms     = fill(Si, length(positions))
 
-# Setup an LDA model and discretize using
-# a single k-point and a small `Ecut` of 5 Hartree.
-mod = Model(lattice, atoms, positions; terms=terms_LDA,symmetries=false)
-basis = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1))
-basis_gpu = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1), array_type = CuArray)
+model = Model(lattice, atoms, positions; terms=terms, temperature=1e-3, symmetries=false)
+# Notice the only difference in the code, with the optional argument array_type
+basis_gpu = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), array_type = CuArray)
+# You can now check that some of the fields of the basis, such as the G_vectors, are CuArrays
 
-
-DFTK.reset_timer!(DFTK.timer)
-scfres = self_consistent_field(basis; solver=scf_damping_solver(1.0), is_converged=DFTK.ScfConvergenceDensity(1e-3))
-println(DFTK.timer)
-
-DFTK.reset_timer!(DFTK.timer)
-scfres_gpu = self_consistent_field(basis_gpu; solver=scf_damping_solver(1.0), is_converged=DFTK.ScfConvergenceDensity(1e-3))
-println(DFTK.timer)
+scfres = self_consistent_field(basis_gpu; tol=1e-3, solver=scf_anderson_solver(), mixing = KerkerMixing())

From 20b7b109130660ac8122b304208d7028d1afcd8b Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 7 Sep 2022 10:49:30 +0200
Subject: [PATCH 22/69] Put the Gvectors for each kpoint on the GPU

---
 src/PlaneWaveBasis.jl        | 23 +++++++++++++++--------
 src/eigen/preconditioners.jl |  5 +++--
 src/terms/kinetic.jl         |  9 ++++++---
 src/terms/nonlocal.jl        |  4 +++-
 4 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 65e26c9d30..92886f6198 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -15,7 +15,7 @@ Discretization information for ``k``-point-dependent quantities such as orbitals
 More generally, a ``k``-point is a block of the Hamiltonian;
 eg collinear spin is treated by doubling the number of kpoints.
 """
-struct Kpoint{T <: Real}
+struct Kpoint{T<:Real, AT <: AbstractArray, GT <: AT}
     spin::Int                     # Spin component can be 1 or 2 as index into what is
                                   # returned by the `spin_components` function
     coordinate::Vec3{T}           # Fractional coordinate of k-point
@@ -23,8 +23,9 @@ struct Kpoint{T <: Real}
                                   # G_vectors(basis)[kpt.mapping[i]] == G_vectors(basis, kpt)[i]
     mapping_inv::Dict{Int, Int}   # Inverse of `mapping`:
                                   # G_vectors(basis)[i] == G_vectors(basis, kpt)[mapping_inv[i]]
-    G_vectors::Vector{Vec3{Int}}  # Wave vectors in integer coordinates:
+    G_vectors::GT                 # Wave vectors in integer coordinates:
                                   # ({G, 1/2 |k+G|^2 ≤ Ecut})
+                                  # The G_vectors are a 1D array of Vec3 of Ints
 end
 
 @doc raw"""
@@ -113,7 +114,7 @@ Base.Broadcast.broadcastable(basis::PlaneWaveBasis) = Ref(basis)
 Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
 @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut;
-                               variational=true) where T
+                               variational=true, array_type = Array) where T
     kpoints_per_spin = [Kpoint[] for _ in 1:model.n_spin_components]
     for k in kcoords
         k = Vec3{T}(k)  # rationals are sloooow
@@ -129,10 +130,14 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
                 push!(Gvecs_k, G)
             end
         end
+        Gvecs_k = convert(array_type, Gvecs_k)  # GPU computation only: offload the Gs to the GPU
+        AT = array_type
+        GT = array_type{Vec3{Int }}
+
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
             push!(kpoints_per_spin[iσ],
-                  Kpoint(iσ, k, mapping, mapping_inv, Gvecs_k))
+                  Kpoint{T,AT,GT}(iσ, k, mapping, mapping_inv, Gvecs_k))
         end
     end
 
@@ -140,7 +145,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 end
 function build_kpoints(basis::PlaneWaveBasis, kcoords)
     build_kpoints(basis.model, basis.fft_size, kcoords, basis.Ecut;
-                  variational=basis.variational)
+                  variational=basis.variational, array_type = array_type(basis))
 end
 
 # Lowest-level constructor, should not be called directly.
@@ -236,7 +241,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
         "Non-variational calculations are experimental. " *
         "Not all features of DFTK may be supported or work as intended."
     )
-    kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut; variational)
+    kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut; variational, array_type)
     # kpoints is now possibly twice the size of krange. Make things consistent
     if model.n_spin_components == 2
         krange_thisproc   = vcat(krange_thisproc, n_kpt .+ krange_thisproc)
@@ -388,7 +393,9 @@ end
 The list of ``G + k`` vectors, in reduced coordinates.
 """
 function Gplusk_vectors(basis::PlaneWaveBasis, kpt::Kpoint)
-    map(G -> G + kpt.coordinate, G_vectors(basis, kpt))
+    coordinate = kpt.coordinate
+    Gs = G_vectors(basis,kpt)
+    map(G -> G + coordinate, Gs)
 end
 
 @doc raw"""
@@ -397,7 +404,7 @@ end
 The list of ``G + k`` vectors, in cartesian coordinates.
 """
 function Gplusk_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
-    recip_vector_red_to_cart.(basis.model, Gplusk_vectors(basis, kpt))
+    map_recip_vector_red_to_cart(basis.model, Gplusk_vectors(basis, kpt))
 end
 
 @doc raw"""
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index fa9182cb54..586a075995 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -36,8 +36,9 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     kinetic_term = [t for t in basis.model.term_types if t isa Kinetic]
     isempty(kinetic_term) && error("Preconditioner should be disabled when no Kinetic term is used.")
     scaling = only(kinetic_term).scaling_factor
-    kin = Vector{T}([scaling * sum(abs2, q) for q in Gplusk_vectors_cart(basis, kpt)] ./ 2)
-    kin = convert(array_type(basis), kin)  # GPU computation only : offload kinetic energies to the GPU
+    kin = map(Gplusk_vectors_cart(basis, kpt)) do q
+        scaling * sum(abs2, q) /2
+    end
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
 
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index a382f97a6b..a2784c127d 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -17,9 +17,12 @@ struct TermKinetic <: Term
 end
 function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     # GPU computation only : build the kinetic energies on CPU then offload them to GPU
-    kinetic_energies = [convert(array_type(basis),
-                            [T(scaling_factor) * sum(abs2, Gk) / 2
-                            for Gk in Gplusk_vectors_cart(basis, kpt)])
+    function build_kin(Gs)
+        map(Gs) do Gk
+            T(scaling_factor) * sum(abs2, Gk) / 2
+        end
+    end
+    kinetic_energies = [build_kin(Gplusk_vectors_cart(basis, kpt))
                             for kpt in basis.kpoints]
     TermKinetic(T(scaling_factor), kinetic_energies)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index ac7791da87..e93cce631c 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -163,7 +163,8 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
         # Combine with structure factors
         for r in positions
             # k+G in this formula can also be G, this only changes an unimportant phase factor
-            structure_factors = map(q -> cis2pi(-dot(q, r)), Gplusk_vectors(basis, kpt))
+            Gs = Array(Gplusk_vectors(basis, kpt))  # GPU computation only: get Gs on CPU for the following map
+            structure_factors = map(q -> cis2pi(-dot(q, r)), Gs)
             @views for iproj = 1:count_n_proj(psp)
                 proj_vectors[:, offset+iproj] .= (
                     structure_factors .* form_factors[:, iproj] ./ sqrt(unit_cell_volume)
@@ -181,6 +182,7 @@ end
 Build form factors (Fourier transforms of projectors) for an atom centered at 0.
 """
 function build_form_factors(psp, qs)
+    qs = Array(qs)  # GPU computation only : get qs back on CPU
     qnorms = norm.(qs)
     T = real(eltype(qnorms))
     # Compute position-independent form factors

From a2d811b16b13f17ab90ece10a08906fc3f2b137d Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 7 Sep 2022 11:34:16 +0200
Subject: [PATCH 23/69] Bugfix after launching the tests

---
 src/PlaneWaveBasis.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 92886f6198..391f538300 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -419,7 +419,7 @@ r_vectors(basis::PlaneWaveBasis) = basis.r_vectors
 
 The list of ``r`` vectors, in cartesian coordinates.
 """
-r_vectors_cart(basis::PlaneWaveBasis) = map_recip_vector_red_to_cart(basis.model, r_vectors(basis))
+r_vectors_cart(basis::PlaneWaveBasis) = map_vector_red_to_cart(basis.model, r_vectors(basis))
 
 
 """

From 8ee55a45ca48e1ca34582fbfc87fa37c64b5cf3b Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 7 Sep 2022 12:02:14 +0200
Subject: [PATCH 24/69] Put the occupation on GPU

---
 src/densities.jl      | 1 +
 src/occupation.jl     | 5 +++--
 src/terms/kinetic.jl  | 1 +
 src/terms/nonlocal.jl | 1 -
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/densities.jl b/src/densities.jl
index 4855bce9c3..84ca34bf7b 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -20,6 +20,7 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
 """
 @views @timing function compute_density(basis, ψ, occupation)
     T = promote_type(eltype(basis), real(eltype(ψ[1])))
+    occupation = [Array(oc) for oc in occupation]  # GPU computation only: offload to CPU
 
     # we split the total iteration range (ik, n) in chunks, and parallelize over them
     ik_n = [(ik, n) for ik = 1:length(basis.kpoints) for n = 1:size(ψ[ik], 2)]
diff --git a/src/occupation.jl b/src/occupation.jl
index 2ac3e0d29b..126c25a73b 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -11,8 +11,9 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature)
-     for εk in eigenvalues]
+    [convert(array_type(basis), # GPU computation only: put each occupation on GPU
+        filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature))
+        for εk in eigenvalues]
 end
 
 """
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index a2784c127d..d35c63b401 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -32,6 +32,7 @@ end
     ops = [FourierMultiplication(basis, kpoint, term.kinetic_energies[ik])
            for (ik, kpoint) in enumerate(basis.kpoints)]
     isnothing(ψ) && return (E=T(Inf), ops=ops)
+    occ = [Array(oc) for oc in occ]  # GPU computation only: put the occupations back on CPU
 
     E = zero(T)
     for (ik, k) in enumerate(basis.kpoints)
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index e93cce631c..04d68c5fe3 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -31,7 +31,6 @@ end
     isnothing(ψ) && return (E=T(Inf), ops=term.ops)
 
     E = zero(T)
-    occ = [convert(array_type(basis), oc) for oc in occ]
 
     for (ik, kpt) in enumerate(basis.kpoints)
         Pψ = term.ops[ik].P' * ψ[ik]  # nproj x nband

From b700d1c723fbeca6f9130aaf1181845d8d46a48b Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 5 Oct 2022 16:28:09 +0200
Subject: [PATCH 25/69] GPU compatibility for Anderson acceleration

---
 src/scf/potential_mixing.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/scf/potential_mixing.jl b/src/scf/potential_mixing.jl
index cfb41fc0a5..440e09a06f 100644
--- a/src/scf/potential_mixing.jl
+++ b/src/scf/potential_mixing.jl
@@ -61,6 +61,7 @@ function (anderson::AndersonAcceleration)(xₙ, αₙ, Pfxₙ)
 
     xₙ₊₁ = vec(xₙ) .+ αₙ .* vec(Pfxₙ)
     βs   = -(Mfac \ vec(Pfxₙ))
+    βs = Array(βs)  # GPU computation only : get βs back on the CPU so we can iterate through it
     for (iβ, β) in enumerate(βs)
         xₙ₊₁ .+= β .* (xs[iβ] .- vec(xₙ) .+ αₙ .* (Pfxs[iβ] .- vec(Pfxₙ)))
     end

From 981e7a1a5d9356de7b3f513c5ae7ad4b29cc5893 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 6 Oct 2022 10:28:35 +0200
Subject: [PATCH 26/69] Remove unnecessary dependencies

---
 src/fft.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index 09f1c16bd3..d4f2cc6ed1 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -1,6 +1,4 @@
 import FFTW
-import CUDA
-import GPUArrays
 import AbstractFFTs
 
 #
@@ -256,7 +254,8 @@ _fftw_flags(::Type{Float64}) = FFTW.MEASURE
 Plan a FFT of type `T` and size `fft_size`, spending some time on finding an
 optimal algorithm. (Inplace, out-of-place) x (forward, backward) FFT plans are returned.
 """
-#Removed the flags as CUDA's plan_fft doesn't need flags. If this is a performance issue, we should check array_type's type then call either FFTW.plan_fft(tmp, flags = ...) or CUDA.plan_fft(tmp)
+# Removed the flags as CUDA's plan_fft doesn't need flags. If this is a performance issue, we should
+# check array_type's type then call either FFTW.plan_fft(tmp, flags = ...) or CUDA.plan_fft(tmp)
 function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{Float32,Float64}}
     tmp = similar(array_type, Complex{T}, fft_size...)
     ipFFT = AbstractFFTs.plan_fft!(tmp)

From 5c3b620150580ae85580fa83752bc77a411b90a1 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 6 Oct 2022 14:05:03 +0200
Subject: [PATCH 27/69] Crude bugfix for the nbands algorithm

---
 src/scf/nbands_algorithm.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/scf/nbands_algorithm.jl b/src/scf/nbands_algorithm.jl
index f75880d204..91cb5db44f 100644
--- a/src/scf/nbands_algorithm.jl
+++ b/src/scf/nbands_algorithm.jl
@@ -68,6 +68,9 @@ function determine_n_bands(bands::AdaptiveBands, occupation::AbstractVector,
     # TODO Could return different bands per k-Points
 
     # Determine number of bands to be actually converged
+
+    occupation = map(Array, occupation) # GPU computation only: bring occupation back
+    # on the CPU, or maximum (following line) will fail
     n_bands_occ = maximum(occupation) do occk
         something(findlast(fnk -> fnk ≥ bands.occupation_threshold, occk), length(occk) + 1)
     end

From bf77db27d63e53c1e96bcb032faf3dcef0e72c16 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 6 Oct 2022 15:28:06 +0200
Subject: [PATCH 28/69] Less CPU-GPU transfers in guess_density + whitespaces

---
 src/PlaneWaveBasis.jl          | 11 +++++++----
 src/eigen/lobpcg_hyper_impl.jl | 12 ++++++------
 src/eigen/preconditioners.jl   |  2 +-
 src/guess_density.jl           | 25 +++++++++++++------------
 src/occupation.jl              |  2 +-
 src/orbitals.jl                |  4 ++--
 src/scf/nbands_algorithm.jl    |  2 +-
 7 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 2bf3918cb1..484b5ed494 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -427,10 +427,9 @@ Return the index tuple `I` such that `G_vectors(basis)[I] == G`
 or the index `i` such that `G_vectors(basis, kpoint)[i] == G`.
 Returns nothing if outside the range of valid wave vectors.
 """
-@inline function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{T}) where {T <: Integer}
-    # the inline declaration encourages the compiler to hoist these (G-independent) precomputations
-    start = .- cld.(basis.fft_size .- 1, 2)
-    stop  = fld.(basis.fft_size .- 1, 2)
+@inline function index_G_vectors(fft_size::Tuple, G::AbstractVector{T}) where {T <: Integer}
+    start = .- cld.(fft_size .- 1, 2)
+    stop  = fld.(fft_size .- 1, 2)
     lengths = stop .- start .+ 1
 
     # FFTs store wavevectors as [0 1 2 3 -2 -1] (example for N=5)
@@ -445,6 +444,10 @@ Returns nothing if outside the range of valid wave vectors.
     end
 end
 
+@inline function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{T}) where {T <: Integer}
+    index_G_vectors(basis.fft_size, G)
+end
+
 function index_G_vectors(basis::PlaneWaveBasis, kpoint::Kpoint,
                          G::AbstractVector{T}) where {T <: Integer}
     fft_size = basis.fft_size
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index c8fd30c3b4..cb099fbe17 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -183,7 +183,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
         end
         invR = inv(R)
         @assert all(!isnan, invR)
-        rmul!(X, invR) # we do not use X/R because we use invR next
+        rmul!(X, invR)  # we do not use X/R because we use invR next
 
         # We would like growth_factor *= opnorm(inv(R)) but it's too
         # expensive, so we use an upper bound which is sharp enough to
@@ -196,7 +196,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
         growth_factor *= norminvR
 
         # condR = 1/LAPACK.trcon!('I', 'U', 'N', Array(R))
-        condR = normest(R)*norminvR # in practice this seems to be an OK estimate
+        condR = normest(R)*norminvR  # in practice this seems to be an OK estimate
 
         vprintln("Ortho(X) success? $success ", eps(real(T))*condR^2, " < $tol")
 
@@ -267,7 +267,7 @@ end
         niter > 10 && error("Ortho(X,Y) is failing badly, this should never happen")
         niter += 1
     end
-    vprintln("ortho choleskys: ", ninners) # get how many Choleskys are performed
+    vprintln("ortho choleskys: ", ninners)  # get how many Choleskys are performed
 
     # @assert (norm(BY'X)) < tol
     # @assert (norm(X'X-I)) < tol
@@ -345,7 +345,7 @@ end
     full_BX = BX
 
     while true
-        if niter > 0 # first iteration is just to compute the residuals (no X update)
+        if niter > 0  # first iteration is just to compute the residuals (no X update)
             ###  Perform the Rayleigh-Ritz
             mul!(AR, A, R)
             n_matvec += size(R, 2)
@@ -413,7 +413,7 @@ end
             return final_retval(full_X, full_AX, resid_history, niter, n_matvec)
         end
         newly_locked = nlocked - prev_nlocked
-        active = newly_locked+1:size(X,2) # newly active vectors
+        active = newly_locked+1:size(X,2)  # newly active vectors
 
         if niter > 0
             ### compute P = Y*cP only for the newly active vectors
@@ -484,7 +484,7 @@ end
         # Orthogonalize R wrt all X, newly active P
         if niter > 0
             Z  = LazyHcat(full_X, P)
-            BZ = LazyHcat(full_BX, BP) # data shared with (full_X, P) in non-general case
+            BZ = LazyHcat(full_BX, BP)  # data shared with (full_X, P) in non-general case
         else
             Z  = full_X
             BZ = full_BX
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index c07ce55bbe..699c9f8ef9 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -28,7 +28,7 @@ mutable struct PreconditionerTPA{T <: Real}
     kpt::Kpoint
     kin::AbstractVector{T}  # kinetic energy of every G
     mean_kin::Union{Nothing, Vector{T}}  # mean kinetic energy of every band
-    default_shift::T # if mean_kin is not set by `precondprep!`, this will be used for the shift
+    default_shift::T  # if mean_kin is not set by `precondprep!`, this will be used for the shift
 end
 
 function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=1) where {T}
diff --git a/src/guess_density.jl b/src/guess_density.jl
index 85f65289d9..dd0b9bc0af 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -93,8 +93,7 @@ which follow the functional form
 and are placed at `position` (in fractional coordinates).
 """
 function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
-    ρ = similar(G_vectors(basis), complex(T), basis.fft_size)
-    ρ .= 0
+    ρ = zeros_like(G_vectors(basis), complex(T), basis.fft_size...)
 
     isempty(gaussians) && return irfft(basis, ρ)
 
@@ -106,21 +105,23 @@ function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
     # where f(x) is a weighted gaussian
     #
     # is formed from a superposition of atomic densities, each scaled by a prefactor
-    ρ = Array(ρ)
-    for (iG, G) in enumerate(Array(G_vectors(basis)))
-        # Ensure that we only set G-vectors that have a -G counterpart
-        if isnothing(index_G_vectors(basis, -G))
-            ρ[iG] = zero(complex(T))
-            continue
-        end
 
-        Gsq = sum(abs2, basis.model.recip_lattice * G)
+    fft_size = basis.fft_size
+    function build_ρ(G)
+        if isnothing(index_G_vectors(fft_size, -G))
+            return zero(complex(T))
+        end
+        Gsq = sum(abs2, recip_lattice * G)
+        res = zero(complex(T))
         for (coeff, decay_length, r) in gaussians
             form_factor::T = exp(-Gsq * T(decay_length)^2)
-            ρ[iG] += T(coeff) * form_factor * cis2pi(-dot(G, r))
+            res += T(coeff) * form_factor * cis2pi(-dot(G, r))
         end
+        res
     end
-    ρ = convert(array_type(basis), ρ)
+    #  Can't use map! as the Gs are converted from an array of Vec3 to an array of complex
+    ρ = map(build_ρ, basis.G_vectors)
+
     irfft(basis, ρ / sqrt(basis.model.unit_cell_volume))
 end
 
diff --git a/src/occupation.jl b/src/occupation.jl
index efd94273ae..f6855bf66c 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -11,7 +11,7 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [convert(array_type(basis), # GPU computation only: put each occupation on GPU
+    [convert(array_type(basis),  # GPU computation only: put each occupation on GPU
         filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature))
         for εk in eigenvalues]
 end
diff --git a/src/orbitals.jl b/src/orbitals.jl
index 7e2d27ae38..c99d4f7960 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -42,7 +42,7 @@ end
 function unsafe_unpack_ψ(x, sizes_ψ)
     lengths = prod.(sizes_ψ)
     ends = cumsum(lengths)
-    # We unsafe_wrap the resulting array to avoid a complicated type for ψ.    
+    # We unsafe_wrap the resulting array to avoid a complicated type for ψ.
     map(1:length(sizes_ψ)) do ik
         unsafe_wrap(Array{complex(eltype(x))},
                     pointer(@views x[ends[ik]-lengths[ik]+1:ends[ik]]),
@@ -54,6 +54,6 @@ unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 using Random
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
     orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
-    randn!(TaskLocalRNG(), orbitals) #Force the use of GPUArrays.jl's random function if using the GPU
+    randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function if using the GPU
     ortho_qr(orbitals; array_type = array_type(basis))
 end
diff --git a/src/scf/nbands_algorithm.jl b/src/scf/nbands_algorithm.jl
index 91cb5db44f..30ae3692da 100644
--- a/src/scf/nbands_algorithm.jl
+++ b/src/scf/nbands_algorithm.jl
@@ -69,7 +69,7 @@ function determine_n_bands(bands::AdaptiveBands, occupation::AbstractVector,
 
     # Determine number of bands to be actually converged
 
-    occupation = map(Array, occupation) # GPU computation only: bring occupation back
+    occupation = map(Array, occupation)  # GPU computation only: bring occupation back
     # on the CPU, or maximum (following line) will fail
     n_bands_occ = maximum(occupation) do occk
         something(findlast(fnk -> fnk ≥ bands.occupation_threshold, occk), length(occk) + 1)

From 0c8067e82a3979f4aecbaf8eaa6d8d04f117a61c Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 11 Oct 2022 10:58:08 +0200
Subject: [PATCH 29/69] Whitespace, typos + stick to the same conventions

---
 src/Model.jl                   | 10 +++++-----
 src/PlaneWaveBasis.jl          |  2 +-
 src/common/ortho.jl            |  2 +-
 src/eigen/diag_lobpcg_hyper.jl |  1 -
 src/fft.jl                     |  1 +
 src/scf/mixing.jl              |  3 +++
 src/scf/nbands_algorithm.jl    |  2 +-
 src/terms/hartree.jl           |  3 +++
 src/terms/kinetic.jl           |  2 +-
 src/terms/local.jl             |  2 +-
 src/terms/nonlocal.jl          |  2 +-
 11 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/Model.jl b/src/Model.jl
index 9878841c27..1513191906 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -267,11 +267,11 @@ inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red
 The trans_mat functions return the transition matrices required to do such a change of basis.
 =#
 
-trans_mat_vector_red_to_cart(model::Model) = model.lattice
-trans_mat_vector_cart_to_red(model::Model)   = model.inv_lattice
-trans_mat_covector_red_to_cart(model::Model)      = model.inv_lattice'
+trans_mat_vector_red_to_cart(model::Model)       = model.lattice
+trans_mat_vector_cart_to_red(model::Model)       = model.inv_lattice
+trans_mat_covector_red_to_cart(model::Model)     = model.inv_lattice'
 trans_mat_covector_cart_to_red(model::Model)     = model.lattice'
-trans_mat_recip_vector_red_to_cart(model::Model)  = model.recip_lattice 
+trans_mat_recip_vector_red_to_cart(model::Model) = model.recip_lattice
 trans_mat_recip_vector_cart_to_red(model::Model) = model.inv_recip_lattice
 
 fun_mat_list =(:vector_red_to_cart,
@@ -287,7 +287,7 @@ for fun1 in fun_mat_list
     The following functions compute the change of basis for a given vector. To do so,
     they call the trans_mat functions to get the corresponding transition matrix.
     These functions can be broadcasted over an Array of vectors: however, they are
-    not GPU compatible, as they require the model, which is no isbits.
+    not GPU compatible, as they require the model, which is not isbits.
     =#
     @eval $fun1(model::Model, vec) = $(Symbol("trans_mat_"*string(fun1)))(model::Model) * vec
     #=
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 484b5ed494..3527aa979d 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -15,7 +15,7 @@ Discretization information for ``k``-point-dependent quantities such as orbitals
 More generally, a ``k``-point is a block of the Hamiltonian;
 eg collinear spin is treated by doubling the number of kpoints.
 """
-struct Kpoint{T<:Real, AT <: AbstractArray, GT <: AT}
+struct Kpoint{T <: Real, AT <: AbstractArray, GT <: AT}
     spin::Int                     # Spin component can be 1 or 2 as index into what is
                                   # returned by the `spin_components` function
     coordinate::Vec3{T}           # Fractional coordinate of k-point
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index c18c0ab642..7d89622016 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,2 +1,2 @@
-# Orthonormalize and convert to an array of the type array_type.
+# Orthonormalize and convert to an array of the type "array_type".
 @timing ortho_qr(φk::AbstractArray; array_type = Matrix) = array_type(qr(φk).Q)
diff --git a/src/eigen/diag_lobpcg_hyper.jl b/src/eigen/diag_lobpcg_hyper.jl
index 8548d79c58..832eaf2cee 100644
--- a/src/eigen/diag_lobpcg_hyper.jl
+++ b/src/eigen/diag_lobpcg_hyper.jl
@@ -9,7 +9,6 @@ function lobpcg_hyper(A, X0; maxiter=100, prec=nothing,
     result = LOBPCG(A, X0, I, prec, tol, maxiter; n_conv_check=n_conv_check, kwargs...)
 
     n_conv_check === nothing && (n_conv_check = size(X0, 2))
-
     converged = maximum(result.residual_norms[1:n_conv_check]) < tol
     iterations = size(result.residual_history, 2) - 1
 
diff --git a/src/fft.jl b/src/fft.jl
index 75b2ab6006..4001111d8b 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -276,6 +276,7 @@ function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
 
+
 # TODO Some grid sizes are broken in the generic FFT implementation
 # in FourierTransforms, for more details see workarounds/fft_generic.jl
 default_primes(::Type{Float32}) = (2, 3, 5)
diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index 58d33499d2..39596ba327 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -75,9 +75,12 @@ end
     δFtot_fourier  = total_density(δF_fourier)
     δFspin_fourier = spin_density(δF_fourier)
     δρtot_fourier = δFtot_fourier .* G² ./ (kTF.^2 .+ G²)
+    # force_real! is currently not GPU compatible, so we have to do this very ugly thing
+    # of calling back the array on CPU, running force_real!, then putting it back on GPU
     δρtot_fourier = Array(δρtot_fourier)
     force_real!(basis, δρtot_fourier)
     δρtot_fourier = convert(array_type(basis), δρtot_fourier)
+
     δρtot = irfft(basis, δρtot_fourier)
 
     # Copy DC component, otherwise it never gets updated
diff --git a/src/scf/nbands_algorithm.jl b/src/scf/nbands_algorithm.jl
index 30ae3692da..5b445707b2 100644
--- a/src/scf/nbands_algorithm.jl
+++ b/src/scf/nbands_algorithm.jl
@@ -69,7 +69,7 @@ function determine_n_bands(bands::AdaptiveBands, occupation::AbstractVector,
 
     # Determine number of bands to be actually converged
 
-    occupation = map(Array, occupation)  # GPU computation only: bring occupation back
+    occupation = [Array(oc) for oc in occupation]  # GPU computation only: bring occupation back
     # on the CPU, or maximum (following line) will fail
     n_bands_occ = maximum(occupation) do occk
         something(findlast(fnk -> fnk ≥ bands.occupation_threshold, occk), length(occk) + 1)
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 441a012711..666d9a1247 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -42,6 +42,9 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     poisson_green_coeffs[1:1,1:1,1:1] .= zero(similar(G_vectors(basis), T, 1,1,1))
     ## Hackish way to do the following
     # poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
+
+    # force_real! is currently not GPU compatible, so we have to do this very ugly thing
+    # of calling back the array on CPU, running force_real!, then putting it back on GPU
     poisson_green_coeffs = Array(poisson_green_coeffs)
     force_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
     poisson_green_coeffs = convert(array_type(basis), poisson_green_coeffs)
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index f4ddb44896..3f3ea6682b 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -36,7 +36,7 @@ end
     if isnothing(ψ) || isnothing(occupation)
         return (E=T(Inf), ops=ops)
     end
-    occupation = map(Array, occupation)  # GPU computation only: put the occupations back on CPU
+    occupation = [Array(oc) for oc in occupation]  # GPU computation only: put the occupations back on CPU
 
     E = zero(T)
     for (ik, ψk) in enumerate(ψ)
diff --git a/src/terms/local.jl b/src/terms/local.jl
index b044c6bf37..bfe2b03ab7 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -90,7 +90,7 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     end
     force_real!(basis, pot_fourier)  # Symmetrize Fourier coeffs to have real iFFT
     # GPU computation only : build the potential values on CPU then offload them to GPU
-    pot_real = irfft(basis, convert(array_type(basis),pot_fourier))
+    pot_real = irfft(basis, convert(array_type(basis), pot_fourier))
 
     TermAtomicLocal(pot_real)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index b38bf4d94b..84790400e5 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -109,7 +109,7 @@ function build_projection_coefficients_(T, psps, psp_positions; array_type = Arr
     @assert count == n_proj
 
     # GPU computation only : build the coefficients on CPU then offload them to the GPU
-    convert(array_type,proj_coeffs)
+    convert(array_type, proj_coeffs)
 end
 
 # Builds the projection coefficient matrix for a single atom

From b5112257dab173cc2ac6fcf46560ea4e991b6ebb Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Fri, 14 Oct 2022 12:28:33 +0200
Subject: [PATCH 30/69] Removed unnecessary imports

---
 src/DFTK.jl     | 3 ---
 src/orbitals.jl | 3 ++-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 5d9a0b8822..612109afd1 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -13,9 +13,6 @@ using spglib_jll
 using Unitful
 using UnitfulAtomic
 using ForwardDiff
-using AbstractFFTs
-using GPUArraysCore
-using CUDA
 using Random
 using ChainRulesCore
 
diff --git a/src/orbitals.jl b/src/orbitals.jl
index c99d4f7960..b52ff46abd 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -1,3 +1,5 @@
+using Random  # Used to have a generic API for CPU and GPU computations alike: see random_orbitals
+
 # Returns the occupied orbitals, the occupation array and optionally the eigenvalues without
 # virtual states (or states with small occupation level for metals).
 # threshold is a parameter to distinguish between states we want to keep and the
@@ -51,7 +53,6 @@ function unsafe_unpack_ψ(x, sizes_ψ)
 end
 unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 
-using Random
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
     orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
     randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function if using the GPU

From 28e05e74a04d6349e3ce6568e757f6a158d77ff1 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Fri, 14 Oct 2022 12:02:01 +0200
Subject: [PATCH 31/69] WIP: solve dependency issues

---
 src/DFTK.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 612109afd1..256387d1b5 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -13,6 +13,8 @@ using spglib_jll
 using Unitful
 using UnitfulAtomic
 using ForwardDiff
+using AbstractFFTs
+using GPUArraysCore
 using Random
 using ChainRulesCore
 

From 06eb5b6dcfa3c2e66394eff5e87ced827046e1ac Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Fri, 14 Oct 2022 15:26:42 +0200
Subject: [PATCH 32/69] WIP: remove TaskLocalRNG for Julia versions < 1.7

---
 src/orbitals.jl | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/orbitals.jl b/src/orbitals.jl
index b52ff46abd..11c0d41444 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -53,8 +53,17 @@ function unsafe_unpack_ψ(x, sizes_ψ)
 end
 unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 
-function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
-    orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
-    randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function if using the GPU
-    ortho_qr(orbitals; array_type = array_type(basis))
+@static if VERSION < v"1.7"
+    # Don't use TaskLocalRNG as it is not available.
+    function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
+        orbitals = randn(Complex{T}, length(G_vectors(basis, kpt)), howmany)
+        orbitals = convert(array_type(basis), orbitals)
+        ortho_qr(orbitals; array_type = array_type(basis))
+    end
+else
+    function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
+        orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
+        randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function if using the GPU
+        ortho_qr(orbitals; array_type = array_type(basis))
+    end
 end

From de28ac46d1709b651fcd26d51960121c8a0ae746 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Fri, 14 Oct 2022 14:42:40 +0200
Subject: [PATCH 33/69] Better fallback for TaskLocalRNG for Julia < 1.7

---
 src/orbitals.jl | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/orbitals.jl b/src/orbitals.jl
index 11c0d41444..06509b2708 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -53,17 +53,14 @@ function unsafe_unpack_ψ(x, sizes_ψ)
 end
 unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 
-@static if VERSION < v"1.7"
-    # Don't use TaskLocalRNG as it is not available.
-    function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
+function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
+    @static if VERSION < v"1.7"
+        # Don't use TaskLocalRNG as it is not available.
         orbitals = randn(Complex{T}, length(G_vectors(basis, kpt)), howmany)
         orbitals = convert(array_type(basis), orbitals)
-        ortho_qr(orbitals; array_type = array_type(basis))
-    end
-else
-    function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
+    else
         orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
         randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function if using the GPU
-        ortho_qr(orbitals; array_type = array_type(basis))
     end
+    ortho_qr(orbitals; array_type = array_type(basis))
 end

From 1c5a5b506147f9b342948e7a05766c8c47af2f9f Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Fri, 14 Oct 2022 15:18:55 +0200
Subject: [PATCH 34/69] Adapt the workarounds for forward differentiation to
 the new API

---
 src/workarounds/forwarddiff_rules.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index 5a21fea069..3d6a400f30 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -107,7 +107,7 @@ next_working_fft_size(::Type{<:ForwardDiff.Dual}, size::Int) = size
 
 _fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED
 
-function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}, fft_size)
+function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}
     tmp = Array{complex(T)}(undef, fft_size...) # TODO think about other Array types
     opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
     opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T))
@@ -209,7 +209,7 @@ function self_consistent_field(basis_dual::PlaneWaveBasis{T};
         εF_dual = T(scfres.εF)  # Only needed for entropy term
         eigenvalues_dual = [T.(εk) for εk in scfres.eigenvalues]
         _, ham_dual = energy_hamiltonian(basis_dual, ψ_dual, occupation_dual;
-                                         ρ=ρ_dual, eigenvalues=eigenvalues_dual, 
+                                         ρ=ρ_dual, eigenvalues=eigenvalues_dual,
                                          εF=εF_dual)
         ham_dual * ψ_dual
     end

From 866d6bf35075d834608468e846c03648c46552c4 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Fri, 14 Oct 2022 16:53:26 +0200
Subject: [PATCH 35/69] Change build_fft_plans in fft_generic.jl to take arrays
 as arguments

---
 src/workarounds/fft_generic.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/workarounds/fft_generic.jl b/src/workarounds/fft_generic.jl
index 371ec2b061..993bca485b 100644
--- a/src/workarounds/fft_generic.jl
+++ b/src/workarounds/fft_generic.jl
@@ -23,7 +23,7 @@ end
 default_primes(::Any) = (2, )
 
 # Generic fallback function, Float32 and Float64 specialization in fft.jl
-function build_fft_plans(T, fft_size)
+function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T}
     tmp = Array{Complex{T}}(undef, fft_size...)
 
     # Note: FourierTransforms has no support for in-place FFTs at the moment

From 7ad1744d41e88bd897e786834a38764db2b1baba Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Mon, 17 Oct 2022 17:04:02 +0200
Subject: [PATCH 36/69] Fixes: simplify Kpoint type params, propagate array_type

---
 src/PlaneWaveBasis.jl  | 37 ++++++++++++++++++-------------------
 src/external/jld2io.jl | 25 ++++++++++++++++---------
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 3527aa979d..fd60dc6378 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -15,7 +15,7 @@ Discretization information for ``k``-point-dependent quantities such as orbitals
 More generally, a ``k``-point is a block of the Hamiltonian;
 eg collinear spin is treated by doubling the number of kpoints.
 """
-struct Kpoint{T <: Real, AT <: AbstractArray, GT <: AT}
+struct Kpoint{T <: Real, GT <: AbstractArray}
     spin::Int                     # Spin component can be 1 or 2 as index into what is
                                   # returned by the `spin_components` function
     coordinate::Vec3{T}           # Fractional coordinate of k-point
@@ -40,7 +40,7 @@ Normalization conventions:
 
 `ifft` and `fft` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT, AT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT <: AT, RT <: AT, AT <: AbstractArray}
+struct PlaneWaveBasis{T, VT, AT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT, RT, AT <: AbstractArray}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 
@@ -111,10 +111,15 @@ end
 import Base.Broadcast.broadcastable
 Base.Broadcast.broadcastable(basis::PlaneWaveBasis) = Ref(basis)
 
+"""
+Return the type of array used for computations (Array if on CPU, CuArray,
+ROCArray... if on GPU).
+"""
+array_type(basis::PlaneWaveBasis{T,VT,AT}) where {T, VT, AT} = AT
 Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
 @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut;
-                               variational=true, array_type = Array) where {T}
+                               variational=true, array_type::Type=Array) where {T}
     kpoints_per_spin = [Kpoint[] for _ in 1:model.n_spin_components]
     for k in kcoords
         k = Vec3{T}(k)  # rationals are sloooow
@@ -131,13 +136,12 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
             end
         end
         Gvecs_k = convert(array_type, Gvecs_k)  # GPU computation only: offload the Gs to the GPU
-        AT = array_type
-        GT = array_type{Vec3{Int }}
+        GT = array_type{Vec3{Int}}
 
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
             push!(kpoints_per_spin[iσ],
-                  Kpoint{T,AT,GT}(iσ, k, mapping, mapping_inv, Gvecs_k))
+                  Kpoint{T,GT}(iσ, k, mapping, mapping_inv, Gvecs_k))
         end
     end
 
@@ -153,7 +157,7 @@ end
 # and are stored in PlaneWaveBasis for easy reconstruction.
 function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         kcoords, kweights, kgrid, kshift,
-                        symmetries_respect_rgrid, comm_kpts, array_type = Array) where {T <: Real}
+                        symmetries_respect_rgrid, comm_kpts, array_type::Type) where {T <: Real}
     # Validate fft_size
     if variational
         max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
@@ -257,9 +261,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3]
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    RT = array_type{Vec3{VT }, 3}
-    GT = array_type{Vec3{Int }, 3}
-
+    RT = array_type{Vec3{VT}, 3}
+    GT = array_type{Vec3{Int}, 3}
     basis = PlaneWaveBasis{T,value_type(T), array_type, GT, RT}(
         model, fft_size, dvol,
         Ecut, variational,
@@ -286,7 +289,7 @@ end
                                 variational=true, fft_size=nothing,
                                 kgrid=nothing, kshift=nothing,
                                 symmetries_respect_rgrid=isnothing(fft_size),
-                                comm_kpts=MPI.COMM_WORLD, array_type = Array) where {T <: Real}
+                                comm_kpts=MPI.COMM_WORLD, array_type=Array) where {T <: Real}
     if isnothing(fft_size)
         @assert variational
         if symmetries_respect_rgrid
@@ -314,7 +317,8 @@ number of points in each dimension and `kshift` the shift (0 or 1/2 in each dire
 If not specified a grid is generated using `kgrid_from_minimal_spacing` with
 a minimal spacing of `2π * 0.022` per Bohr.
 """
-function PlaneWaveBasis(model::Model; Ecut,
+function PlaneWaveBasis(model::Model;
+                        Ecut,
                         kgrid=kgrid_from_minimal_spacing(model, 2π * 0.022),
                         kshift=zeros(3),
                         kwargs...)
@@ -369,11 +373,6 @@ or a ``k``-point `kpt`.
 G_vectors(basis::PlaneWaveBasis) = basis.G_vectors
 G_vectors(::PlaneWaveBasis, kpt::Kpoint) = kpt.G_vectors
 
-"""
-Return the type of array used for computations (Array if on CPU, CuArray,
-ROCArray... if on GPU).
-"""
-array_type(basis::PlaneWaveBasis{T,VT,AT}) where {T, VT, AT} = AT
 
 
 @doc raw"""
@@ -482,7 +481,7 @@ it as a `PlaneWaveBasis`. On the other (non-master) processes `nothing` is retur
 The returned object should not be used for computations and only to extract data
 for post-processing and serialisation to disk.
 """
-function gather_kpts(basis::PlaneWaveBasis)
+function gather_kpts(basis)
     # No need to allocate and setup a new basis object
     mpi_nprocs(basis.comm_kpts) == 1 && return basis
 
@@ -507,7 +506,7 @@ function gather_kpts(basis::PlaneWaveBasis)
                        basis.kshift,
                        basis.symmetries_respect_rgrid,
                        comm_kpts=MPI.COMM_SELF,
-                      )
+                       array_type=DFTK.array_type(basis))
     end
 end
 
diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl
index 8dfbb05cac..7b1e1ff2d6 100644
--- a/src/external/jld2io.jl
+++ b/src/external/jld2io.jl
@@ -74,7 +74,7 @@ load_scfres(file::AbstractString) = JLD2.jldopen(load_scfres, file, "r")
 #
 # Custom serialisations
 #
-struct PlaneWaveBasisSerialisation{T <: Real}
+struct PlaneWaveBasisSerialisation{T <: Real, AT <: AbstractArray}
     model::Model{T,T}
     Ecut::T
     variational::Bool
@@ -85,10 +85,13 @@ struct PlaneWaveBasisSerialisation{T <: Real}
     symmetries_respect_rgrid::Bool
     fft_size::Tuple{Int, Int, Int}
 end
-JLD2.writeas(::Type{PlaneWaveBasis{T,T}}) where {T} = PlaneWaveBasisSerialisation{T}
+function JLD2.writeas(::Type{PlaneWaveBasis{T,T,AT,GT,RT}}) where {T,AT,GT,RT}
+    PlaneWaveBasisSerialisation{T,AT}
+end
 
-function Base.convert(::Type{PlaneWaveBasisSerialisation{T}}, basis::PlaneWaveBasis{T,T}) where {T}
-    PlaneWaveBasisSerialisation{T}(
+function Base.convert(::Type{PlaneWaveBasisSerialisation{T,AT}},
+                      basis::PlaneWaveBasis{T,T,AT}) where {T,AT}
+    PlaneWaveBasisSerialisation{T,AT}(
         basis.model,
         basis.Ecut,
         basis.variational,
@@ -101,9 +104,13 @@ function Base.convert(::Type{PlaneWaveBasisSerialisation{T}}, basis::PlaneWaveBa
     )
 end
 
-function Base.convert(::Type{PlaneWaveBasis{T,T}}, serial::PlaneWaveBasisSerialisation{T}) where {T}
-    PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords,
-                   serial.kweights; serial.fft_size,
-                   serial.kgrid, serial.kshift, serial.symmetries_respect_rgrid,
-                   serial.variational)
+function Base.convert(::Type{PlaneWaveBasis{T,T,AT,GT,RT}},
+                      serial::PlaneWaveBasisSerialisation{T,AT}) where {T,AT,GT,RT}
+    PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords, serial.kweights;
+                   serial.fft_size,
+                   serial.kgrid,
+                   serial.kshift,
+                   serial.symmetries_respect_rgrid,
+                   serial.variational,
+                   array_type=AT)
 end

From f6e45b9eac9f053d97d560979b989428883d36a5 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 24 Oct 2022 13:57:06 +0200
Subject: [PATCH 37/69] Fix tests: pass array_type to PlaneWaveBasis constructor

---
 src/postprocess/stresses.jl | 2 +-
 src/supercell.jl            | 2 +-
 src/symmetry.jl             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/postprocess/stresses.jl b/src/postprocess/stresses.jl
index ba4970c83f..583f038d28 100644
--- a/src/postprocess/stresses.jl
+++ b/src/postprocess/stresses.jl
@@ -19,7 +19,7 @@ Compute the stresses (= 1/Vol dE/d(M*lattice), taken at M=I) of an obtained SCF
                                    basis.Ecut, basis.fft_size, basis.variational,
                                    basis.kcoords_global, basis.kweights_global,
                                    basis.kgrid, basis.kshift, basis.symmetries_respect_rgrid,
-                                   basis.comm_kpts)
+                                   basis.comm_kpts, array_type(basis))
         ρ = DFTK.compute_density(new_basis, scfres.ψ, scfres.occupation)
         energies, _ = energy_hamiltonian(new_basis, scfres.ψ, scfres.occupation;
                                          ρ, scfres.eigenvalues, scfres.εF)
diff --git a/src/supercell.jl b/src/supercell.jl
index 4fb68ef511..906e85983e 100644
--- a/src/supercell.jl
+++ b/src/supercell.jl
@@ -43,7 +43,7 @@ function cell_to_supercell(basis::PlaneWaveBasis)
                    ones(3),              # kgrid = Γ point only
                    basis.kshift,         # kshift
                    symmetries_respect_rgrid,
-                   basis.comm_kpts)
+                   basis.comm_kpts, array_type(basis))
 end
 
 @doc raw"""
diff --git a/src/symmetry.jl b/src/symmetry.jl
index ec170ce5f5..6b4557b79b 100644
--- a/src/symmetry.jl
+++ b/src/symmetry.jl
@@ -269,7 +269,7 @@ function unfold_bz(basis::PlaneWaveBasis)
                               basis.Ecut, basis.fft_size, basis.variational,
                               kcoords, [1/length(kcoords) for _ in kcoords],
                               basis.kgrid, basis.kshift,
-                              basis.symmetries_respect_rgrid, basis.comm_kpts)
+                              basis.symmetries_respect_rgrid, basis.comm_kpts, array_type(basis))
     end
 end
 

From bd4e315b062b7b7b126da7db282e70c5cf485463 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Wed, 26 Oct 2022 15:27:45 +0200
Subject: [PATCH 38/69] Update examples/gpu.jl

---
 examples/gpu.jl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/gpu.jl b/examples/gpu.jl
index dcf8842385..8d1ff8fa37 100644
--- a/examples/gpu.jl
+++ b/examples/gpu.jl
@@ -14,10 +14,9 @@ terms = [Kinetic(),
             Entropy(),
             Hartree()]
 # Now, build a supercell to have a larger system
-pystruct = pymatgen_structure(lattice, atoms, positions)
-pystruct.make_supercell([4,2,2])
-lattice   = load_lattice(pystruct)
-positions = load_positions(pystruct)
+supercell = ase_atoms(lattice, atoms, positions) * (repeat, 1, 1)
+lattice   = load_lattice(supercell)
+positions = load_positions(supercell)
 atoms     = fill(Si, length(positions))
 
 model = Model(lattice, atoms, positions; terms=terms, temperature=1e-3, symmetries=false)

From a339a4dfd5a599b4b8b4060b942cb8ee0e07d421 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Fri, 28 Oct 2022 12:08:58 +0200
Subject: [PATCH 39/69] Fix CPU performance bug when using FFTs

---
 src/fft.jl | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index 4001111d8b..3ed401571d 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -266,8 +266,7 @@ _fftw_flags(::Type{Float64}) = FFTW.MEASURE
 Plan a FFT of type `T` and size `fft_size`, spending some time on finding an
 optimal algorithm. (Inplace, out-of-place) x (forward, backward) FFT plans are returned.
 """
-# Removed the flags as CUDA's plan_fft doesn't need flags. If this is a performance issue, we should
-# check array_type's type then call either FFTW.plan_fft(tmp, flags = ...) or CUDA.plan_fft(tmp)
+# Default fallback, which will be used when doing GPU computations.
 function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{Float32,Float64}}
     tmp = similar(array_type, Complex{T}, fft_size...)
     ipFFT = AbstractFFTs.plan_fft!(tmp)
@@ -276,6 +275,14 @@ function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
 
+# Specific CPU version, using flags to be a bit faster.
+function build_fft_plans(array_type::Array{T}, fft_size) where {T<:Union{Float32,Float64}}
+    tmp = Array{Complex{T}}(undef, fft_size...)
+    ipFFT = FFTW.plan_fft!(tmp, flags=_fftw_flags(T))
+    opFFT = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
+    # backward by inverting and stripping off normalizations
+    ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
+end
 
 # TODO Some grid sizes are broken in the generic FFT implementation
 # in FourierTransforms, for more details see workarounds/fft_generic.jl

From 695263a34abb45f066550b2d776d68e1983c85af Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 2 Nov 2022 10:50:59 +0100
Subject: [PATCH 40/69] WIP: remove the AT type in PWB

---
 src/PlaneWaveBasis.jl    |  9 ++++++---
 src/common/ortho.jl      |  9 +++++++++
 src/common/zeros_like.jl | 11 +++++++++++
 src/densities.jl         |  4 ++--
 src/guess_density.jl     |  2 +-
 src/occupation.jl        |  2 +-
 src/orbitals.jl          |  2 +-
 src/terms/Hamiltonian.jl |  2 +-
 src/terms/hartree.jl     |  2 +-
 src/terms/nonlocal.jl    |  2 +-
 10 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index fd60dc6378..f6c4441d2c 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -40,7 +40,7 @@ Normalization conventions:
 
 `ifft` and `fft` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT, AT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT, RT, AT <: AbstractArray}
+struct PlaneWaveBasis{T, VT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT <: AbstractArray, RT <: AbstractArray}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 
@@ -115,7 +115,10 @@ Base.Broadcast.broadcastable(basis::PlaneWaveBasis) = Ref(basis)
 Return the type of array used for computations (Array if on CPU, CuArray,
 ROCArray... if on GPU).
 """
-array_type(basis::PlaneWaveBasis{T,VT,AT}) where {T, VT, AT} = AT
+function array_type(basis::PlaneWaveBasis)
+    Base.typename(typeof(basis.G_vectors)).wrapper
+end
+
 Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
 @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut;
@@ -263,7 +266,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
 
     RT = array_type{Vec3{VT}, 3}
     GT = array_type{Vec3{Int}, 3}
-    basis = PlaneWaveBasis{T,value_type(T), array_type, GT, RT}(
+    basis = PlaneWaveBasis{T,value_type(T), GT, RT}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index 7d89622016..3d2548fbeb 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,2 +1,11 @@
 # Orthonormalize and convert to an array of the type "array_type".
 @timing ortho_qr(φk::AbstractArray; array_type = Matrix) = array_type(qr(φk).Q)
+
+# @timing function ortho_qr(φk::AbstractArray; array_type = Matrix)
+#     Q = qr(φk).Q
+#     println(typeof(φk))
+#     print(size(Array(Q)))
+#     res = convert(typeof(φk), Q)
+#     print(size(res))
+#     res
+# end
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index ebcdfad05d..204cc82a98 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -8,3 +8,14 @@ end
 zeros_like(X::AbstractArray, dims::Integer...) = zeros_like(X, eltype(X), dims...)
 zeros_like(X::Array, T::Type=eltype(X), dims::Integer...=size(X)...) = zeros(T, dims...)
 zeros_like(X::StaticArray, T::Type=eltype(X), dims::Integer...=size(X)...) = @SArray zeros(T, dims...)
+
+function copy_like(array_model::AbstractArray, src::AbstractArray)
+    copy!(similar(array_model, eltype(src), size(src)...), src)
+end
+
+copy_like(array_model::Array, src::Array) = src
+
+# function copy_like(array_model::Array, src::Array, T::Type=eltype(src), dims::Integer...=size(src)...)
+#     T == eltype(src) && dims == size(src) && src
+#     copy_like(array_model, src, T, dims...)
+# end
diff --git a/src/densities.jl b/src/densities.jl
index 7a86a24cbc..ecc14c721d 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -27,9 +27,9 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
     chunk_length = cld(length(ik_n), Threads.nthreads())
 
     # chunk-local variables
-    ρ_chunklocal = [convert(array_type(basis), zeros(T, basis.fft_size..., basis.model.n_spin_components))
+    ρ_chunklocal = [zeros_like(basis.G_vectors, T, basis.fft_size..., basis.model.n_spin_components)
                     for _ = 1:Threads.nthreads()]
-    ψnk_real_chunklocal = [convert(array_type(basis), zeros(complex(T), basis.fft_size))
+    ψnk_real_chunklocal = [zeros_like(basis.G_vectors, complex(T), basis.fft_size...)
                             for _ = 1:Threads.nthreads()]
 
     # TODO We should probably pass occupation_threshold here and ignore bands
diff --git a/src/guess_density.jl b/src/guess_density.jl
index dd0b9bc0af..9785b0a9a9 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -66,7 +66,7 @@ function _guess_spin_density(basis::PlaneWaveBasis{T}, atoms, positions, magneti
         @warn("Returning zero spin density guess, because no initial magnetization has " *
               "been specified in any of the given elements / atoms. Your SCF will likely " *
               "not converge to a spin-broken solution.")
-        return convert(array_type(basis),zeros(T, basis.fft_size))
+        return zeros_like(basis.G_vectors, T, basis.fft_size...)
     end
 
     @assert length(magmoms) == length(atoms) == length(positions)
diff --git a/src/occupation.jl b/src/occupation.jl
index 08164e0614..bd0fd91ddb 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -29,7 +29,7 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [convert(array_type(basis),  # GPU computation only: put each occupation on GPU
+    [copy_like(basis.G_vectors,  # GPU computation only: put each occupation on GPU
         filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature))
         for εk in eigenvalues]
 end
diff --git a/src/orbitals.jl b/src/orbitals.jl
index 06509b2708..cf4be47b58 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -57,7 +57,7 @@ function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {
     @static if VERSION < v"1.7"
         # Don't use TaskLocalRNG as it is not available.
         orbitals = randn(Complex{T}, length(G_vectors(basis, kpt)), howmany)
-        orbitals = convert(array_type(basis), orbitals)
+        orbitals = copy_like(basis.G_vectors, orbitals)
     else
         orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
         randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function if using the GPU
diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl
index 4252588fb0..9ac79dd86d 100644
--- a/src/terms/Hamiltonian.jl
+++ b/src/terms/Hamiltonian.jl
@@ -51,7 +51,7 @@ function HamiltonianBlock(basis, kpoint, operators, scratch=ham_allocate_scratch
     end
 end
 function ham_allocate_scratch_(basis::PlaneWaveBasis{T}) where {T}
-    (ψ_reals=[convert(array_type(basis), zeros(complex(T), basis.fft_size...)) for _ = 1:Threads.nthreads()], )
+    (ψ_reals=[zeros_like(basis.G_vectors, complex(T), basis.fft_size...) for _ = 1:Threads.nthreads()], )
 end
 
 Base.:*(H::HamiltonianBlock, ψ) = mul!(similar(ψ), H, ψ)
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 382e203f97..a20793772f 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -42,7 +42,7 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     # of calling back the array on CPU, running force_real!, then putting it back on GPU
     poisson_green_coeffs = Array(poisson_green_coeffs)
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
-    poisson_green_coeffs = convert(array_type(basis), poisson_green_coeffs)
+    poisson_green_coeffs = copy_like(basis.G_vectors,poisson_green_coeffs)
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index e7f0a70c05..3416015c58 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -175,7 +175,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     end
     @assert offset == n_proj
     # GPU computation only : build the vectors on CPU then offload them to the GPU
-    convert(array_type(basis), proj_vectors)
+    copy_like(basis.G_vectors, proj_vectors)
 end
 
 """

From 7afc5fd293b852c9f63a5db0b2de0838045b549c Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 2 Nov 2022 11:22:21 +0100
Subject: [PATCH 41/69] WIP: copy_like also takes Type arguments

---
 src/PlaneWaveBasis.jl       | 26 +++++++++++++-------------
 src/common/zeros_like.jl    |  4 ++++
 src/postprocess/stresses.jl |  2 +-
 src/scf/mixing.jl           |  2 +-
 src/supercell.jl            |  2 +-
 src/symmetry.jl             |  2 +-
 src/terms/local.jl          |  2 +-
 src/terms/nonlocal.jl       |  4 ++--
 8 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index f6c4441d2c..9e6c332e20 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -122,7 +122,8 @@ end
 Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
 @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut;
-                               variational=true, array_type::Type=Array) where {T}
+                               variational=true,
+                               array_type::Union{Type,AbstractArray} = Array) where {T}
     kpoints_per_spin = [Kpoint[] for _ in 1:model.n_spin_components]
     for k in kcoords
         k = Vec3{T}(k)  # rationals are sloooow
@@ -138,13 +139,12 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
                 push!(Gvecs_k, G)
             end
         end
-        Gvecs_k = convert(array_type, Gvecs_k)  # GPU computation only: offload the Gs to the GPU
-        GT = array_type{Vec3{Int}}
+        Gvecs_k = copy_like(array_type, Gvecs_k)  # GPU computation only: offload the Gs to the GPU
 
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
             push!(kpoints_per_spin[iσ],
-                  Kpoint{T,GT}(iσ, k, mapping, mapping_inv, Gvecs_k))
+                  Kpoint{T,typeof(Gvecs_k)}(iσ, k, mapping, mapping_inv, Gvecs_k))
         end
     end
 
@@ -152,7 +152,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 end
 function build_kpoints(basis::PlaneWaveBasis, kcoords)
     build_kpoints(basis.model, basis.fft_size, kcoords, basis.Ecut;
-                  variational=basis.variational, array_type = array_type(basis))
+                  variational=basis.variational, array_type = basis.G_vectors)
 end
 
 # Lowest-level constructor, should not be called directly.
@@ -160,7 +160,8 @@ end
 # and are stored in PlaneWaveBasis for easy reconstruction.
 function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         kcoords, kweights, kgrid, kshift,
-                        symmetries_respect_rgrid, comm_kpts, array_type::Type) where {T <: Real}
+                        symmetries_respect_rgrid, comm_kpts,
+                        array_type::Union{Type,AbstractArray} = Array) where {T <: Real}
     # Validate fft_size
     if variational
         max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
@@ -262,11 +263,10 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     VT = value_type(T)
     dvol  = model.unit_cell_volume ./ prod(fft_size)
     r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3]
+    r_vectors = copy_like(array_type, r_vectors)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    RT = array_type{Vec3{VT}, 3}
-    GT = array_type{Vec3{Int}, 3}
-    basis = PlaneWaveBasis{T,value_type(T), GT, RT}(
+    basis = PlaneWaveBasis{T,value_type(T), typeof(Gs), typeof(r_vectors)}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
@@ -338,7 +338,7 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints
     PlaneWaveBasis(basis.model, basis.Ecut,
                    basis.fft_size, basis.variational,
                    kcoords, kweights, kgrid, kshift,
-                   basis.symmetries_respect_rgrid, basis.comm_kpts, array_type(basis))
+                   basis.symmetries_respect_rgrid, basis.comm_kpts, basis.G_vectors)
 end
 
 """
@@ -348,14 +348,14 @@ The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
 """
 
-function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type = Array)
+function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type::Union{Type,AbstractArray} = Array)
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
     start = .- cld.(fft_size .- 1, 2)
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
     Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
-    convert(array_type, Gs)  # GPU computation only : offload the Gs to the GPU.
+    copy_like(array_type, Gs)  # GPU computation only : offload the Gs to the GPU.
 end
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and
@@ -509,7 +509,7 @@ function gather_kpts(basis)
                        basis.kshift,
                        basis.symmetries_respect_rgrid,
                        comm_kpts=MPI.COMM_SELF,
-                       array_type=DFTK.array_type(basis))
+                       array_type=basis.G_vectors)
     end
 end
 
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index 204cc82a98..0cc9517578 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -15,6 +15,10 @@ end
 
 copy_like(array_model::Array, src::Array) = src
 
+function copy_like(array_model::Type, src::AbstractArray)
+    convert(array_model, src)
+end
+
 # function copy_like(array_model::Array, src::Array, T::Type=eltype(src), dims::Integer...=size(src)...)
 #     T == eltype(src) && dims == size(src) && src
 #     copy_like(array_model, src, T, dims...)
diff --git a/src/postprocess/stresses.jl b/src/postprocess/stresses.jl
index 583f038d28..4190d00290 100644
--- a/src/postprocess/stresses.jl
+++ b/src/postprocess/stresses.jl
@@ -19,7 +19,7 @@ Compute the stresses (= 1/Vol dE/d(M*lattice), taken at M=I) of an obtained SCF
                                    basis.Ecut, basis.fft_size, basis.variational,
                                    basis.kcoords_global, basis.kweights_global,
                                    basis.kgrid, basis.kshift, basis.symmetries_respect_rgrid,
-                                   basis.comm_kpts, array_type(basis))
+                                   basis.comm_kpts, basis.G_vectors)
         ρ = DFTK.compute_density(new_basis, scfres.ψ, scfres.occupation)
         energies, _ = energy_hamiltonian(new_basis, scfres.ψ, scfres.occupation;
                                          ρ, scfres.eigenvalues, scfres.εF)
diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index 1dc6da2028..c80700f757 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -79,7 +79,7 @@ end
     # of calling back the array on CPU, running force_real!, then putting it back on GPU
     δρtot_fourier = Array(δρtot_fourier)
     enforce_real!(basis, δρtot_fourier)
-    δρtot_fourier = convert(array_type(basis), δρtot_fourier)
+    δρtot_fourier = copy_like(basis.G_vectors, δρtot_fourier)
 
     δρtot = irfft(basis, δρtot_fourier)
 
diff --git a/src/supercell.jl b/src/supercell.jl
index 906e85983e..c201c64a31 100644
--- a/src/supercell.jl
+++ b/src/supercell.jl
@@ -43,7 +43,7 @@ function cell_to_supercell(basis::PlaneWaveBasis)
                    ones(3),              # kgrid = Γ point only
                    basis.kshift,         # kshift
                    symmetries_respect_rgrid,
-                   basis.comm_kpts, array_type(basis))
+                   basis.comm_kpts, basis.G_vectors)
 end
 
 @doc raw"""
diff --git a/src/symmetry.jl b/src/symmetry.jl
index 6b4557b79b..fef9a7422c 100644
--- a/src/symmetry.jl
+++ b/src/symmetry.jl
@@ -269,7 +269,7 @@ function unfold_bz(basis::PlaneWaveBasis)
                               basis.Ecut, basis.fft_size, basis.variational,
                               kcoords, [1/length(kcoords) for _ in kcoords],
                               basis.kgrid, basis.kshift,
-                              basis.symmetries_respect_rgrid, basis.comm_kpts, array_type(basis))
+                              basis.symmetries_respect_rgrid, basis.comm_kpts, basis.G_vectors)
     end
 end
 
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 0aec828069..bd9b7a7114 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -104,7 +104,7 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     end
     enforce_real!(basis, pot_fourier)  # Symmetrize Fourier coeffs to have real iFFT
     # GPU computation only : build the potential values on CPU then offload them to GPU
-    pot_real = irfft(basis, convert(array_type(basis), pot_fourier))
+    pot_real = irfft(basis, copy_like(basis.G_vectors, pot_fourier))
 
     TermAtomicLocal(pot_real)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 3416015c58..b924604858 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -15,7 +15,7 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
     isempty(psp_groups) && return TermNoop()
     ops = map(basis.kpoints) do kpt
         P = build_projection_vectors_(basis, kpt, psps, psp_positions)
-        D = build_projection_coefficients_(T, psps, psp_positions, array_type = array_type(basis))
+        D = build_projection_coefficients_(T, psps, psp_positions, array_type = basis.G_vectors)
         NonlocalOperator(basis, kpt, P, D)
     end
     TermAtomicNonlocal(ops)
@@ -109,7 +109,7 @@ function build_projection_coefficients_(T, psps, psp_positions; array_type = Arr
     @assert count == n_proj
 
     # GPU computation only : build the coefficients on CPU then offload them to the GPU
-    convert(array_type, proj_coeffs)
+    copy_like(array_type, proj_coeffs)
 end
 
 # Builds the projection coefficient matrix for a single atom

From 2563c58ec83ec3f8404fa9533e32896dca5cbcd6 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 2 Nov 2022 13:54:37 +0100
Subject: [PATCH 42/69] Remove AT parameter from PWB

---
 src/PlaneWaveBasis.jl    |  9 +++------
 src/common/ortho.jl      | 19 ++++++++-----------
 src/common/zeros_like.jl | 10 +++++-----
 src/external/jld2io.jl   | 21 ++++++++++++---------
 src/occupation.jl        |  2 +-
 src/orbitals.jl          |  4 ++--
 src/scf/mixing.jl        |  2 +-
 src/terms/hartree.jl     |  2 +-
 src/terms/local.jl       |  2 +-
 src/terms/nonlocal.jl    |  4 ++--
 10 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 9e6c332e20..8dab22652b 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -115,9 +115,6 @@ Base.Broadcast.broadcastable(basis::PlaneWaveBasis) = Ref(basis)
 Return the type of array used for computations (Array if on CPU, CuArray,
 ROCArray... if on GPU).
 """
-function array_type(basis::PlaneWaveBasis)
-    Base.typename(typeof(basis.G_vectors)).wrapper
-end
 
 Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
@@ -139,7 +136,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
                 push!(Gvecs_k, G)
             end
         end
-        Gvecs_k = copy_like(array_type, Gvecs_k)  # GPU computation only: offload the Gs to the GPU
+        Gvecs_k = convert_like(array_type, Gvecs_k)  # GPU computation only: offload the Gs to the GPU
 
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
@@ -263,7 +260,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     VT = value_type(T)
     dvol  = model.unit_cell_volume ./ prod(fft_size)
     r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3]
-    r_vectors = copy_like(array_type, r_vectors)
+    r_vectors = convert_like(array_type, r_vectors)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
     basis = PlaneWaveBasis{T,value_type(T), typeof(Gs), typeof(r_vectors)}(
@@ -355,7 +352,7 @@ function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type::Union{Type
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
     Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
-    copy_like(array_type, Gs)  # GPU computation only : offload the Gs to the GPU.
+    convert_like(array_type, Gs)  # GPU computation only : offload the Gs to the GPU.
 end
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index 3d2548fbeb..ce6a35db56 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,11 +1,8 @@
-# Orthonormalize and convert to an array of the type "array_type".
-@timing ortho_qr(φk::AbstractArray; array_type = Matrix) = array_type(qr(φk).Q)
-
-# @timing function ortho_qr(φk::AbstractArray; array_type = Matrix)
-#     Q = qr(φk).Q
-#     println(typeof(φk))
-#     print(size(Array(Q)))
-#     res = convert(typeof(φk), Q)
-#     print(size(res))
-#     res
-# end
+# Orthonormalize
+@timing function ortho_qr(φk::AbstractArray)
+    Q = qr(φk).Q
+    Q = convert(typeof(φk), Q)
+    # CUDA bug: when φk is an m×n rectangular matrix with m > n, the converted Q is
+    # not cropped, so keep only its first size(φk, 2) columns.
+    Q[:, 1:size(φk, 2)]
+end
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index 0cc9517578..2060025d15 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -9,17 +9,17 @@ zeros_like(X::AbstractArray, dims::Integer...) = zeros_like(X, eltype(X), dims..
 zeros_like(X::Array, T::Type=eltype(X), dims::Integer...=size(X)...) = zeros(T, dims...)
 zeros_like(X::StaticArray, T::Type=eltype(X), dims::Integer...=size(X)...) = @SArray zeros(T, dims...)
 
-function copy_like(array_model::AbstractArray, src::AbstractArray)
+function convert_like(array_model::AbstractArray, src::AbstractArray)
     copy!(similar(array_model, eltype(src), size(src)...), src)
 end
 
-copy_like(array_model::Array, src::Array) = src
+convert_like(array_model::Array, src::Array) = src
 
-function copy_like(array_model::Type, src::AbstractArray)
+function convert_like(array_model::Type, src::AbstractArray)
     convert(array_model, src)
 end
 
-# function copy_like(array_model::Array, src::Array, T::Type=eltype(src), dims::Integer...=size(src)...)
+# function convert_like(array_model::Array, src::Array, T::Type=eltype(src), dims::Integer...=size(src)...)
 #     T == eltype(src) && dims == size(src) && src
-#     copy_like(array_model, src, T, dims...)
+#     convert_like(array_model, src, T, dims...)
 # end
diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl
index 7b1e1ff2d6..fe5a242119 100644
--- a/src/external/jld2io.jl
+++ b/src/external/jld2io.jl
@@ -74,7 +74,7 @@ load_scfres(file::AbstractString) = JLD2.jldopen(load_scfres, file, "r")
 #
 # Custom serialisations
 #
-struct PlaneWaveBasisSerialisation{T <: Real, AT <: AbstractArray}
+struct PlaneWaveBasisSerialisation{T <: Real, GT <: AbstractArray}
     model::Model{T,T}
     Ecut::T
     variational::Bool
@@ -85,13 +85,13 @@ struct PlaneWaveBasisSerialisation{T <: Real, AT <: AbstractArray}
     symmetries_respect_rgrid::Bool
     fft_size::Tuple{Int, Int, Int}
 end
-function JLD2.writeas(::Type{PlaneWaveBasis{T,T,AT,GT,RT}}) where {T,AT,GT,RT}
-    PlaneWaveBasisSerialisation{T,AT}
+function JLD2.writeas(::Type{PlaneWaveBasis{T,T,GT,RT}}) where {T,GT,RT}
+    PlaneWaveBasisSerialisation{T,GT}
 end
 
-function Base.convert(::Type{PlaneWaveBasisSerialisation{T,AT}},
-                      basis::PlaneWaveBasis{T,T,AT}) where {T,AT}
-    PlaneWaveBasisSerialisation{T,AT}(
+function Base.convert(::Type{PlaneWaveBasisSerialisation{T,GT}},
+                      basis::PlaneWaveBasis{T,T,GT}) where {T,GT}
+    PlaneWaveBasisSerialisation{T,GT}(
         basis.model,
         basis.Ecut,
         basis.variational,
@@ -104,13 +104,16 @@ function Base.convert(::Type{PlaneWaveBasisSerialisation{T,AT}},
     )
 end
 
-function Base.convert(::Type{PlaneWaveBasis{T,T,AT,GT,RT}},
-                      serial::PlaneWaveBasisSerialisation{T,AT}) where {T,AT,GT,RT}
+function Base.convert(::Type{PlaneWaveBasis{T,T,GT,RT}},
+                      serial::PlaneWaveBasisSerialisation{T,GT}) where {T,GT,RT}
     PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords, serial.kweights;
                    serial.fft_size,
                    serial.kgrid,
                    serial.kshift,
                    serial.symmetries_respect_rgrid,
                    serial.variational,
-                   array_type=AT)
+                   array_type=similar(GT, 1, 1, 1))
+                   # GT cannot be passed directly: it is Array{type, 3} rather than the
+                   # bare Array, so we build a dummy array of type GT. GT is the type of
+                   # G_vectors, i.e. of a 3-dimensional array, hence the three 1's.
 end
diff --git a/src/occupation.jl b/src/occupation.jl
index bd0fd91ddb..f60915b295 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -29,7 +29,7 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [copy_like(basis.G_vectors,  # GPU computation only: put each occupation on GPU
+    [convert_like(basis.G_vectors,  # GPU computation only: put each occupation on GPU
         filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature))
         for εk in eigenvalues]
 end
diff --git a/src/orbitals.jl b/src/orbitals.jl
index cf4be47b58..cca1099b42 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -57,10 +57,10 @@ function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {
     @static if VERSION < v"1.7"
         # Don't use TaskLocalRNG as it is not available.
         orbitals = randn(Complex{T}, length(G_vectors(basis, kpt)), howmany)
-        orbitals = copy_like(basis.G_vectors, orbitals)
+        orbitals = convert_like(basis.G_vectors, orbitals)
     else
         orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
         randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function if using the GPU
     end
-    ortho_qr(orbitals; array_type = array_type(basis))
+    ortho_qr(orbitals)
 end
diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index c80700f757..92fd5a2601 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -79,7 +79,7 @@ end
     # of calling back the array on CPU, running force_real!, then putting it back on GPU
     δρtot_fourier = Array(δρtot_fourier)
     enforce_real!(basis, δρtot_fourier)
-    δρtot_fourier = copy_like(basis.G_vectors, δρtot_fourier)
+    δρtot_fourier = convert_like(basis.G_vectors, δρtot_fourier)
 
     δρtot = irfft(basis, δρtot_fourier)
 
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index a20793772f..56f8e79142 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -42,7 +42,7 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     # of calling back the array on CPU, running force_real!, then putting it back on GPU
     poisson_green_coeffs = Array(poisson_green_coeffs)
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
-    poisson_green_coeffs = copy_like(basis.G_vectors,poisson_green_coeffs)
+    poisson_green_coeffs = convert_like(basis.G_vectors,poisson_green_coeffs)
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/local.jl b/src/terms/local.jl
index bd9b7a7114..bf50c197dc 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -104,7 +104,7 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     end
     enforce_real!(basis, pot_fourier)  # Symmetrize Fourier coeffs to have real iFFT
     # GPU computation only : build the potential values on CPU then offload them to GPU
-    pot_real = irfft(basis, copy_like(basis.G_vectors, pot_fourier))
+    pot_real = irfft(basis, convert_like(basis.G_vectors, pot_fourier))
 
     TermAtomicLocal(pot_real)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index b924604858..b7c1d36923 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -109,7 +109,7 @@ function build_projection_coefficients_(T, psps, psp_positions; array_type = Arr
     @assert count == n_proj
 
     # GPU computation only : build the coefficients on CPU then offload them to the GPU
-    copy_like(array_type, proj_coeffs)
+    convert_like(array_type, proj_coeffs)
 end
 
 # Builds the projection coefficient matrix for a single atom
@@ -175,7 +175,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     end
     @assert offset == n_proj
     # GPU computation only : build the vectors on CPU then offload them to the GPU
-    copy_like(basis.G_vectors, proj_vectors)
+    convert_like(basis.G_vectors, proj_vectors)
 end
 
 """

From d9d9da1d20a5460a0679af4b9e59e71f465a3fab Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 3 Nov 2022 14:23:39 +0100
Subject: [PATCH 43/69] Add Kpoint's array type as a parametric type in PWB

---
 src/PlaneWaveBasis.jl | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 8dab22652b..282f95fba9 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -40,7 +40,8 @@ Normalization conventions:
 
 `ifft` and `fft` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT <: AbstractArray, RT <: AbstractArray}
+struct PlaneWaveBasis{T, VT, GT, RT, KGT} <: AbstractBasis{T} where {VT <: Real, GT <: AbstractArray,
+                                                                    RT <: AbstractArray, KGT <: AbstractArray}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 
@@ -74,7 +75,7 @@ struct PlaneWaveBasis{T, VT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT <
     ## MPI-local information of the kpoints this processor treats
     # Irreducible kpoints. In the case of collinear spin,
     # this lists all the spin up, then all the spin down
-    kpoints::Vector{Kpoint{T}}
+    kpoints::Vector{Kpoint{T, KGT}}
     # BZ integration weights, summing up to model.n_spin_components
     kweights::Vector{T}
 
@@ -152,6 +153,8 @@ function build_kpoints(basis::PlaneWaveBasis, kcoords)
                   variational=basis.variational, array_type = basis.G_vectors)
 end
 
+kpt_array_type(A::Kpoint{T, GT}) where {T,GT} = GT
+
 # Lowest-level constructor, should not be called directly.
 # All given parameters must be the same on all processors
 # and are stored in PlaneWaveBasis for easy reconstruction.
@@ -263,7 +266,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     r_vectors = convert_like(array_type, r_vectors)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    basis = PlaneWaveBasis{T,value_type(T), typeof(Gs), typeof(r_vectors)}(
+    basis = PlaneWaveBasis{T,value_type(T), typeof(Gs), typeof(r_vectors), kpt_array_type(kpoints[1])}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,

From 0a69e2c3b298db18ecf088128cc8256f98b8dfaa Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 3 Nov 2022 15:08:29 +0100
Subject: [PATCH 44/69] Fix serialization test

---
 src/external/jld2io.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl
index fe5a242119..b1a4d4bad9 100644
--- a/src/external/jld2io.jl
+++ b/src/external/jld2io.jl
@@ -85,7 +85,7 @@ struct PlaneWaveBasisSerialisation{T <: Real, GT <: AbstractArray}
     symmetries_respect_rgrid::Bool
     fft_size::Tuple{Int, Int, Int}
 end
-function JLD2.writeas(::Type{PlaneWaveBasis{T,T,GT,RT}}) where {T,GT,RT}
+function JLD2.writeas(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}}) where {T,GT,RT,KGT}
     PlaneWaveBasisSerialisation{T,GT}
 end
 
@@ -104,8 +104,8 @@ function Base.convert(::Type{PlaneWaveBasisSerialisation{T,GT}},
     )
 end
 
-function Base.convert(::Type{PlaneWaveBasis{T,T,GT,RT}},
-                      serial::PlaneWaveBasisSerialisation{T,GT}) where {T,GT,RT}
+function Base.convert(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}},
+                      serial::PlaneWaveBasisSerialisation{T,GT}) where {T,GT,RT,KGT}
     PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords, serial.kweights;
                    serial.fft_size,
                    serial.kgrid,

From ecfd306d01237b95e5e846550bd24e60ff76a856 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Sat, 5 Nov 2022 20:25:14 +0100
Subject: [PATCH 45/69] Fix example

---
 examples/gpu.jl | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/examples/gpu.jl b/examples/gpu.jl
index 8d1ff8fa37..2819b075f0 100644
--- a/examples/gpu.jl
+++ b/examples/gpu.jl
@@ -1,27 +1,24 @@
 using DFTK
 using CUDA
 
-a = 10.263141334305942  # Lattice constant in Bohr
-lattice = a / 2 .* [[0 1 1.]; [1 0 1.]; [1 1 0.]]
+a = 10.26  # Silicon lattice constant in Bohr
+lattice = a / 2 * [[0 1 1.];
+                   [1 0 1.];
+                   [1 1 0.]]
 Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
 atoms     = [Si, Si]
 positions = [ones(3)/8, -ones(3)/8]
-terms = [Kinetic(),
-            AtomicLocal(),
-            AtomicNonlocal(),
-            Ewald(),
-            PspCorrection(),
-            Entropy(),
-            Hartree()]
-# Now, build a supercell to have a larger system
-supercell = ase_atoms(lattice, atoms, positions) * (repeat, 1, 1)
-lattice   = load_lattice(supercell)
-positions = load_positions(supercell)
-atoms     = fill(Si, length(positions))
+model = model_DFT(lattice, atoms, positions, []; temperature=1e-3)
 
-model = Model(lattice, atoms, positions; terms=terms, temperature=1e-3, symmetries=false)
-# Notice the only difference in the code, with the optional argument array_type
-basis_gpu = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), array_type = CuArray)
-# You can now check that some of the fields of the basis, such as the G_vectors, are CuArrays
+if has_cuda()
+    # Use CUDA to store DFT quantities and perform main computations
+    # For this we set the array_type for storing DFT quantities to a GPU array type
+    array_type = CuArray
+else
+    array_type = Array  # Keep using the CPU
+end
 
-scfres = self_consistent_field(basis_gpu; tol=1e-3, solver=scf_anderson_solver(), mixing = KerkerMixing())
+basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), array_type)
+scfres = self_consistent_field(basis; tol=1e-3,
+                               solver=scf_anderson_solver(),
+                               mixing=KerkerMixing())

From 4c459773a3545e64263e0652007920ae6b52eab3 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Tue, 8 Nov 2022 10:00:20 +0100
Subject: [PATCH 46/69] Simplify coordinate transformation routines

---
 src/Model.jl          | 72 +++++++++++++++++--------------------------
 src/PlaneWaveBasis.jl |  8 +++--
 2 files changed, 33 insertions(+), 47 deletions(-)

diff --git a/src/Model.jl b/src/Model.jl
index 9b44920a15..5b89159318 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -280,47 +280,24 @@ Reciprocal vectors are a special case: they are covectors, but conventionally ha
 additional factor of 2π in their definition, so they transform rather with 2π times the
 inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red.
 
-The trans_mat functions return the transition matrices required to do such a change of basis.
+For each function there is a one-argument version (returning a function to do the
+transformation) and a two-argument version applying the transformation to a passed vector.
 =#
-
-trans_mat_vector_red_to_cart(model::Model)       = model.lattice
-trans_mat_vector_cart_to_red(model::Model)       = model.inv_lattice
-trans_mat_covector_red_to_cart(model::Model)     = model.inv_lattice'
-trans_mat_covector_cart_to_red(model::Model)     = model.lattice'
-trans_mat_recip_vector_red_to_cart(model::Model) = model.recip_lattice
-trans_mat_recip_vector_cart_to_red(model::Model) = model.inv_recip_lattice
-
-fun_mat_list =(:vector_red_to_cart,
-                :vector_cart_to_red,
-                :covector_red_to_cart,
-                :covector_cart_to_red,
-                :recip_vector_red_to_cart,
-                :recip_vector_cart_to_red
-)
-
-for fun1 in fun_mat_list
-    #=
-    The following functions compute the change of basis for a given vector. To do so,
-    they call the trans_mat functions to get the corresponding transition matrix.
-    These functions can be broadcasted over an Array of vectors: however, they are
-    not GPU compatible, as they require the model, which is not isbits.
-    =#
-    @eval $fun1(model::Model, vec) = $(Symbol("trans_mat_"*string(fun1)))(model::Model) * vec
-    #=
-    The following functions take an AbstractArray of vectors and compute the change of basis
-    for every vector in the AbstractArray: they return an AbstractArray of the same type
-    and size as the input, but containing the vectors in a new basis.
-    These functions are GPU compatible (ie the AbstractArray can be a GPUArray), since
-    they use a map and the transition matrices are static arrays.
-    =#
-    @eval function $(Symbol("map_"*string(fun1)))(model::Model, A::AbstractArray{AT}) where {AT <: Vec3}
-        trans_matrix = $(Symbol("trans_mat_"*string(fun1)))(model)
-        in_new_basis = map(A) do Ai
-            trans_matrix  * Ai
-        end
-        in_new_basis
-    end
-end
+_closure_matmul(mat) = vec -> mat * vec
+
+vector_red_to_cart(model::Model)       = _closure_matmul(model.lattice)
+vector_cart_to_red(model::Model)       = _closure_matmul(model.inv_lattice)
+covector_red_to_cart(model::Model)     = _closure_matmul(model.inv_lattice')
+covector_cart_to_red(model::Model)     = _closure_matmul(model.lattice')
+recip_vector_red_to_cart(model::Model) = _closure_matmul(model.recip_lattice)
+recip_vector_cart_to_red(model::Model) = _closure_matmul(model.inv_recip_lattice)
+
+vector_red_to_cart(model::Model, vec)       = vector_red_to_cart(model)(vec)
+vector_cart_to_red(model::Model, vec)       = vector_cart_to_red(model)(vec)
+covector_red_to_cart(model::Model, vec)     = covector_red_to_cart(model)(vec)
+covector_cart_to_red(model::Model, vec)     = covector_cart_to_red(model)(vec)
+recip_vector_red_to_cart(model::Model, vec) = recip_vector_red_to_cart(model)(vec)
+recip_vector_cart_to_red(model::Model, vec) = recip_vector_cart_to_red(model)(vec)
 
 #=
 Transformations on vectors and covectors are matrices and comatrices.
@@ -335,7 +312,14 @@ s_cart = L s_red = L A_red r_red = L A_red L⁻¹ r_cart, thus A_cart = L A_red
 Examples of matrices are the symmetries in real space (W)
 Examples of comatrices are the symmetries in reciprocal space (S)
 =#
-matrix_red_to_cart(model::Model, Ared)    = model.lattice * Ared * model.inv_lattice
-matrix_cart_to_red(model::Model, Acart)   = model.inv_lattice * Acart * model.lattice
-comatrix_red_to_cart(model::Model, Bred)  = model.inv_lattice' * Bred * model.lattice'
-comatrix_cart_to_red(model::Model, Bcart) = model.lattice' * Bcart * model.inv_lattice'
+_closure_matmatmul(M, Minv) = mat -> M * mat * Minv
+
+matrix_red_to_cart(model::Model)   = _closure_matmatmul(model.lattice,      model.inv_lattice)
+matrix_cart_to_red(model::Model)   = _closure_matmatmul(model.inv_lattice,  model.lattice)
+comatrix_red_to_cart(model::Model) = _closure_matmatmul(model.inv_lattice', model.lattice')
+comatrix_cart_to_red(model::Model) = _closure_matmatmul(model.lattice',     model.inv_lattice')
+
+matrix_red_to_cart(model::Model, Ared)    = matrix_red_to_cart(model)(Ared)
+matrix_cart_to_red(model::Model, Acart)   = matrix_cart_to_red(model)(Acart)
+comatrix_red_to_cart(model::Model, Bred)  = comatrix_red_to_cart(model)(Bred)
+comatrix_cart_to_red(model::Model, Bcart) = comatrix_cart_to_red(model)(Bcart)
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 282f95fba9..b52c9b8c51 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -384,7 +384,9 @@ G_vectors(::PlaneWaveBasis, kpt::Kpoint) = kpt.G_vectors
 
 The list of ``G`` vectors of a given `basis` or `kpt`, in cartesian coordinates.
 """
-G_vectors_cart(basis::PlaneWaveBasis) = map_recip_vector_red_to_cart(basis.model, G_vectors(basis))
+function G_vectors_cart(basis::PlaneWaveBasis)
+    map(recip_vector_red_to_cart(basis.model), G_vectors(basis))
+end
 function G_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
     recip_vector_red_to_cart.(basis.model, G_vectors(basis, kpt))
 end
@@ -406,7 +408,7 @@ end
 The list of ``G + k`` vectors, in cartesian coordinates.
 """
 function Gplusk_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
-    map_recip_vector_red_to_cart(basis.model, Gplusk_vectors(basis, kpt))
+    map(recip_vector_red_to_cart(basis.model), Gplusk_vectors(basis, kpt))
 end
 
 @doc raw"""
@@ -421,7 +423,7 @@ r_vectors(basis::PlaneWaveBasis) = basis.r_vectors
 
 The list of ``r`` vectors, in cartesian coordinates.
 """
-r_vectors_cart(basis::PlaneWaveBasis) = map_vector_red_to_cart(basis.model, r_vectors(basis))
+r_vectors_cart(basis::PlaneWaveBasis) = map(vector_red_to_cart(basis.model), r_vectors(basis))
 
 
 """

From 5e4d37a7ba97ce842b51f10dae9db0a14653deb9 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Tue, 8 Nov 2022 17:48:34 +0100
Subject: [PATCH 47/69] More refactoring ... barely tested, might fail

---
 src/PlaneWaveBasis.jl                | 57 +++++++++++++---------------
 src/common/ortho.jl                  |  5 +--
 src/common/zeros_like.jl             | 13 +------
 src/densities.jl                     |  2 +-
 src/eigen/diag.jl                    |  4 +-
 src/external/jld2io.jl               |  1 -
 src/fft.jl                           | 17 ++-------
 src/guess_density.jl                 | 10 ++---
 src/interpolation.jl                 |  2 +-
 src/occupation.jl                    |  2 +-
 src/orbitals.jl                      |  5 +--
 src/scf/mixing.jl                    | 15 ++------
 src/scf/nbands_algorithm.jl          |  3 +-
 src/symmetry.jl                      | 11 +++++-
 src/terms/hartree.jl                 | 14 +++----
 src/terms/kinetic.jl                 |  6 +--
 src/terms/local.jl                   | 13 ++++---
 src/terms/nonlocal.jl                | 19 +++++-----
 src/workarounds/fft_generic.jl       |  6 +--
 src/workarounds/forwarddiff_rules.jl |  4 +-
 20 files changed, 89 insertions(+), 120 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index b52c9b8c51..9b9ff2a86f 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -17,15 +17,14 @@ eg collinear spin is treated by doubling the number of kpoints.
 """
 struct Kpoint{T <: Real, GT <: AbstractArray}
     spin::Int                     # Spin component can be 1 or 2 as index into what is
-                                  # returned by the `spin_components` function
+    #                             # returned by the `spin_components` function
     coordinate::Vec3{T}           # Fractional coordinate of k-point
     mapping::Vector{Int}          # Index of G_vectors[i] on the FFT grid:
-                                  # G_vectors(basis)[kpt.mapping[i]] == G_vectors(basis, kpt)[i]
+    #                             # G_vectors(basis)[kpt.mapping[i]] == G_vectors(basis, kpt)[i]
     mapping_inv::Dict{Int, Int}   # Inverse of `mapping`:
-                                  # G_vectors(basis)[i] == G_vectors(basis, kpt)[mapping_inv[i]]
-    G_vectors::GT                 # Wave vectors in integer coordinates:
-                                  # ({G, 1/2 |k+G|^2 ≤ Ecut})
-                                  # The G_vectors are a 1D array of Vec3 of Ints
+    #                             # G_vectors(basis)[i] == G_vectors(basis, kpt)[mapping_inv[i]]
+    G_vectors::GT                 # Wave vectors in integer coordinates (vector of Vec3{Int})
+    #                             # ({G, 1/2 |k+G|^2 ≤ Ecut})
 end
 
 @doc raw"""
@@ -40,8 +39,9 @@ Normalization conventions:
 
 `ifft` and `fft` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT, GT, RT, KGT} <: AbstractBasis{T} where {VT <: Real, GT <: AbstractArray,
-                                                                    RT <: AbstractArray, KGT <: AbstractArray}
+struct PlaneWaveBasis{T, VT, GT, RT, KGT} <: AbstractBasis{
+    T
+} where {VT <: Real, GT <: AbstractArray, RT <: AbstractArray, KGT <: AbstractArray}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 
@@ -112,11 +112,6 @@ end
 import Base.Broadcast.broadcastable
 Base.Broadcast.broadcastable(basis::PlaneWaveBasis) = Ref(basis)
 
-"""
-Return the type of array used for computations (Array if on CPU, CuArray,
-ROCArray... if on GPU).
-"""
-
 Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
 @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut;
@@ -137,7 +132,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
                 push!(Gvecs_k, G)
             end
         end
-        Gvecs_k = convert_like(array_type, Gvecs_k)  # GPU computation only: offload the Gs to the GPU
+        Gvecs_k = convert_like(array_type, Gvecs_k)
 
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
@@ -153,8 +148,6 @@ function build_kpoints(basis::PlaneWaveBasis, kcoords)
                   variational=basis.variational, array_type = basis.G_vectors)
 end
 
-kpt_array_type(A::Kpoint{T, GT}) where {T,GT} = GT
-
 # Lowest-level constructor, should not be called directly.
 # All given parameters must be the same on all processors
 # and are stored in PlaneWaveBasis for easy reconstruction.
@@ -204,8 +197,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    Gs = G_vectors(fft_size, array_type)
-    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(similar(Gs,T), fft_size)
+    Gs = G_vectors(array_type, fft_size)
+    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans!(similar(Gs, Complex{T}, fft_size))
 
     # Normalization constants
     # fft = fft_normalization * FFT
@@ -249,7 +242,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
         "Non-variational calculations are experimental. " *
         "Not all features of DFTK may be supported or work as intended."
     )
-    kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut; variational, array_type)
+    kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut;
+                            variational, array_type)
     # kpoints is now possibly twice the size of krange. Make things consistent
     if model.n_spin_components == 2
         krange_thisproc   = vcat(krange_thisproc, n_kpt .+ krange_thisproc)
@@ -258,15 +252,20 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     end
     @assert mpi_sum(sum(kweights_thisproc), comm_kpts) ≈ model.n_spin_components
     @assert length(kpoints) == length(kweights_thisproc)
-    Threads.nthreads() != 1 && Gs isa AbstractGPUArray && error("Can't mix multi-threading and GPU computations yet.")
+
+    if Gs isa AbstractGPUArray && Threads.nthreads() > 1
+        error("Can't mix multi-threading and GPU computations yet.")
+    end
 
     VT = value_type(T)
     dvol  = model.unit_cell_volume ./ prod(fft_size)
-    r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3]
+    r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3)
+                 for i = 1:N1, j = 1:N2, k = 1:N3]
     r_vectors = convert_like(array_type, r_vectors)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    basis = PlaneWaveBasis{T,value_type(T), typeof(Gs), typeof(r_vectors), kpt_array_type(kpoints[1])}(
+    basis = PlaneWaveBasis{T,value_type(T),typeof(Gs),typeof(r_vectors),
+                           typeof(G_vectors(kpoints[1]))}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
@@ -347,8 +346,7 @@ end
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
 """
-
-function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type::Union{Type,AbstractArray} = Array)
+function G_vectors(array_type::Union{Type,AbstractArray}, fft_size::Union{Tuple,AbstractVector})
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
     start = .- cld.(fft_size .- 1, 2)
@@ -397,9 +395,8 @@ end
 The list of ``G + k`` vectors, in reduced coordinates.
 """
 function Gplusk_vectors(basis::PlaneWaveBasis, kpt::Kpoint)
-    coordinate = kpt.coordinate
-    Gs = G_vectors(basis,kpt)
-    map(G -> G + coordinate, Gs)
+    coordinate = kpt.coordinate  # Avoid closure on kpt (not isbits)
+    map(G -> G + coordinate, G_vectors(basis, kpt))
 end
 
 @doc raw"""
@@ -431,7 +428,7 @@ Return the index tuple `I` such that `G_vectors(basis)[I] == G`
 or the index `i` such that `G_vectors(basis, kpoint)[i] == G`.
 Returns nothing if outside the range of valid wave vectors.
 """
-@inline function index_G_vectors(fft_size::Tuple, G::AbstractVector{T}) where {T <: Integer}
+@inline function index_G_vectors(fft_size::Tuple, G::AbstractVector{<:Integer})
     start = .- cld.(fft_size .- 1, 2)
     stop  = fld.(fft_size .- 1, 2)
     lengths = stop .- start .+ 1
@@ -448,7 +445,7 @@ Returns nothing if outside the range of valid wave vectors.
     end
 end
 
-@inline function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{T}) where {T <: Integer}
+@inline function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{<:Integer})
     index_G_vectors(basis.fft_size, G)
 end
 
@@ -486,7 +483,7 @@ it as a `PlaneWaveBasis`. On the other (non-master) processes `nothing` is retur
 The returned object should not be used for computations and only to extract data
 for post-processing and serialisation to disk.
 """
-function gather_kpts(basis)
+function gather_kpts(basis::PlaneWaveBasis)
     # No need to allocate and setup a new basis object
     mpi_nprocs(basis.comm_kpts) == 1 && return basis
 
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index ce6a35db56..e26def5fe8 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,7 +1,6 @@
 # Orthonormalize
-@timing function ortho_qr(φk::AbstractArray)
-    Q = qr(φk).Q
-    Q = convert(typeof(φk), Q)
+@timing function ortho_qr(φk::ArrayType) where {ArrayType <: AbstractArray}
+    Q = convert(ArrayType, qr(φk).Q)
     # CUDA bug: after the convert line, when φk is m*n rectangular matrix with m > n,
     # Q is not cropped ie only the first size(φk, 2) columns should be kept
     Q[:, 1:size(φk, 2)]
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index 2060025d15..bab452eb09 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -2,7 +2,7 @@
 # of allocations.
 function zeros_like(X::AbstractArray, T::Type=eltype(X), dims::Integer...=size(X)...)
     Z = similar(X, T, dims...)
-    Z .= 0
+    Z .= false
     Z
 end
 zeros_like(X::AbstractArray, dims::Integer...) = zeros_like(X, eltype(X), dims...)
@@ -12,14 +12,5 @@ zeros_like(X::StaticArray, T::Type=eltype(X), dims::Integer...=size(X)...) = @SA
 function convert_like(array_model::AbstractArray, src::AbstractArray)
     copy!(similar(array_model, eltype(src), size(src)...), src)
 end
-
 convert_like(array_model::Array, src::Array) = src
-
-function convert_like(array_model::Type, src::AbstractArray)
-    convert(array_model, src)
-end
-
-# function convert_like(array_model::Array, src::Array, T::Type=eltype(src), dims::Integer...=size(src)...)
-#     T == eltype(src) && dims == size(src) && src
-#     convert_like(array_model, src, T, dims...)
-# end
+convert_like(array_model::Type,  src::AbstractArray) = convert(array_model, src)
diff --git a/src/densities.jl b/src/densities.jl
index 7fb338d6d2..9ab81e0aba 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -23,7 +23,7 @@ using an optional `occupation_threshold`. By default all occupation numbers are
 @views @timing function compute_density(basis::PlaneWaveBasis{T}, ψ, occupation;
                                         occupation_threshold=zero(T)) where {T}
     S = promote_type(T, real(eltype(ψ[1])))
-    occupation = [Array(oc) for oc in occupation]  # GPU computation only: offload to CPU
+    occupation = [Array(oc) for oc in occupation]  # Bring to CPU if not yet done
 
     # we split the total iteration range (ik, n) in chunks, and parallelize over them
     mask_occ = map(occk -> findall(isless.(occupation_threshold, occk)), occupation)
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index 8122c029c7..fe0a593cae 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -58,7 +58,9 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     end
 
     # Transform results into a nicer datastructure
-    (λ=[Array(real.(res.λ)) for res in results],  # GPU computation only : get λ back on the CPU
+    # TODO It feels inconsistent to put λ onto the CPU here but none of the other objects
+    #      ... better have this handled by the caller of diagonalize_all_kblocks.
+    (λ=[Array(real.(res.λ)) for res in results],  # Get onto the CPU in any case
      X=[res.X for res in results],
      residual_norms=[res.residual_norms for res in results],
      iterations=[res.iterations for res in results],
diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl
index b1a4d4bad9..64a8a9d0bf 100644
--- a/src/external/jld2io.jl
+++ b/src/external/jld2io.jl
@@ -1,4 +1,3 @@
-
 function ScfSaveCheckpoints(filename="dftk_scf_checkpoint.jld2"; keep=false, overwrite=false)
     # TODO Save only every 30 minutes or so
     function callback(info)
diff --git a/src/fft.jl b/src/fft.jl
index 5c598705e5..5429c9b3ac 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -266,20 +266,9 @@ _fftw_flags(::Type{Float64}) = FFTW.MEASURE
 Plan a FFT of type `T` and size `fft_size`, spending some time on finding an
 optimal algorithm. (Inplace, out-of-place) x (forward, backward) FFT plans are returned.
 """
-# Default fallback, which will be used when doing GPU computations.
-function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{Float32,Float64}}
-    tmp = similar(array_type, Complex{T}, fft_size...)
-    ipFFT = AbstractFFTs.plan_fft!(tmp)
-    opFFT = AbstractFFTs.plan_fft(tmp)
-    # backward by inverting and stripping off normalizations
-    ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
-end
-
-# Specific CPU version, using flags to be a bit faster.
-function build_fft_plans(array_type::Array{T}, fft_size) where {T<:Union{Float32,Float64}}
-    tmp = Array{Complex{T}}(undef, fft_size...)
-    ipFFT = FFTW.plan_fft!(tmp, flags=_fftw_flags(T))
-    opFFT = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
+function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Union{Float32,Float64}}
+    ipFFT = AbstractFFTs.plan_fft!(tmp; flags=_fftw_flags(T))
+    opFFT = AbstractFFTs.plan_fft(tmp;  flags=_fftw_flags(T))
     # backward by inverting and stripping off normalizations
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
diff --git a/src/guess_density.jl b/src/guess_density.jl
index 9785b0a9a9..b47c22705b 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -93,20 +93,20 @@ which follow the functional form
 and are placed at `position` (in fractional coordinates).
 """
 function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
-    ρ = zeros_like(G_vectors(basis), complex(T), basis.fft_size...)
+    recip_lattice = basis.model.recip_lattice
+    fft_size      = basis.fft_size
+    ρ = zeros_like(G_vectors(basis), complex(T), fft_size...)
 
     isempty(gaussians) && return irfft(basis, ρ)
 
-    #These copies are required so that recip_lattice and gaussians are isbits (GPU compatibility)
-    recip_lattice = basis.model.recip_lattice
+    # This copy is required such that gaussians is isbits and can be transferred to the GPU
+    # TODO See if there is a better option here ... this feels non-ideal for larger systems
     gaussians = SVector{size(gaussians)[1]}(gaussians)
 
     # Fill ρ with the (unnormalized) Fourier transform, i.e. ∫ e^{-iGx} f(x) dx,
     # where f(x) is a weighted gaussian
     #
     # is formed from a superposition of atomic densities, each scaled by a prefactor
-
-    fft_size = basis.fft_size
     function build_ρ(G)
         if isnothing(index_G_vectors(fft_size, -G))
             return zero(complex(T))
diff --git a/src/interpolation.jl b/src/interpolation.jl
index c2a03952a9..c5658685ea 100644
--- a/src/interpolation.jl
+++ b/src/interpolation.jl
@@ -79,7 +79,7 @@ function interpolate_kpoint(data_in::AbstractVecOrMat,
     n_bands  = size(data_in, 2)
     n_Gk_out = length(G_vectors(basis_out, kpoint_out))
     data_out = similar(data_in, n_Gk_out, n_bands) .= 0
-    #TODO: use a map, or this will not be GPU compatible (scalar indexing)
+    # TODO: use a map, or this will not be GPU compatible (scalar indexing)
     for iin in 1:size(data_in, 1)
         idx_fft = kpoint_in.mapping[iin]
         idx_fft in keys(kpoint_out.mapping_inv) || continue
diff --git a/src/occupation.jl b/src/occupation.jl
index f60915b295..50c44a849f 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -29,7 +29,7 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [convert_like(basis.G_vectors,  # GPU computation only: put each occupation on GPU
+    [convert_like(basis.G_vectors,  # Put onto the GPU if we're using it
         filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature))
         for εk in eigenvalues]
 end
diff --git a/src/orbitals.jl b/src/orbitals.jl
index ec4e91bd28..d9ba1773cd 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -54,13 +54,12 @@ end
 unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
-    @static if VERSION < v"1.7"
-        # Don't use TaskLocalRNG as it is not available.
+    @static if VERSION < v"1.7"  # TaskLocalRNG not yet available.
         orbitals = randn(Complex{T}, length(G_vectors(basis, kpt)), howmany)
         orbitals = convert_like(basis.G_vectors, orbitals)
     else
         orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
-        randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function if using the GPU
+        randn!(TaskLocalRNG(), orbitals)  # use the RNG on the device if we're using a GPU
     end
     ortho_qr(orbitals)
 end
diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index 19e36e51b6..62f9854ce8 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -49,10 +49,8 @@ end
 
 @timing "KerkerMixing" function mix_density(mixing::KerkerMixing, basis::PlaneWaveBasis,
                                             δF; kwargs...)
-    T      = eltype(δF)
-    G²     = map(G_vectors_cart(basis)) do G
-                sum(abs2, G)
-            end
+    T  = eltype(δF)
+    G² = map(G -> sum(abs2, G), G_vectors_cart(basis))
     kTF    = T.(mixing.kTF)
     ΔDOS_Ω = T.(mixing.ΔDOS_Ω)
 
@@ -75,12 +73,7 @@ end
     δFtot_fourier  = total_density(δF_fourier)
     δFspin_fourier = spin_density(δF_fourier)
     δρtot_fourier = δFtot_fourier .* G² ./ (kTF.^2 .+ G²)
-    # force_real! is currently not GPU compatible, so we have to do this very ugly thing
-    # of calling back the array on CPU, running force_real!, then putting it back on GPU
-    δρtot_fourier = Array(δρtot_fourier)
     enforce_real!(basis, δρtot_fourier)
-    δρtot_fourier = convert_like(basis.G_vectors, δρtot_fourier)
-
     δρtot = irfft(basis, δρtot_fourier)
 
     # Copy DC component, otherwise it never gets updated
@@ -143,9 +136,7 @@ end
     εr > 1 / sqrt(eps(T)) && return mix_density(KerkerMixing(; kTF), basis, δF)
 
     C0 = 1 - εr
-    Gsq = map(G_vectors_cart(basis)) do G
-        sum(abs2, G)
-    end
+    Gsq = map(G -> sum(abs2, G), G_vectors_cart(basis))
     δF_fourier = fft(basis, δF)
     δρ = @. δF_fourier * (kTF^2 - C0 * Gsq) / (εr * kTF^2 - C0 * Gsq)
     δρ = irfft(basis, δρ)
diff --git a/src/scf/nbands_algorithm.jl b/src/scf/nbands_algorithm.jl
index 513ee15b03..df1f085935 100644
--- a/src/scf/nbands_algorithm.jl
+++ b/src/scf/nbands_algorithm.jl
@@ -68,8 +68,7 @@ function determine_n_bands(bands::AdaptiveBands, occupation::AbstractVector,
     # TODO Could return different bands per k-Points
 
     # Determine number of bands to be actually converged
-
-    occupation = [Array(oc) for oc in occupation]  # GPU computation only: bring occupation back
+    occupation = [Array(occk) for occk in occupation]  # Bring occupation back
     # on the CPU, or maximum (following line) will fail
     n_bands_occ = maximum(occupation) do occk
         something(findlast(fnk -> fnk ≥ bands.occupation_threshold, occk), length(occk) + 1)
diff --git a/src/symmetry.jl b/src/symmetry.jl
index af850c04c8..4a271d70e1 100644
--- a/src/symmetry.jl
+++ b/src/symmetry.jl
@@ -187,7 +187,7 @@ end
 end
 
 # Low-pass filters ρ (in Fourier) so that symmetry operations acting on it stay in the grid
-function lowpass_for_symmetry!(ρ, basis; symmetries=basis.symmetries)
+function lowpass_for_symmetry!(ρ::AbstractArray, basis; symmetries=basis.symmetries)
     for symop in symmetries
         isone(symop) && continue
         for (ig, G) in enumerate(G_vectors_generator(basis.fft_size))
@@ -198,6 +198,15 @@ function lowpass_for_symmetry!(ρ, basis; symmetries=basis.symmetries)
     end
     ρ
 end
+function lowpass_for_symmetry!(ρ::AT, basis;
+                               symmetries=basis.symmetries) where {AT <: AbstractGPUArray}
+    all(isone, symmetries) && return ρ
+    # The generic lowpass_for_symmetry! uses scalar indexing, so filter a host copy and
+    # write the result back into ρ: this is a mutating (`!`) function, so callers may
+    # ignore the return value and must still see the filtered data in ρ.
+    ρ_CPU = lowpass_for_symmetry!(Array(ρ), basis; symmetries)
+    copyto!(ρ, ρ_CPU)  # host -> device copy; returns ρ itself
+end
 
 """
 Symmetrize a density by applying all the basis (by default) symmetries and forming the average.
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index f98d9f7edb..6aa22f9844 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -30,17 +30,13 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     # Solving the Poisson equation ΔV = -4π ρ in Fourier space
     # is multiplying elementwise by 4π / |G|^2.
     poisson_green_coeffs = map(G_vectors_cart(basis)) do G
-        4T(π) /sum(abs2, G)
+        if iszero(G)
+            zero(T)  # Compensating charge background => zero DC
+        else
+            4T(π) / sum(abs2, G)
+        end
     end
-    poisson_green_coeffs[1:1,1:1,1:1] .= zero(similar(G_vectors(basis), T, 1,1,1))
-    ## Hackish way to do the following
-    # poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
-
-    # force_real! is currently not GPU compatible, so we have to do this very ugly thing
-    # of calling back the array on CPU, running force_real!, then putting it back on GPU
-    poisson_green_coeffs = Array(poisson_green_coeffs)
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
-    poisson_green_coeffs = convert_like(basis.G_vectors,poisson_green_coeffs)
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index 769d211a75..541d3958c6 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -19,8 +19,8 @@ struct TermKinetic <: Term
     kinetic_energies::Vector{<:AbstractVector}
 end
 function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor, blowup) where {T}
-    function build_kin(Gs, Ecut)
-        map(Gs) do Gk
+    function build_kin(Gks, Ecut)
+        map(Gks) do Gk
             T(scaling_factor) * sum(abs2, Gk) / 2 * blowup(norm(Gk), Ecut)
         end
     end
@@ -36,7 +36,7 @@ end
     if isnothing(ψ) || isnothing(occupation)
         return (; E=T(Inf), ops)
     end
-    occupation = [Array(oc) for oc in occupation]  # GPU computation only: put the occupations back on CPU
+    occupation = [Array(occk) for occk in occupation]  # Bring occupation to CPU memory
 
     E = zero(T)
     for (ik, ψk) in enumerate(ψ)
diff --git a/src/terms/local.jl b/src/terms/local.jl
index e739aa0f46..9d2fd70505 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -74,9 +74,10 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     # positions, this involves a form factor (`local_potential_fourier`)
     # and a structure factor e^{-i G·r}
     model = basis.model
-    # GPU computation only : put the Gs on CPU for compatibility with the
-    # pseudopotentials which are not isbits
     G_cart = Array(G_vectors_cart(basis))
+    # TODO Bring G_cart on the CPU for compatibility with the pseudopotentials which
+    #      are not isbits ... might be able to solve this by restructuring the loop
+
 
     # Pre-compute the form factors at unique values of |G| to speed up
     # the potential Fourier transform (by a lot). Using a hash map gives O(1)
@@ -91,9 +92,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
             end
         end
     end
-    # GPU computation only : put the Gs on CPU for compatibility with the
-    # pseudopotentials which are not isbits
-    Gs = Array(G_vectors(basis))
+
+    Gs = Array(G_vectors(basis))  # TODO Again for GPU compatibility
     pot_fourier = map(enumerate(Gs)) do (iG, G)
         q = norm(G_cart[iG])
         pot = sum(enumerate(model.atom_groups)) do (igroup, group)
@@ -103,7 +103,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
         pot / sqrt(model.unit_cell_volume)
     end
     enforce_real!(basis, pot_fourier)  # Symmetrize Fourier coeffs to have real iFFT
-    # GPU computation only : build the potential values on CPU then offload them to GPU
+
+    # Offload potential values to a device (like a GPU) and do the FFT
     pot_real = irfft(basis, convert_like(basis.G_vectors, pot_fourier))
 
     TermAtomicLocal(pot_real)
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index f176ede6b6..8bb61a3079 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -15,7 +15,7 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
     isempty(psp_groups) && return TermNoop()
     ops = map(basis.kpoints) do kpt
         P = build_projection_vectors_(basis, kpt, psps, psp_positions)
-        D = build_projection_coefficients_(T, psps, psp_positions, array_type = basis.G_vectors)
+        D = build_projection_coefficients_(T, psps, psp_positions, array_type=basis.G_vectors)
         NonlocalOperator(basis, kpt, P, D)
     end
     TermAtomicNonlocal(ops)
@@ -63,8 +63,8 @@ end
         C = build_projection_coefficients_(T, element.psp)
         for (ik, kpt) in enumerate(basis.kpoints)
             # we compute the forces from the irreductible BZ; they are symmetrized later
-            qs_cart = Gplusk_vectors_cart(basis, kpt)
             qs = Gplusk_vectors(basis, kpt)
+            qs_cart = Array(Gplusk_vectors_cart(basis, kpt))  # Get on the CPU
             form_factors = build_form_factors(element.psp, qs_cart)
             for idx in group
                 r = model.positions[idx]
@@ -92,7 +92,7 @@ end
 # The ordering of the projector indices is (A,l,m,i), where A is running over all
 # atoms, l, m are AM quantum numbers and i is running over all projectors for a
 # given l. The matrix is block-diagonal with non-zeros only if A, l and m agree.
-function build_projection_coefficients_(T, psps, psp_positions; array_type = Array)
+function build_projection_coefficients_(T, psps, psp_positions; array_type=Array)
     # TODO In the current version the proj_coeffs still has a lot of zeros.
     #      One could improve this by storing the blocks as a list or in a
     #      BlockDiagonal data structure
@@ -150,6 +150,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     n_proj = count_n_proj(psps, psp_positions)
     n_G    = length(G_vectors(basis, kpt))
     proj_vectors = zeros(Complex{T}, n_G, n_proj)
+    qs = Array(Gplusk_vectors(basis, kpt))  # Get Gs on the CPU if computed on a device
 
     # Compute the columns of proj_vectors = 1/√Ω pihat(k+G)
     # Since the pi are translates of each others, pihat(k+G) decouples as
@@ -158,13 +159,13 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     offset = 0  # offset into proj_vectors
     for (psp, positions) in zip(psps, psp_positions)
         # Compute position-independent form factors
-        form_factors = build_form_factors(psp, Gplusk_vectors_cart(basis, kpt))
+        qs_cart = Array(Gplusk_vectors_cart(basis, kpt))  # Get on the CPU if computed on device
+        form_factors = build_form_factors(psp, qs_cart)
 
         # Combine with structure factors
         for r in positions
             # k+G in this formula can also be G, this only changes an unimportant phase factor
-            Gs = Array(Gplusk_vectors(basis, kpt))  # GPU computation only: get Gs on CPU for the following map
-            structure_factors = map(q -> cis2pi(-dot(q, r)), Gs)
+            structure_factors = map(q -> cis2pi(-dot(q, r)), qs)
             @views for iproj = 1:count_n_proj(psp)
                 proj_vectors[:, offset+iproj] .= (
                     structure_factors .* form_factors[:, iproj] ./ sqrt(unit_cell_volume)
@@ -174,15 +175,15 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
         end
     end
     @assert offset == n_proj
-    # GPU computation only : build the vectors on CPU then offload them to the GPU
+
+    # Offload projection vectors to a device (like a GPU)
     convert_like(basis.G_vectors, proj_vectors)
 end
 
 """
 Build form factors (Fourier transforms of projectors) for an atom centered at 0.
 """
-function build_form_factors(psp, qs)
-    qs = Array(qs)  # GPU computation only : get qs back on CPU
+function build_form_factors(psp, qs::Array)
     T = real(eltype(first(qs)))
 
     # Pre-compute the radial parts of the non-local projectors at unique |q| to speed up
diff --git a/src/workarounds/fft_generic.jl b/src/workarounds/fft_generic.jl
index 993bca485b..8390678cde 100644
--- a/src/workarounds/fft_generic.jl
+++ b/src/workarounds/fft_generic.jl
@@ -23,13 +23,11 @@ end
 default_primes(::Any) = (2, )
 
 # Generic fallback function, Float32 and Float64 specialization in fft.jl
-function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T}
-    tmp = Array{Complex{T}}(undef, fft_size...)
-
+function build_fft_plans!(tmp::AbstractArray{T}) where {T <: Complex}
     # Note: FourierTransforms has no support for in-place FFTs at the moment
     # ... also it's extension to multi-dimensional arrays is broken and
     #     the algo only works for some cases
-    @assert all(ispow2, fft_size)
+    @assert all(ispow2, size(tmp))
 
     # opFFT = FourierTransforms.plan_fft(tmp)   # TODO When multidim works
     # opBFFT = inv(opFFT).p
diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index 77afd6e28f..091b1885a7 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -107,11 +107,9 @@ next_working_fft_size(::Type{<:ForwardDiff.Dual}, size::Int) = size
 
 _fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED
 
-function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}
-    tmp = Array{complex(T)}(undef, fft_size...) # TODO think about other Array types
+function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:ForwardDiff.Dual}
     opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
     opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T))
-
     ipFFT  = DummyInplace{typeof(opFFT)}(opFFT)
     ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
     # backward by inverting and stripping off normalizations

From 6dc6c82090dc397634f311690f38fce9d60c3837 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Tue, 8 Nov 2022 22:06:28 +0100
Subject: [PATCH 48/69] Fixes

---
 examples/gpu.jl                      | 12 ++++--------
 src/PlaneWaveBasis.jl                | 11 +++++++----
 src/fft.jl                           | 25 +++++++++++++++++--------
 src/workarounds/fft_generic.jl       |  4 ++--
 src/workarounds/forwarddiff_rules.jl |  7 ++-----
 5 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/examples/gpu.jl b/examples/gpu.jl
index 2819b075f0..fe7cba568c 100644
--- a/examples/gpu.jl
+++ b/examples/gpu.jl
@@ -10,15 +10,11 @@ atoms     = [Si, Si]
 positions = [ones(3)/8, -ones(3)/8]
 model = model_DFT(lattice, atoms, positions, []; temperature=1e-3)
 
-if has_cuda()
-    # Use CUDA to store DFT quantities and perform main computations
-    # For this we set the array_type for storing DFT quantities to a GPU array type
-    array_type = CuArray
-else
-    array_type = Array  # Keep using the CPU
-end
+# If available use CUDA to store DFT quantities and perform main computations
+# This is triggered by setting the array_type for storing DFT quantities
+array_type = has_cuda() ? CuArray : Array
 
 basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), array_type)
 scfres = self_consistent_field(basis; tol=1e-3,
-                               solver=scf_anderson_solver(),
+                               solver=scf_damping_solver(),
                                mixing=KerkerMixing())
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 9b9ff2a86f..3092ef8c17 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -264,8 +264,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     r_vectors = convert_like(array_type, r_vectors)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    basis = PlaneWaveBasis{T,value_type(T),typeof(Gs),typeof(r_vectors),
-                           typeof(G_vectors(kpoints[1]))}(
+    basis = PlaneWaveBasis{T, value_type(T), typeof(Gs), typeof(r_vectors),
+                           typeof(kpoints[1].G_vectors)}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
@@ -341,11 +341,12 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints
 end
 
 """
-    G_vectors(fft_size::Tuple)
+    G_vectors([array_type=Vector], fft_size::Tuple)
 
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
 """
+G_vectors(fft_size::Union{Tuple,AbstractVector}) = G_vectors(Array, fft_size)
 function G_vectors(array_type::Union{Type,AbstractArray}, fft_size::Union{Tuple,AbstractVector})
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
@@ -353,8 +354,9 @@ function G_vectors(array_type::Union{Type,AbstractArray}, fft_size::Union{Tuple,
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
     Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
-    convert_like(array_type, Gs)  # GPU computation only : offload the Gs to the GPU.
+    convert_like(array_type, Gs)  # Put data on the device (like GPU)
 end
+
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and
     # accumulate_over_symmetries!, which are 100-fold slower with G_vector(fft_size).
@@ -364,6 +366,7 @@ function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     (Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3])
 end
 
+
 @doc raw"""
     G_vectors(basis::PlaneWaveBasis)
     G_vectors(basis::PlaneWaveBasis, kpt::Kpoint)
diff --git a/src/fft.jl b/src/fft.jl
index 5429c9b3ac..559d518710 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -257,19 +257,28 @@ function compute_Glims_fast(lattice::AbstractMatrix{T}, Ecut; supersampling=2, t
     Glims
 end
 
-# For Float32 there are issues with aligned FFTW plans, so we
-# fall back to unaligned FFTW plans (which are generally discouraged).
-_fftw_flags(::Type{Float32}) = FFTW.MEASURE | FFTW.UNALIGNED
-_fftw_flags(::Type{Float64}) = FFTW.MEASURE
-
 """
 Plan a FFT of type `T` and size `fft_size`, spending some time on finding an
 optimal algorithm. (Inplace, out-of-place) x (forward, backward) FFT plans are returned.
 """
+function build_fft_plans!(tmp::Array{Complex{Float64}})
+    ipFFT = FFTW.plan_fft!(tmp; flags=FFTW.MEASURE)
+    opFFT = FFTW.plan_fft(tmp;  flags=FFTW.MEASURE)
+    # backwards-FFT by inverting and stripping off normalizations
+    ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
+end
+function build_fft_plans!(tmp::Array{Complex{Float32}})
+    # For Float32 there are issues with aligned FFTW plans, so we
+    # fall back to unaligned FFTW plans (which are generally discouraged).
+    ipFFT = FFTW.plan_fft!(tmp; flags=FFTW.MEASURE | FFTW.UNALIGNED)
+    opFFT = FFTW.plan_fft(tmp;  flags=FFTW.MEASURE | FFTW.UNALIGNED)
+    # backwards-FFT by inverting and stripping off normalizations
+    ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
+end
 function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Union{Float32,Float64}}
-    ipFFT = AbstractFFTs.plan_fft!(tmp; flags=_fftw_flags(T))
-    opFFT = AbstractFFTs.plan_fft(tmp;  flags=_fftw_flags(T))
-    # backward by inverting and stripping off normalizations
+    ipFFT = AbstractFFTs.plan_fft!(tmp)
+    opFFT = AbstractFFTs.plan_fft(tmp)
+    # backwards-FFT by inverting and stripping off normalizations
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
 
diff --git a/src/workarounds/fft_generic.jl b/src/workarounds/fft_generic.jl
index 8390678cde..a9b8c4ff0d 100644
--- a/src/workarounds/fft_generic.jl
+++ b/src/workarounds/fft_generic.jl
@@ -23,13 +23,13 @@ end
 default_primes(::Any) = (2, )
 
 # Generic fallback function, Float32 and Float64 specialization in fft.jl
-function build_fft_plans!(tmp::AbstractArray{T}) where {T <: Complex}
+function build_fft_plans!(tmp::AbstractArray{<:Complex})
     # Note: FourierTransforms has no support for in-place FFTs at the moment
     # ... also it's extension to multi-dimensional arrays is broken and
     #     the algo only works for some cases
     @assert all(ispow2, size(tmp))
 
-    # opFFT = FourierTransforms.plan_fft(tmp)   # TODO When multidim works
+    # opFFT = AbstractFFTs.plan_fft(tmp)   # TODO When multidim works
     # opBFFT = inv(opFFT).p
     opFFT  = generic_plan_fft(tmp)               # Fallback for now
     opBFFT = generic_plan_bfft(tmp)
diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index 091b1885a7..22c50c6114 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -105,14 +105,11 @@ end
 
 next_working_fft_size(::Type{<:ForwardDiff.Dual}, size::Int) = size
 
-_fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED
-
 function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:ForwardDiff.Dual}
-    opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
-    opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T))
+    opFFT  = AbstractFFTs.plan_fft(tmp)
+    opBFFT = AbstractFFTs.plan_bfft(tmp)
     ipFFT  = DummyInplace{typeof(opFFT)}(opFFT)
     ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
-    # backward by inverting and stripping off normalizations
     ipFFT, opFFT, ipBFFT, opBFFT
 end
 

From a6fe22b93512d24f0cc19d6e68de96bbe4eb50e8 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 9 Nov 2022 10:43:09 +0100
Subject: [PATCH 49/69] Remove array_type and use AbstractArchitecture
 structure instead

---
 examples/gpu.jl          |  4 ++--
 src/DFTK.jl              |  3 +++
 src/PlaneWaveBasis.jl    | 34 ++++++++++++++++++----------------
 src/architecture.jl      | 13 +++++++++++++
 src/common/zeros_like.jl |  4 ++++
 src/cuda_architecture.jl |  7 +++++++
 src/external/jld2io.jl   |  7 +++----
 7 files changed, 50 insertions(+), 22 deletions(-)
 create mode 100644 src/architecture.jl
 create mode 100644 src/cuda_architecture.jl

diff --git a/examples/gpu.jl b/examples/gpu.jl
index fe7cba568c..ad9773fc0d 100644
--- a/examples/gpu.jl
+++ b/examples/gpu.jl
@@ -12,9 +12,9 @@ model = model_DFT(lattice, atoms, positions, []; temperature=1e-3)
 
 # If available use CUDA to store DFT quantities and perform main computations
 # This is triggered by setting the array_type for storing DFT quantities
-array_type = has_cuda() ? CuArray : Array
+architecture = has_cuda() ? GPU(CuArray) : CPU()
 
-basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), array_type)
+basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), architecture)
 scfres = self_consistent_field(basis; tol=1e-3,
                                solver=scf_damping_solver(),
                                mixing=KerkerMixing())
diff --git a/src/DFTK.jl b/src/DFTK.jl
index bb2c7439fa..1418aaccf5 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -23,6 +23,7 @@ export Mat3
 export mpi_nprocs
 export mpi_master
 export setup_threading, disable_threading
+export CPU, GPU
 include("common/timer.jl")
 include("common/constants.jl")
 include("common/ortho.jl")
@@ -34,6 +35,7 @@ include("common/mpi.jl")
 include("common/threading.jl")
 include("common/printing.jl")
 include("common/cis2pi.jl")
+include("architecture.jl")
 include("common/zeros_like.jl")
 include("common/norm.jl")
 
@@ -237,6 +239,7 @@ function __init__()
     end
     @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba"  begin
         include("workarounds/cuda_arrays.jl")
+        include("cuda_architecture.jl")
     end
 end
 
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 3092ef8c17..c02c22127d 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -95,6 +95,9 @@ struct PlaneWaveBasis{T, VT, GT, RT, KGT} <: AbstractBasis{
     krange_allprocs::Vector{Vector{Int}}  # indices of kpoints treated by the
     #                                       respective rank in comm_kpts
 
+    ## Information on the hardware and device used for computations.
+    architecture::AbstractArchitecture
+
     ## Symmetry operations that leave the discretized model (k and r grids) invariant.
     # Subset of model.symmetries.
     symmetries::Vector{SymOp{VT}}
@@ -116,7 +119,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
 @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut;
                                variational=true,
-                               array_type::Union{Type,AbstractArray} = Array) where {T}
+                               architecture::AbstractArchitecture=CPU()) where {T}
     kpoints_per_spin = [Kpoint[] for _ in 1:model.n_spin_components]
     for k in kcoords
         k = Vec3{T}(k)  # rationals are sloooow
@@ -132,7 +135,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
                 push!(Gvecs_k, G)
             end
         end
-        Gvecs_k = convert_like(array_type, Gvecs_k)
+        Gvecs_k = convert_like(architecture, Gvecs_k)
 
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
@@ -140,12 +143,11 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
                   Kpoint{T,typeof(Gvecs_k)}(iσ, k, mapping, mapping_inv, Gvecs_k))
         end
     end
-
     vcat(kpoints_per_spin...)  # put all spin up first, then all spin down
 end
 function build_kpoints(basis::PlaneWaveBasis, kcoords)
     build_kpoints(basis.model, basis.fft_size, kcoords, basis.Ecut;
-                  variational=basis.variational, array_type = basis.G_vectors)
+                  variational=basis.variational, architecture = basis.architecture)
 end
 
 # Lowest-level constructor, should not be called directly.
@@ -154,7 +156,7 @@ end
 function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         kcoords, kweights, kgrid, kshift,
                         symmetries_respect_rgrid, comm_kpts,
-                        array_type::Union{Type,AbstractArray} = Array) where {T <: Real}
+                        architecture::AbstractArchitecture=CPU()) where {T <: Real}
     # Validate fft_size
     if variational
         max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
@@ -197,7 +199,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    Gs = G_vectors(array_type, fft_size)
+    Gs = G_vectors(architecture, fft_size)
     (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans!(similar(Gs, Complex{T}, fft_size))
 
     # Normalization constants
@@ -243,7 +245,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
         "Not all features of DFTK may be supported or work as intended."
     )
     kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut;
-                            variational, array_type)
+                            variational, architecture)
     # kpoints is now possibly twice the size of krange. Make things consistent
     if model.n_spin_components == 2
         krange_thisproc   = vcat(krange_thisproc, n_kpt .+ krange_thisproc)
@@ -261,7 +263,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     dvol  = model.unit_cell_volume ./ prod(fft_size)
     r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3)
                  for i = 1:N1, j = 1:N2, k = 1:N3]
-    r_vectors = convert_like(array_type, r_vectors)
+    r_vectors = convert_like(architecture, r_vectors)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
     basis = PlaneWaveBasis{T, value_type(T), typeof(Gs), typeof(r_vectors),
@@ -273,7 +275,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
         Gs, r_vectors,
         kpoints, kweights_thisproc, kgrid, kshift,
         kcoords_global, kweights_global, comm_kpts, krange_thisproc, krange_allprocs,
-        symmetries, symmetries_respect_rgrid, terms)
+        architecture, symmetries, symmetries_respect_rgrid, terms)
     # Instantiate the terms with the basis
     for (it, t) in enumerate(model.term_types)
         term_name = string(nameof(typeof(t)))
@@ -291,7 +293,7 @@ end
                                 variational=true, fft_size=nothing,
                                 kgrid=nothing, kshift=nothing,
                                 symmetries_respect_rgrid=isnothing(fft_size),
-                                comm_kpts=MPI.COMM_WORLD, array_type=Array) where {T <: Real}
+                                comm_kpts=MPI.COMM_WORLD, architecture=CPU()) where {T <: Real}
     if isnothing(fft_size)
         @assert variational
         if symmetries_respect_rgrid
@@ -309,7 +311,7 @@ end
         fft_size = compute_fft_size(model, Ecut, kcoords; factors)
     end
     PlaneWaveBasis(model, Ecut, fft_size, variational, kcoords, kweights,
-                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts, array_type)
+                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts, architecture)
 end
 
 @doc raw"""
@@ -341,20 +343,20 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints
 end
 
 """
-    G_vectors([array_type=Vector], fft_size::Tuple)
+    G_vectors([architecture=CPU()], fft_size::Tuple)
 
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
 """
-G_vectors(fft_size::Union{Tuple,AbstractVector}) = G_vectors(Array, fft_size)
-function G_vectors(array_type::Union{Type,AbstractArray}, fft_size::Union{Tuple,AbstractVector})
+G_vectors(fft_size::Union{Tuple,AbstractVector}) = G_vectors(CPU(), fft_size)
+function G_vectors(architecture::AbstractArchitecture, fft_size::Union{Tuple,AbstractVector})
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
     start = .- cld.(fft_size .- 1, 2)
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
     Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
-    convert_like(array_type, Gs)  # Put data on the device (like GPU)
+    convert_like(architecture, Gs)  # Put data on the device (like GPU)
 end
 
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
@@ -511,7 +513,7 @@ function gather_kpts(basis::PlaneWaveBasis)
                        basis.kshift,
                        basis.symmetries_respect_rgrid,
                        comm_kpts=MPI.COMM_SELF,
-                       array_type=basis.G_vectors)
+                       architecture=basis.architecture)
     end
 end
 
diff --git a/src/architecture.jl b/src/architecture.jl
new file mode 100644
index 0000000000..ed3786dc83
--- /dev/null
+++ b/src/architecture.jl
@@ -0,0 +1,13 @@
+"""
+Abstract supertype for architectures supported by DFTK.
+"""
+abstract type AbstractArchitecture end
+
+struct CPU <: AbstractArchitecture end
+
+get_array_type(::CPU) = Array
+
+"""
+Abstract supertype for GPU architectures in DFTK; concrete backends subtype it.
+"""
+abstract type GPU <: AbstractArchitecture end
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index bab452eb09..66acdb53f2 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -14,3 +14,7 @@ function convert_like(array_model::AbstractArray, src::AbstractArray)
 end
 convert_like(array_model::Array, src::Array) = src
 convert_like(array_model::Type,  src::AbstractArray) = convert(array_model, src)
+
+function convert_like(array_model::AbstractArchitecture, src::AbstractArray)
+    convert_like(get_array_type(array_model), src)
+end
diff --git a/src/cuda_architecture.jl b/src/cuda_architecture.jl
new file mode 100644
index 0000000000..8c1ffc031f
--- /dev/null
+++ b/src/cuda_architecture.jl
@@ -0,0 +1,7 @@
+"""
+Specialised architecture for NVIDIA CUDA GPUs.
+"""
+struct CUDAGPU <: GPU end
+GPU(::Type{CUDA.CuArray}) = CUDAGPU()
+
+get_array_type(::CUDAGPU) = CUDA.CuArray
diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl
index 64a8a9d0bf..a774834733 100644
--- a/src/external/jld2io.jl
+++ b/src/external/jld2io.jl
@@ -83,6 +83,7 @@ struct PlaneWaveBasisSerialisation{T <: Real, GT <: AbstractArray}
     kshift::Union{Nothing,Vec3{T}}
     symmetries_respect_rgrid::Bool
     fft_size::Tuple{Int, Int, Int}
+    architecture::AbstractArchitecture
 end
 function JLD2.writeas(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}}) where {T,GT,RT,KGT}
     PlaneWaveBasisSerialisation{T,GT}
@@ -100,6 +101,7 @@ function Base.convert(::Type{PlaneWaveBasisSerialisation{T,GT}},
         basis.kshift,
         basis.symmetries_respect_rgrid,
         basis.fft_size,
+        basis.architecture
     )
 end
 
@@ -111,8 +113,5 @@ function Base.convert(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}},
                    serial.kshift,
                    serial.symmetries_respect_rgrid,
                    serial.variational,
-                   array_type=similar(GT, 1, 1, 1))
-                   # Can't use GT directly as it is Array{type, 2} instead of Array
-                   # so we build an array with type GT. GT is the G_vectors'type, so it
-                   # represents 3-dimensional arrays, hence the three 1's.
+                   architecture=serial.architecture)
 end

From 866598c60d1187ee92409aaf06685f3348afe5e9 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 9 Nov 2022 11:31:00 +0100
Subject: [PATCH 50/69] Fixes

---
 src/PlaneWaveBasis.jl | 4 ++--
 src/common/ortho.jl   | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index c02c22127d..1abe645841 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -156,7 +156,7 @@ end
 function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         kcoords, kweights, kgrid, kshift,
                         symmetries_respect_rgrid, comm_kpts,
-                        architecture::AbstractArchitecture=CPU()) where {T <: Real}
+                        architecture::AbstractArchitecture) where {T <: Real}
     # Validate fft_size
     if variational
         max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
@@ -339,7 +339,7 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints
     PlaneWaveBasis(basis.model, basis.Ecut,
                    basis.fft_size, basis.variational,
                    kcoords, kweights, kgrid, kshift,
-                   basis.symmetries_respect_rgrid, basis.comm_kpts, basis.G_vectors)
+                   basis.symmetries_respect_rgrid, basis.comm_kpts, basis.architecture)
 end
 
 """
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index e26def5fe8..54c56b61e4 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -3,5 +3,6 @@
     Q = convert(ArrayType, qr(φk).Q)
     # CUDA bug: after the convert line, when φk is m*n rectangular matrix with m > n,
     # Q is not cropped ie only the first size(φk, 2) columns should be kept
+    # See https://github.com/JuliaGPU/CUDA.jl/pull/1662
     Q[:, 1:size(φk, 2)]
 end

From 0bcd3d13625670f1a29df19767384531e2bcf217 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 9 Nov 2022 12:55:30 +0100
Subject: [PATCH 51/69] Fix failing tests

---
 src/postprocess/stresses.jl | 2 +-
 src/supercell.jl            | 2 +-
 src/symmetry.jl             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/postprocess/stresses.jl b/src/postprocess/stresses.jl
index 5e0ac4a5ed..2e00dad7ae 100644
--- a/src/postprocess/stresses.jl
+++ b/src/postprocess/stresses.jl
@@ -19,7 +19,7 @@ Compute the stresses (= 1/Vol dE/d(M*lattice), taken at M=I) of an obtained SCF
                                    basis.Ecut, basis.fft_size, basis.variational,
                                    basis.kcoords_global, basis.kweights_global,
                                    basis.kgrid, basis.kshift, basis.symmetries_respect_rgrid,
-                                   basis.comm_kpts, basis.G_vectors)
+                                   basis.comm_kpts, basis.architecture)
         ρ = compute_density(new_basis, scfres.ψ, scfres.occupation)
         energies = energy_hamiltonian(new_basis, scfres.ψ, scfres.occupation;
                                       ρ, scfres.eigenvalues, scfres.εF).energies
diff --git a/src/supercell.jl b/src/supercell.jl
index c201c64a31..cd78d7d606 100644
--- a/src/supercell.jl
+++ b/src/supercell.jl
@@ -43,7 +43,7 @@ function cell_to_supercell(basis::PlaneWaveBasis)
                    ones(3),              # kgrid = Γ point only
                    basis.kshift,         # kshift
                    symmetries_respect_rgrid,
-                   basis.comm_kpts, basis.G_vectors)
+                   basis.comm_kpts, basis.architecture)
 end
 
 @doc raw"""
diff --git a/src/symmetry.jl b/src/symmetry.jl
index 4a271d70e1..e116279422 100644
--- a/src/symmetry.jl
+++ b/src/symmetry.jl
@@ -278,7 +278,7 @@ function unfold_bz(basis::PlaneWaveBasis)
                               basis.Ecut, basis.fft_size, basis.variational,
                               kcoords, [1/length(kcoords) for _ in kcoords],
                               basis.kgrid, basis.kshift,
-                              basis.symmetries_respect_rgrid, basis.comm_kpts, basis.G_vectors)
+                              basis.symmetries_respect_rgrid, basis.comm_kpts, basis.architecture)
     end
 end
 

From d13fa568303777c9832517f632051130c16edfa8 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 15 Nov 2022 10:59:27 +0100
Subject: [PATCH 52/69] Code clarification and better comments

---
 src/PlaneWaveBasis.jl    | 12 +++++++-----
 src/common/ortho.jl      |  3 +++
 src/common/zeros_like.jl |  9 ++++++---
 src/eigen/diag.jl        |  6 +++---
 src/guess_density.jl     |  4 ++--
 5 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 1abe645841..cdab0ab0cb 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -39,9 +39,10 @@ Normalization conventions:
 
 `ifft` and `fft` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT, GT, RT, KGT} <: AbstractBasis{
+struct PlaneWaveBasis{T, VT, T_G_vectors, T_r_vectors, T_kpt_G_vecs} <: AbstractBasis{
     T
-} where {VT <: Real, GT <: AbstractArray, RT <: AbstractArray, KGT <: AbstractArray}
+} where {VT <: Real, T_G_vectors <: AbstractArray{Vec3{Int}}, T_r_vectors <: AbstractArray,
+         T_kpt_G_vecs <: AbstractVector{Vec3{Int}}}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 
@@ -69,13 +70,13 @@ struct PlaneWaveBasis{T, VT, GT, RT, KGT} <: AbstractBasis{
     ifft_normalization::T  # ifft = ifft_normalization * BFFT
 
     # "cubic" basis in reciprocal and real space, on which potentials and densities are stored
-    G_vectors::GT
-    r_vectors::RT
+    G_vectors::T_G_vectors
+    r_vectors::T_r_vectors
 
     ## MPI-local information of the kpoints this processor treats
     # Irreducible kpoints. In the case of collinear spin,
     # this lists all the spin up, then all the spin down
-    kpoints::Vector{Kpoint{T, KGT}}
+    kpoints::Vector{Kpoint{T, T_kpt_G_vecs}}
     # BZ integration weights, summing up to model.n_spin_components
     kweights::Vector{T}
 
@@ -434,6 +435,7 @@ or the index `i` such that `G_vectors(basis, kpoint)[i] == G`.
 Returns nothing if outside the range of valid wave vectors.
 """
 @inline function index_G_vectors(fft_size::Tuple, G::AbstractVector{<:Integer})
+    # the inline declaration encourages the compiler to hoist these (G-independent) precomputations
     start = .- cld.(fft_size .- 1, 2)
     stop  = fld.(fft_size .- 1, 2)
     lengths = stop .- start .+ 1
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index 54c56b61e4..53b4a309e1 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -6,3 +6,6 @@
     # See https://github.com/JuliaGPU/CUDA.jl/pull/1662
     Q[:, 1:size(φk, 2)]
 end
+
+# CPU specialisation to go a bit faster
+@timing ortho_qr(φk::ArrayType) where {ArrayType <: Array} = Array(qr(φk).Q)
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index 66acdb53f2..0fc6407cfe 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -1,8 +1,11 @@
-# Create an array of same type as X filled with zeros, minimizing the number
-# of allocations.
+"""
+Create an array of same type as X filled with zeros, minimizing the number
+of allocations. This unifies CPU and GPU code, as the output will always be on the
+same device as the input.
+"""
 function zeros_like(X::AbstractArray, T::Type=eltype(X), dims::Integer...=size(X)...)
     Z = similar(X, T, dims...)
-    Z .= false
+    Z .= zero(T)
     Z
 end
 zeros_like(X::AbstractArray, dims::Integer...) = zeros_like(X, eltype(X), dims...)
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index fe0a593cae..10cef4807c 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -58,9 +58,9 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     end
 
     # Transform results into a nicer datastructure
-    # TODO It feels inconsistent to put λ onto the CPU here but none of the other object
-    #      ... better have this handled by the caller of diagonalize_all_kblocks.
-    (λ=[Array(real.(res.λ)) for res in results],  # Get onto the CPU in any case
+    # TODO It feels inconsistent to put λ onto the CPU here but none of the other objects.
+    #      Better have this handled by the caller of diagonalize_all_kblocks.
+    (λ=[Array(real.(res.λ)) for res in results],  # Always get onto the CPU
      X=[res.X for res in results],
      residual_norms=[res.residual_norms for res in results],
      iterations=[res.iterations for res in results],
diff --git a/src/guess_density.jl b/src/guess_density.jl
index b47c22705b..4ca742949d 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -99,7 +99,7 @@ function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
 
     isempty(gaussians) && return irfft(basis, ρ)
 
-    # This copy is required such that gaussians is isbits and can be transferred to the GPU
+    # This copy is required so that `gaussians` is isbits and can be transferred to the GPU
     # TODO See if there is a better option here ... this feels non-ideal for larger systems
     gaussians = SVector{size(gaussians)[1]}(gaussians)
 
@@ -107,6 +107,7 @@ function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
     # where f(x) is a weighted gaussian
     #
     # is formed from a superposition of atomic densities, each scaled by a prefactor
+    # Only set G-vectors that have a -G counterpart, so that ρ is real.
     function build_ρ(G)
         if isnothing(index_G_vectors(fft_size, -G))
             return zero(complex(T))
@@ -119,7 +120,6 @@ function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
         end
         res
     end
-    #  Can't use map! as the Gs are converted from an array of Vec3 to an array of complex
     ρ = map(build_ρ, basis.G_vectors)
 
     irfft(basis, ρ / sqrt(basis.model.unit_cell_volume))

From 085ca99deedb301d1c695134a625dad93e63859a Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 15 Nov 2022 11:46:08 +0100
Subject: [PATCH 53/69] Refactoring/code cleanup

---
 src/symmetry.jl               |  9 ---------
 src/terms/hartree.jl          | 13 +++++++------
 src/terms/kinetic.jl          | 15 +++++++++------
 src/workarounds/gpu_arrays.jl | 10 ++++++++++
 4 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/src/symmetry.jl b/src/symmetry.jl
index e116279422..51e1228bc2 100644
--- a/src/symmetry.jl
+++ b/src/symmetry.jl
@@ -198,15 +198,6 @@ function lowpass_for_symmetry!(ρ::AbstractArray, basis; symmetries=basis.symmet
     end
     ρ
 end
-function lowpass_for_symmetry!(ρ::AT, basis;
-                               symmetries=basis.symmetries) where {AT <: AbstractGPUArray}
-    all(isone, symmetries) && return ρ
-    # lowpass_for_symmetry! currently uses scalar indexing, so we have to do this very ugly
-    # thing for cases where ρ sits on a device (e.g. GPU)
-    ρ_CPU = Array(ρ)
-    ρ_CPU = lowpass_for_symmetry!(ρ_CPU, basis; symmetries)
-    convert(AT, ρ_CPU)
-end
 
 """
 Symmetrize a density by applying all the basis (by default) symmetries and forming the average.
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 6aa22f9844..3f29b05f30 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -1,3 +1,5 @@
+using GPUArraysCore
+
 """
 Hartree term: for a decaying potential V the energy would be
 
@@ -29,14 +31,13 @@ end
 function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     # Solving the Poisson equation ΔV = -4π ρ in Fourier space
     # is multiplying elementwise by 4π / |G|^2.
-    poisson_green_coeffs = map(G_vectors_cart(basis)) do G
-        if iszero(G)
-            zero(T)  # Compensating charge background => zero DC
-        else
-            4T(π) / sum(abs2, G)
-        end
+
+    GPUArraysCore.allowscalar() do
+        poisson_green_coeffs = 4T(π) ./ [sum(abs2, G) for G in G_vectors_cart(basis)]
+        poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
     end
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
+    poisson_green_coeffs = convert_like(basis.G_vectors, poisson_green_coeffs) # Move to GPU
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index 541d3958c6..cc8ab07300 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -19,16 +19,19 @@ struct TermKinetic <: Term
     kinetic_energies::Vector{<:AbstractVector}
 end
 function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor, blowup) where {T}
-    function build_kin(Gks, Ecut)
-        map(Gks) do Gk
-            T(scaling_factor) * sum(abs2, Gk) / 2 * blowup(norm(Gk), Ecut)
-        end
-    end
-    kinetic_energies = [build_kin(Gplusk_vectors_cart(basis, kpt), basis.Ecut)
+
+    kinetic_energies = [kinetic_energy(Gplusk_vectors_cart(basis, kpt), scaling_factor,
+                                        blowup, basis.Ecut, T)
                         for kpt in basis.kpoints]
     TermKinetic(T(scaling_factor), kinetic_energies)
 end
 
+function kinetic_energy(q, scaling_factor, blowup, Ecut, ::Type{T}) where {T}
+    map(q) do qk
+        T(scaling_factor) * sum(abs2, qk) / 2 * blowup(norm(qk), Ecut)
+    end
+end
+
 @timing "ene_ops: kinetic" function ene_ops(term::TermKinetic, basis::PlaneWaveBasis{T},
                                             ψ, occupation; kwargs...) where {T}
     ops = [FourierMultiplication(basis, kpoint, term.kinetic_energies[ik])
diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
index 5c9fc9a478..3f99a6c6ad 100644
--- a/src/workarounds/gpu_arrays.jl
+++ b/src/workarounds/gpu_arrays.jl
@@ -3,3 +3,13 @@ using GPUArraysCore
 
 # https://github.com/JuliaGPU/CUDA.jl/issues/1565
 LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y)
+
+function lowpass_for_symmetry!(ρ::AT, basis;
+    symmetries=basis.symmetries) where {AT <: AbstractGPUArray}
+    all(isone, symmetries) && return ρ
+    # lowpass_for_symmetry! currently uses scalar indexing, so we have to do this very ugly
+    # thing for cases where ρ sits on a device (e.g. GPU)
+    ρ_CPU = Array(ρ)
+    ρ_CPU = lowpass_for_symmetry!(ρ_CPU, basis; symmetries)
+    convert(AT, ρ_CPU)
+end

From 8c6ac54cf8b4f4935f73680729f32919ee30bf51 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 16 Nov 2022 09:32:04 +0100
Subject: [PATCH 54/69] Replace sum(abs2, ...) by norm2 function

---
 src/PlaneWaveBasis.jl        | 4 ++--
 src/common/norm.jl           | 5 +++++
 src/eigen/preconditioners.jl | 2 +-
 src/guess_density.jl         | 2 +-
 src/scf/chi0models.jl        | 2 +-
 src/scf/mixing.jl            | 4 ++--
 src/terms/anyonic.jl         | 6 +++---
 src/terms/ewald.jl           | 2 +-
 src/terms/hartree.jl         | 2 +-
 src/terms/kinetic.jl         | 2 +-
 src/terms/xc.jl              | 4 ++--
 test/PlaneWaveBasis.jl       | 2 +-
 12 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index cdab0ab0cb..8dd018884c 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -131,7 +131,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
         sizehint!(mapping, n_guess)
         sizehint!(Gvecs_k, n_guess)
         for (i, G) in enumerate(G_vectors(fft_size))
-            if !variational || sum(abs2, model.recip_lattice * (G + k)) / 2 ≤ Ecut
+            if !variational || norm2(model.recip_lattice * (G + k)) / 2 ≤ Ecut
                 push!(mapping, i)
                 push!(Gvecs_k, G)
             end
@@ -160,7 +160,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         architecture::AbstractArchitecture) where {T <: Real}
     # Validate fft_size
     if variational
-        max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
+        max_E = norm2(model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
         Ecut > max_E && @warn(
             "For a variational method, Ecut should be less than the maximal kinetic " *
             "energy the grid supports ($max_E)"
diff --git a/src/common/norm.jl b/src/common/norm.jl
index 4f96ad6146..20d4767d50 100644
--- a/src/common/norm.jl
+++ b/src/common/norm.jl
@@ -7,3 +7,8 @@ differentiation, we can extend analytically f to accept complex inputs, then dif
 inputs, and therefore we have to redefine it.
 """
 norm_cplx(x) = sqrt(sum(xx -> xx * xx, x))
+
+"""
+Compute the square of the ℓ²-norm for instances of our static structure Vec3.
+"""
+norm2(G::Vec3) = sum(abs2, G)
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index 699c9f8ef9..95bc8e19bb 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -42,7 +42,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     blowup = kinetic_term.blowup     # blowup for energy cut-off smearing
     Ecut = basis.Ecut
     kin = map(Gplusk_vectors_cart(basis, kpt)) do q
-        scaling * sum(abs2, q) /2 * blowup(norm(q), Ecut)
+        scaling * norm2(q) /2 * blowup(norm(q), Ecut)
     end
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
diff --git a/src/guess_density.jl b/src/guess_density.jl
index 4ca742949d..301bb21e8e 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -112,7 +112,7 @@ function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
         if isnothing(index_G_vectors(fft_size, -G))
             return zero(complex(T))
         end
-        Gsq = sum(abs2, recip_lattice * G)
+        Gsq = norm2(recip_lattice * G)
         res = zero(complex(T))
         for (coeff, decay_length, r) in gaussians
             form_factor::T = exp(-Gsq * T(decay_length)^2)
diff --git a/src/scf/chi0models.jl b/src/scf/chi0models.jl
index df77bda5d3..c3d5175ec5 100644
--- a/src/scf/chi0models.jl
+++ b/src/scf/chi0models.jl
@@ -55,7 +55,7 @@ function (χ0::DielectricModel)(basis; kwargs...)
     C0  = 1 - εr
     iszero(C0) && return nothing  # Will yield no contribution
 
-    Gsq = [sum(abs2, G) for G in G_vectors_cart(basis)]
+    Gsq = [norm2(G) for G in G_vectors_cart(basis)]
     apply_sqrtL = identity
     if χ0.localization != identity
         sqrtL = sqrt.(χ0.localization.(r_vectors(basis)))
diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index 62f9854ce8..3f601c09ac 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -50,7 +50,7 @@ end
 @timing "KerkerMixing" function mix_density(mixing::KerkerMixing, basis::PlaneWaveBasis,
                                             δF; kwargs...)
     T  = eltype(δF)
-    G² = map(G -> sum(abs2, G), G_vectors_cart(basis))
+    G² = map(G -> norm2(G), G_vectors_cart(basis))
     kTF    = T.(mixing.kTF)
     ΔDOS_Ω = T.(mixing.ΔDOS_Ω)
 
@@ -136,7 +136,7 @@ end
     εr > 1 / sqrt(eps(T)) && return mix_density(KerkerMixing(; kTF), basis, δF)
 
     C0 = 1 - εr
-    Gsq = map(G -> sum(abs2, G), G_vectors_cart(basis))
+    Gsq = map(G -> norm2(G), G_vectors_cart(basis))
     δF_fourier = fft(basis, δF)
     δρ = @. δF_fourier * (kTF^2 - C0 * Gsq) / (εr * kTF^2 - C0 * Gsq)
     δρ = irfft(basis, δρ)
diff --git a/src/terms/anyonic.jl b/src/terms/anyonic.jl
index 54b07df3be..2c55ea4d58 100644
--- a/src/terms/anyonic.jl
+++ b/src/terms/anyonic.jl
@@ -49,7 +49,7 @@ function make_div_free(basis::PlaneWaveBasis{T}, A) where {T}
         # project on divergence free fields, ie in Fourier
         # project A(G) on the orthogonal of G
         if iG != 1
-            out[1][iG], out[2][iG] = vec - (G'vec) * G / sum(abs2, G)
+            out[1][iG], out[2][iG] = vec - (G'vec) * G / norm2(G)
         else
             out[1][iG], out[2][iG] = vec
         end
@@ -114,7 +114,7 @@ function ene_ops(term::TermAnyonic, basis::PlaneWaveBasis{T}, ψ, occupation;
     ρ_fourier = fft(basis, ρ[:, :, :, 1])
     ρref_fourier = fft(basis, term.ρref)  # TODO optimize
     for (iG, G) in enumerate(G_vectors_cart(basis))
-        G2 = sum(abs2, G)
+        G2 = norm2(G)
         if G2 != 0
             A1[iG] = +2T(π) * G[2] / G2 * (ρ_fourier[iG] - ρref_fourier[iG]) * im
             A2[iG] = -2T(π) * G[1] / G2 * (ρ_fourier[iG] - ρref_fourier[iG]) * im
@@ -142,7 +142,7 @@ function ene_ops(term::TermAnyonic, basis::PlaneWaveBasis{T}, ψ, occupation;
     eff_pot_fourier = zeros(complex(T), basis.fft_size)
     for (iG, Gred) in enumerate(G_vectors(basis))
         G = basis.model.recip_lattice * Gred
-        G2 = sum(abs2, G)
+        G2 = norm2(G)
         if G2 != 0
             eff_pot_fourier[iG] += -4T(π)*β * im * G[2] / G2 * eff_current_fourier[1][iG]
             eff_pot_fourier[iG] += +4T(π)*β * im * G[1] / G2 * eff_current_fourier[2][iG]
diff --git a/src/terms/ewald.jl b/src/terms/ewald.jl
index ddd85bb172..79dc949bba 100644
--- a/src/terms/ewald.jl
+++ b/src/terms/ewald.jl
@@ -91,7 +91,7 @@ function energy_ewald(lattice::AbstractArray{T}, charges, positions;
     for G1 in -Glims[1]:Glims[1], G2 in -Glims[2]:Glims[2], G3 in -Glims[3]:Glims[3]
         G = Vec3(G1, G2, G3)
         iszero(G) && continue
-        Gsq = sum(abs2, recip_lattice * G)
+        Gsq = norm2(recip_lattice * G)
         cos_strucfac = sum(Z * cos2pi(dot(r, G)) for (r, Z) in zip(positions, charges))
         sin_strucfac = sum(Z * sin2pi(dot(r, G)) for (r, Z) in zip(positions, charges))
         sum_strucfac = cos_strucfac^2 + sin_strucfac^2
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 3f29b05f30..2fc437aa75 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -33,7 +33,7 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     # is multiplying elementwise by 4π / |G|^2.
 
     GPUArraysCore.allowscalar() do
-        poisson_green_coeffs = 4T(π) ./ [sum(abs2, G) for G in G_vectors_cart(basis)]
+        poisson_green_coeffs = 4T(π) ./ [norm2(G) for G in G_vectors_cart(basis)]
         poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
     end
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index cc8ab07300..dda208c670 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -28,7 +28,7 @@ end
 
 function kinetic_energy(q, scaling_factor, blowup, Ecut, ::Type{T}) where {T}
     map(q) do qk
-        T(scaling_factor) * sum(abs2, qk) / 2 * blowup(norm(qk), Ecut)
+        T(scaling_factor) * norm2(qk) / 2 * blowup(norm(qk), Ecut)
     end
 end
 
diff --git a/src/terms/xc.jl b/src/terms/xc.jl
index bccbf9ba2f..0e4ccb9836 100644
--- a/src/terms/xc.jl
+++ b/src/terms/xc.jl
@@ -97,7 +97,7 @@ end
         end
         if haskey(terms, :Vl) && any(x -> abs(x) > term.potential_threshold, terms.Vl)
             @warn "Meta-GGAs with a Δρ term have not yet been thoroughly tested." maxlog=1
-            mG² = [-sum(abs2, G) for G in G_vectors_cart(basis)]
+            mG² = [-norm2(G) for G in G_vectors_cart(basis)]
             Vl  = reshape(terms.Vl, n_spin, basis.fft_size...)
             Vl_fourier = fft(basis, Vl[s, :, :, :])
             # TODO: forcing real-valued ifft; should be enforced at creation of array
@@ -255,7 +255,7 @@ function LibxcDensities(basis, max_derivative::Integer, ρ, τ)
     # Compute Δρ
     if max_derivative > 1
         Δρ_real = similar(ρ_real, n_spin, basis.fft_size...)
-        mG² = [-sum(abs2, G) for G in G_vectors_cart(basis)]
+        mG² = [-norm2(G) for G in G_vectors_cart(basis)]
         for σ = 1:n_spin
             # TODO: forcing real-valued ifft; should be enforced at creation of array
             Δρ_real[σ, :, :, :] .= irfft(basis, mG² .* @view ρ_fourier[σ, :, :, :];
diff --git a/test/PlaneWaveBasis.jl b/test/PlaneWaveBasis.jl
index 78a41337fd..be29155e40 100644
--- a/test/PlaneWaveBasis.jl
+++ b/test/PlaneWaveBasis.jl
@@ -11,7 +11,7 @@ function test_pw_cutoffs(testcase, Ecut, fft_size)
 
     for (ik, kpt) in enumerate(basis.kpoints)
         for G in G_vectors(basis, kpt)
-            @test sum(abs2, model.recip_lattice * (kpt.coordinate + G)) ≤ 2 * Ecut
+            @test norm2(model.recip_lattice * (kpt.coordinate + G)) ≤ 2 * Ecut
         end
     end
 end

From a02cd21c39612ec7a49fa194b4674e6cd17d42de Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Wed, 16 Nov 2022 10:42:53 +0100
Subject: [PATCH 55/69] Bugfix: extend norm2 to any array

---
 src/common/norm.jl     | 4 ++--
 test/PlaneWaveBasis.jl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/common/norm.jl b/src/common/norm.jl
index 20d4767d50..0f79149d0d 100644
--- a/src/common/norm.jl
+++ b/src/common/norm.jl
@@ -9,6 +9,6 @@ inputs, and therefore we have to redefine it.
 norm_cplx(x) = sqrt(sum(xx -> xx * xx, x))
 
 """
-Compute the square of the ℓ²-norm for instances of our static structure Vec3.
+Square of the ℓ²-norm.
 """
-norm2(G::Vec3) = sum(abs2, G)
+norm2(G::AbstractArray) = sum(abs2, G)
diff --git a/test/PlaneWaveBasis.jl b/test/PlaneWaveBasis.jl
index be29155e40..72f5bd407f 100644
--- a/test/PlaneWaveBasis.jl
+++ b/test/PlaneWaveBasis.jl
@@ -11,7 +11,7 @@ function test_pw_cutoffs(testcase, Ecut, fft_size)
 
     for (ik, kpt) in enumerate(basis.kpoints)
         for G in G_vectors(basis, kpt)
-            @test norm2(model.recip_lattice * (kpt.coordinate + G)) ≤ 2 * Ecut
+            @test sum(abs2,model.recip_lattice * (kpt.coordinate + G)) ≤ 2 * Ecut
         end
     end
 end

From 349b8f1ed042ebaba9b10c310a9bcb0e44f8dc94 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 17 Nov 2022 14:58:58 +0100
Subject: [PATCH 56/69] Remove convert_like and add to_device

---
 examples/gpu.jl          |  2 +-
 src/PlaneWaveBasis.jl    |  6 +++---
 src/architecture.jl      |  9 ++++++++-
 src/common/zeros_like.jl | 10 ----------
 src/cuda_architecture.jl |  6 +++++-
 src/occupation.jl        |  2 +-
 src/orbitals.jl          |  2 +-
 src/terms/hartree.jl     |  2 +-
 src/terms/local.jl       |  2 +-
 src/terms/nonlocal.jl    |  8 ++++----
 10 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/examples/gpu.jl b/examples/gpu.jl
index ad9773fc0d..c6f052afac 100644
--- a/examples/gpu.jl
+++ b/examples/gpu.jl
@@ -11,7 +11,7 @@ positions = [ones(3)/8, -ones(3)/8]
 model = model_DFT(lattice, atoms, positions, []; temperature=1e-3)
 
 # If available use CUDA to store DFT quantities and perform main computations
-# This is triggered by setting the array_type for storing DFT quantities
+# This is triggered by setting the array type for storing DFT quantities
 architecture = has_cuda() ? GPU(CuArray) : CPU()
 
 basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), architecture)
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 8dd018884c..e7f1304f5a 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -136,7 +136,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
                 push!(Gvecs_k, G)
             end
         end
-        Gvecs_k = convert_like(architecture, Gvecs_k)
+        Gvecs_k = to_device(architecture, Gvecs_k)
 
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
@@ -264,7 +264,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     dvol  = model.unit_cell_volume ./ prod(fft_size)
     r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3)
                  for i = 1:N1, j = 1:N2, k = 1:N3]
-    r_vectors = convert_like(architecture, r_vectors)
+    r_vectors = to_device(architecture, r_vectors)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
     basis = PlaneWaveBasis{T, value_type(T), typeof(Gs), typeof(r_vectors),
@@ -357,7 +357,7 @@ function G_vectors(architecture::AbstractArchitecture, fft_size::Union{Tuple,Abs
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
     Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
-    convert_like(architecture, Gs)  # Put data on the device (like GPU)
+    to_device(architecture, Gs)  # Put data on the device (like GPU)
 end
 
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
diff --git a/src/architecture.jl b/src/architecture.jl
index ed3786dc83..75e31fe85b 100644
--- a/src/architecture.jl
+++ b/src/architecture.jl
@@ -5,7 +5,14 @@ abstract type AbstractArchitecture end
 
 struct CPU <: AbstractArchitecture end
 
-get_array_type(::CPU) = Array
+"""
+Transfer an array from a device (typically a GPU) to the CPU.
+"""
+to_cpu(x::AbstractArray) = Array(x)
+to_cpu(x::Array) = x
+
+to_device(::CPU, x) = to_cpu(x)
+
 
 """
 Generic, hardware independent architecture for DFTK.
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index 0fc6407cfe..724729b99a 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -11,13 +11,3 @@ end
 zeros_like(X::AbstractArray, dims::Integer...) = zeros_like(X, eltype(X), dims...)
 zeros_like(X::Array, T::Type=eltype(X), dims::Integer...=size(X)...) = zeros(T, dims...)
 zeros_like(X::StaticArray, T::Type=eltype(X), dims::Integer...=size(X)...) = @SArray zeros(T, dims...)
-
-function convert_like(array_model::AbstractArray, src::AbstractArray)
-    copy!(similar(array_model, eltype(src), size(src)...), src)
-end
-convert_like(array_model::Array, src::Array) = src
-convert_like(array_model::Type,  src::AbstractArray) = convert(array_model, src)
-
-function convert_like(array_model::AbstractArchitecture, src::AbstractArray)
-    convert_like(get_array_type(array_model), src)
-end
diff --git a/src/cuda_architecture.jl b/src/cuda_architecture.jl
index 8c1ffc031f..2c2da6c2b8 100644
--- a/src/cuda_architecture.jl
+++ b/src/cuda_architecture.jl
@@ -4,4 +4,8 @@ Specialised architecture for NVIDIA CUDA GPUs.
 struct CUDAGPU <: GPU end
 GPU(::Type{CUDA.CuArray}) = CUDAGPU()
 
-get_array_type(::CUDAGPU) = CUDA.CuArray
+"""
+Transfer an array from a device (typically the CPU) to the NVIDIA CUDA GPU.
+"""
+to_device(::CUDAGPU, x::AbstractArray) = CUDA.CuArray(x)
+to_device(::CUDAGPU, x::CUDA.CuArray) = x
diff --git a/src/occupation.jl b/src/occupation.jl
index 50c44a849f..465e1eb0c7 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -29,7 +29,7 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [convert_like(basis.G_vectors,  # Put onto the GPU if we're using it
+    [to_device(basis.architecture,  # Put onto the GPU if we're using it
         filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature))
         for εk in eigenvalues]
 end
diff --git a/src/orbitals.jl b/src/orbitals.jl
index d9ba1773cd..45596f4f02 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -56,7 +56,7 @@ unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
     @static if VERSION < v"1.7"  # TaskLocalRNG not yet available.
         orbitals = randn(Complex{T}, length(G_vectors(basis, kpt)), howmany)
-        orbitals = convert_like(basis.G_vectors, orbitals)
+        orbitals = to_device(basis.architecture, orbitals)
     else
         orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
         randn!(TaskLocalRNG(), orbitals)  # use the RNG on the device if we're using a GPU
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 2fc437aa75..dc3ec77a6c 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -37,7 +37,7 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
         poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
     end
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
-    poisson_green_coeffs = convert_like(basis.G_vectors, poisson_green_coeffs) # Move to GPU
+    poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs) # Move to GPU
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 9d2fd70505..daffb9c949 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -105,7 +105,7 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     enforce_real!(basis, pot_fourier)  # Symmetrize Fourier coeffs to have real iFFT
 
     # Offload potential values to a device (like a GPU) and do the FFT
-    pot_real = irfft(basis, convert_like(basis.G_vectors, pot_fourier))
+    pot_real = irfft(basis, to_device(basis.architecture, pot_fourier))
 
     TermAtomicLocal(pot_real)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 8bb61a3079..5085d05106 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -15,7 +15,7 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
     isempty(psp_groups) && return TermNoop()
     ops = map(basis.kpoints) do kpt
         P = build_projection_vectors_(basis, kpt, psps, psp_positions)
-        D = build_projection_coefficients_(T, psps, psp_positions, array_type=basis.G_vectors)
+        D = build_projection_coefficients_(T, psps, psp_positions, architecture=basis.architecture)
         NonlocalOperator(basis, kpt, P, D)
     end
     TermAtomicNonlocal(ops)
@@ -92,7 +92,7 @@ end
 # The ordering of the projector indices is (A,l,m,i), where A is running over all
 # atoms, l, m are AM quantum numbers and i is running over all projectors for a
 # given l. The matrix is block-diagonal with non-zeros only if A, l and m agree.
-function build_projection_coefficients_(T, psps, psp_positions; array_type=Array)
+function build_projection_coefficients_(T, psps, psp_positions; architecture=CPU())
     # TODO In the current version the proj_coeffs still has a lot of zeros.
     #      One could improve this by storing the blocks as a list or in a
     #      BlockDiagonal data structure
@@ -109,7 +109,7 @@ function build_projection_coefficients_(T, psps, psp_positions; array_type=Array
     @assert count == n_proj
 
     # GPU computation only : build the coefficients on CPU then offload them to the GPU
-    convert_like(array_type, proj_coeffs)
+    to_device(architecture, proj_coeffs)
 end
 
 # Builds the projection coefficient matrix for a single atom
@@ -177,7 +177,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     @assert offset == n_proj
 
     # Offload potential values to a device (like a GPU)
-    convert_like(basis.G_vectors, proj_vectors)
+    to_device(basis.architecture, proj_vectors)
 end
 
 """

From 778712d5e9b957d59171d00a1205d2994a6e4cf0 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 17 Nov 2022 15:10:20 +0100
Subject: [PATCH 57/69] Update comments and docstring

---
 examples/gpu.jl          | 2 +-
 src/DFTK.jl              | 1 -
 src/PlaneWaveBasis.jl    | 2 +-
 src/common/ortho.jl      | 2 +-
 src/common/zeros_like.jl | 2 +-
 src/occupation.jl        | 2 +-
 src/terms/hartree.jl     | 2 +-
 src/terms/nonlocal.jl    | 1 -
 8 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/examples/gpu.jl b/examples/gpu.jl
index c6f052afac..afad08fb7e 100644
--- a/examples/gpu.jl
+++ b/examples/gpu.jl
@@ -12,7 +12,7 @@ model = model_DFT(lattice, atoms, positions, []; temperature=1e-3)
 
 # If available use CUDA to store DFT quantities and perform main computations
 # This is triggered by setting the array type for storing DFT quantities
-architecture = has_cuda() ? GPU(CuArray) : CPU()
+architecture = has_cuda() ? DFTK.GPU(CuArray) : DFTK.CPU()
 
 basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), architecture)
 scfres = self_consistent_field(basis; tol=1e-3,
diff --git a/src/DFTK.jl b/src/DFTK.jl
index 1418aaccf5..56ce0008f6 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -23,7 +23,6 @@ export Mat3
 export mpi_nprocs
 export mpi_master
 export setup_threading, disable_threading
-export CPU, GPU
 include("common/timer.jl")
 include("common/constants.jl")
 include("common/ortho.jl")
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index e7f1304f5a..08fbfc8a1a 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -357,7 +357,7 @@ function G_vectors(architecture::AbstractArchitecture, fft_size::Union{Tuple,Abs
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
     Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
-    to_device(architecture, Gs)  # Put data on the device (like GPU)
+    to_device(architecture, Gs)
 end
 
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index 53b4a309e1..8685961e1c 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -7,5 +7,5 @@
     Q[:, 1:size(φk, 2)]
 end
 
-# CPU specialisation to go a bit faster
+# CPU specialisation to go a bit faster (skip the slicing)
 @timing ortho_qr(φk::ArrayType) where {ArrayType <: Array} = Array(qr(φk).Q)
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index 724729b99a..7de53ea4d0 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -1,5 +1,5 @@
 """
-Create an array of same type as X filled with zeros, minimizing the number
+Create an array of same "array type" as X filled with zeros, minimizing the number
 of allocations. This unifies CPU and GPU code, as the output will always be on the
 same device as the input.
 """
diff --git a/src/occupation.jl b/src/occupation.jl
index 465e1eb0c7..304d033566 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -29,7 +29,7 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [to_device(basis.architecture,  # Put onto the GPU if we're using it
+    [to_device(basis.architecture,
         filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature))
         for εk in eigenvalues]
 end
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index dc3ec77a6c..99493c8892 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -37,7 +37,7 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
         poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
     end
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
-    poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs) # Move to GPU
+    poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs)
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 5085d05106..50b0844767 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -108,7 +108,6 @@ function build_projection_coefficients_(T, psps, psp_positions; architecture=CPU
     end  # psp, r
     @assert count == n_proj
 
-    # GPU computation only : build the coefficients on CPU then offload them to the GPU
     to_device(architecture, proj_coeffs)
 end
 

From c39ee19e9786c1efdd0f9a3b5fbef79679eaf9fc Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Thu, 17 Nov 2022 15:24:43 +0100
Subject: [PATCH 58/69] Enforce the use of to_cpu instead of Array when doing
 GPU-CPU transfers

---
 src/densities.jl              | 3 ++-
 src/eigen/diag.jl             | 2 +-
 src/scf/nbands_algorithm.jl   | 4 ++--
 src/scf/potential_mixing.jl   | 2 +-
 src/terms/kinetic.jl          | 2 +-
 src/terms/local.jl            | 4 ++--
 src/terms/nonlocal.jl         | 6 +++---
 src/workarounds/gpu_arrays.jl | 5 ++---
 8 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/densities.jl b/src/densities.jl
index 9ab81e0aba..dd8b449ba0 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -23,7 +23,8 @@ using an optional `occupation_threshold`. By default all occupation numbers are
 @views @timing function compute_density(basis::PlaneWaveBasis{T}, ψ, occupation;
                                         occupation_threshold=zero(T)) where {T}
     S = promote_type(T, real(eltype(ψ[1])))
-    occupation = [Array(oc) for oc in occupation]  # Bring to CPU if not yet done
+    # occupation should be on the CPU as we are going to be doing scalar indexing.
+    occupation = [to_cpu(oc) for oc in occupation]
 
     # we split the total iteration range (ik, n) in chunks, and parallelize over them
     mask_occ = map(occk -> findall(isless.(occupation_threshold, occk)), occupation)
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index 10cef4807c..3de27ac29a 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -60,7 +60,7 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     # Transform results into a nicer datastructure
     # TODO It feels inconsistent to put λ onto the CPU here but none of the other objects.
     #      Better have this handled by the caller of diagonalize_all_kblocks.
-    (λ=[Array(real.(res.λ)) for res in results],  # Always get onto the CPU
+    (λ=[to_cpu(real.(res.λ)) for res in results],  # Always get onto the CPU
      X=[res.X for res in results],
      residual_norms=[res.residual_norms for res in results],
      iterations=[res.iterations for res in results],
diff --git a/src/scf/nbands_algorithm.jl b/src/scf/nbands_algorithm.jl
index df1f085935..a74c6aff61 100644
--- a/src/scf/nbands_algorithm.jl
+++ b/src/scf/nbands_algorithm.jl
@@ -68,8 +68,8 @@ function determine_n_bands(bands::AdaptiveBands, occupation::AbstractVector,
     # TODO Could return different bands per k-Points
 
     # Determine number of bands to be actually converged
-    occupation = [Array(occk) for occk in occupation]  # Bring occupation to CPU memory
-    # on the CPU, or maximum (following line) will fail
+    # Bring occupation onto the CPU, or findlast will fail
+    occupation = [to_cpu(occk) for occk in occupation]
     n_bands_occ = maximum(occupation) do occk
         something(findlast(fnk -> fnk ≥ bands.occupation_threshold, occk), length(occk) + 1)
     end
diff --git a/src/scf/potential_mixing.jl b/src/scf/potential_mixing.jl
index 6b729c5ff1..548333dd5b 100644
--- a/src/scf/potential_mixing.jl
+++ b/src/scf/potential_mixing.jl
@@ -61,7 +61,7 @@ function (anderson::AndersonAcceleration)(xₙ, αₙ, Pfxₙ)
 
     xₙ₊₁ = vec(xₙ) .+ αₙ .* vec(Pfxₙ)
     βs   = -(Mfac \ vec(Pfxₙ))
-    βs = Array(βs)  # GPU computation only : get βs back on the CPU so we can iterate through it
+    βs = to_cpu(βs)  # GPU computation only : get βs back on the CPU so we can iterate through it
     for (iβ, β) in enumerate(βs)
         xₙ₊₁ .+= β .* (xs[iβ] .- vec(xₙ) .+ αₙ .* (Pfxs[iβ] .- vec(Pfxₙ)))
     end
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index dda208c670..e07ba51313 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -39,7 +39,7 @@ end
     if isnothing(ψ) || isnothing(occupation)
         return (; E=T(Inf), ops)
     end
-    occupation = [Array(occk) for occk in occupation]  # Bring occupation to CPU memory
+    occupation = [to_cpu(occk) for occk in occupation]
 
     E = zero(T)
     for (ik, ψk) in enumerate(ψ)
diff --git a/src/terms/local.jl b/src/terms/local.jl
index daffb9c949..3a171aa8ec 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -74,7 +74,7 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     # positions, this involves a form factor (`local_potential_fourier`)
     # and a structure factor e^{-i G·r}
     model = basis.model
-    G_cart = Array(G_vectors_cart(basis))
+    G_cart = to_cpu(G_vectors_cart(basis))
     # TODO Bring G_cart on the CPU for compatibility with the pseudopotentials which
     #      are not isbits ... might be able to solve this by restructuring the loop
 
@@ -93,7 +93,7 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
         end
     end
 
-    Gs = Array(G_vectors(basis))  # TODO Again for GPU compatibility
+    Gs = to_cpu(G_vectors(basis))  # TODO Again for GPU compatibility
     pot_fourier = map(enumerate(Gs)) do (iG, G)
         q = norm(G_cart[iG])
         pot = sum(enumerate(model.atom_groups)) do (igroup, group)
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 50b0844767..921d8dc405 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -64,7 +64,7 @@ end
         for (ik, kpt) in enumerate(basis.kpoints)
             # we compute the forces from the irreductible BZ; they are symmetrized later
             qs = Gplusk_vectors(basis, kpt)
-            qs_cart = Array(Gplusk_vectors_cart(basis, kpt))  # Get on the CPU
+            qs_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
             form_factors = build_form_factors(element.psp, qs_cart)
             for idx in group
                 r = model.positions[idx]
@@ -149,7 +149,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     n_proj = count_n_proj(psps, psp_positions)
     n_G    = length(G_vectors(basis, kpt))
     proj_vectors = zeros(Complex{T}, n_G, n_proj)
-    qs = Array(Gplusk_vectors(basis, kpt))  # Get Gs on the CPU if computed on a device
+    qs = to_cpu(Gplusk_vectors(basis, kpt))
 
     # Compute the columns of proj_vectors = 1/√Ω pihat(k+G)
     # Since the pi are translates of each others, pihat(k+G) decouples as
@@ -158,7 +158,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     offset = 0  # offset into proj_vectors
     for (psp, positions) in zip(psps, psp_positions)
         # Compute position-independent form factors
-        qs_cart = Array(Gplusk_vectors_cart(basis, kpt))  # Get on the CPU if computed on device
+        qs_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
         form_factors = build_form_factors(psp, qs_cart)
 
         # Combine with structure factors
diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
index 3f99a6c6ad..55272d34c3 100644
--- a/src/workarounds/gpu_arrays.jl
+++ b/src/workarounds/gpu_arrays.jl
@@ -9,7 +9,6 @@ function lowpass_for_symmetry!(ρ::AT, basis;
     all(isone, symmetries) && return ρ
     # lowpass_for_symmetry! currently uses scalar indexing, so we have to do this very ugly
     # thing for cases where ρ sits on a device (e.g. GPU)
-    ρ_CPU = Array(ρ)
-    ρ_CPU = lowpass_for_symmetry!(ρ_CPU, basis; symmetries)
-    convert(AT, ρ_CPU)
+    ρ_CPU = lowpass_for_symmetry!(to_cpu(ρ), basis; symmetries)
+    to_device(basis.architecture, ρ_CPU)
 end

From 4501bd94ff53a96d8d7ed57c6f2d9c2131d4724f Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Fri, 18 Nov 2022 16:37:53 +0100
Subject: [PATCH 59/69] Small polishing

---
 src/DFTK.jl              |  1 -
 src/architecture.jl      | 16 +++++++++++-----
 src/cuda_architecture.jl | 11 -----------
 src/external/jld2io.jl   | 14 ++++++++------
 4 files changed, 19 insertions(+), 23 deletions(-)
 delete mode 100644 src/cuda_architecture.jl

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 56ce0008f6..2124ac1cb2 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -238,7 +238,6 @@ function __init__()
     end
     @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba"  begin
         include("workarounds/cuda_arrays.jl")
-        include("cuda_architecture.jl")
     end
 end
 
diff --git a/src/architecture.jl b/src/architecture.jl
index 75e31fe85b..499bd7ec45 100644
--- a/src/architecture.jl
+++ b/src/architecture.jl
@@ -5,16 +5,22 @@ abstract type AbstractArchitecture end
 
 struct CPU <: AbstractArchitecture end
 
+struct GPU{ArrayType <: AbstractArray} <: AbstractArchitecture end
+
+"""
+Construct a particular GPU architecture by passing the ArrayType
+"""
+GPU(::Type{T}) where {T <: AbstractArray} = GPU{T}()
+
 """
 Transfer an array from a device (typically a GPU) to the CPU.
 """
 to_cpu(x::AbstractArray) = Array(x)
 to_cpu(x::Array) = x
 
-to_device(::CPU, x) = to_cpu(x)
-
-
 """
-Generic, hardware independent architecture for DFTK.
+Transfer an array to a particular device (typically a GPU)
 """
-abstract type GPU <: AbstractArchitecture end
+to_device(::CPU, x) = to_cpu(x)
+to_device(::GPU{ArrayType}, x::AbstractArray) where {ArrayType} = ArrayType(x)
+to_device(::GPU{ArrayType}, x::ArrayType)     where {ArrayType} = x
diff --git a/src/cuda_architecture.jl b/src/cuda_architecture.jl
deleted file mode 100644
index 2c2da6c2b8..0000000000
--- a/src/cuda_architecture.jl
+++ /dev/null
@@ -1,11 +0,0 @@
-"""
-Specialised architecture for NVIDIA CUDA GPUs.
-"""
-struct CUDAGPU <: GPU end
-GPU(::Type{CUDA.CuArray}) = CUDAGPU()
-
-"""
-Transfer an array from a device (typically the CPU) to the NVIDIA CUDA GPU.
-"""
-to_device(::CUDAGPU, x::AbstractArray) = CUDA.CuArray(x)
-to_device(::CUDAGPU, x::CUDA.CuArray) = x
diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl
index a774834733..7afc385874 100644
--- a/src/external/jld2io.jl
+++ b/src/external/jld2io.jl
@@ -73,7 +73,7 @@ load_scfres(file::AbstractString) = JLD2.jldopen(load_scfres, file, "r")
 #
 # Custom serialisations
 #
-struct PlaneWaveBasisSerialisation{T <: Real, GT <: AbstractArray}
+struct PlaneWaveBasisSerialisation{T <: Real}
     model::Model{T,T}
     Ecut::T
     variational::Bool
@@ -86,12 +86,14 @@ struct PlaneWaveBasisSerialisation{T <: Real, GT <: AbstractArray}
     architecture::AbstractArchitecture
 end
 function JLD2.writeas(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}}) where {T,GT,RT,KGT}
-    PlaneWaveBasisSerialisation{T,GT}
+    # The GT, RT, KGT are uniquely determined by the architecture,
+    # which is stored in the basis.
+    PlaneWaveBasisSerialisation{T}
 end
 
-function Base.convert(::Type{PlaneWaveBasisSerialisation{T,GT}},
-                      basis::PlaneWaveBasis{T,T,GT}) where {T,GT}
-    PlaneWaveBasisSerialisation{T,GT}(
+function Base.convert(::Type{PlaneWaveBasisSerialisation{T}},
+                      basis::PlaneWaveBasis{T,T}) where {T}
+    PlaneWaveBasisSerialisation{T}(
         basis.model,
         basis.Ecut,
         basis.variational,
@@ -106,7 +108,7 @@ function Base.convert(::Type{PlaneWaveBasisSerialisation{T,GT}},
 end
 
 function Base.convert(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}},
-                      serial::PlaneWaveBasisSerialisation{T,GT}) where {T,GT,RT,KGT}
+                      serial::PlaneWaveBasisSerialisation{T}) where {T,GT,RT,KGT}
     PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords, serial.kweights;
                    serial.fft_size,
                    serial.kgrid,

From 60506b92a5e6c5562a83abc1207a19c0d1e38aac Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 21 Nov 2022 09:51:19 +0100
Subject: [PATCH 60/69] Formatting, updating comments and small nits

---
 examples/gpu.jl        | 3 +--
 src/Model.jl           | 2 +-
 src/PlaneWaveBasis.jl  | 2 +-
 src/common/norm.jl     | 2 +-
 src/densities.jl       | 3 +--
 src/occupation.jl      | 7 ++++---
 src/scf/mixing.jl      | 4 ++--
 src/terms/hartree.jl   | 2 --
 src/terms/local.jl     | 3 +--
 src/terms/nonlocal.jl  | 8 ++++----
 test/PlaneWaveBasis.jl | 2 +-
 11 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/examples/gpu.jl b/examples/gpu.jl
index afad08fb7e..a005591c30 100644
--- a/examples/gpu.jl
+++ b/examples/gpu.jl
@@ -10,8 +10,7 @@ atoms     = [Si, Si]
 positions = [ones(3)/8, -ones(3)/8]
 model = model_DFT(lattice, atoms, positions, []; temperature=1e-3)
 
-# If available use CUDA to store DFT quantities and perform main computations
-# This is triggered by setting the array type for storing DFT quantities
+# If available, use CUDA to store DFT quantities and perform main computations
 architecture = has_cuda() ? DFTK.GPU(CuArray) : DFTK.CPU()
 
 basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), architecture)
diff --git a/src/Model.jl b/src/Model.jl
index 5b89159318..d06fd3ee95 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -283,7 +283,7 @@ inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red
 For each of the function there is a one-argument version (returning a function to do the
 transformation) and a two-argument version applying the transformation to a passed vector.
 =#
-_closure_matmul(mat) = vec -> mat * vec
+@inline _closure_matmul(mat) = vec -> mat * vec
 
 vector_red_to_cart(model::Model)       = _closure_matmul(model.lattice)
 vector_cart_to_red(model::Model)       = _closure_matmul(model.inv_lattice)
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 08fbfc8a1a..4cd59e08a4 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -256,7 +256,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     @assert mpi_sum(sum(kweights_thisproc), comm_kpts) ≈ model.n_spin_components
     @assert length(kpoints) == length(kweights_thisproc)
 
-    if Gs isa AbstractGPUArray && Threads.nthreads() > 1
+    if architecture isa GPU && Threads.nthreads() > 1
         error("Can't mix multi-threading and GPU computations yet.")
     end
 
diff --git a/src/common/norm.jl b/src/common/norm.jl
index 0f79149d0d..3bccf28523 100644
--- a/src/common/norm.jl
+++ b/src/common/norm.jl
@@ -11,4 +11,4 @@ norm_cplx(x) = sqrt(sum(xx -> xx * xx, x))
 """
 Square of the ℓ²-norm.
 """
-norm2(G::AbstractArray) = sum(abs2, G)
+norm2(G) = sum(abs2, G)
diff --git a/src/densities.jl b/src/densities.jl
index dd8b449ba0..b65c98e111 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -39,11 +39,10 @@ using an optional `occupation_threshold`. By default all occupation numbers are
 
     @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length))
         Threads.@spawn for (ik, n) in chunk  # spawn a task per chunk
-            kpt = basis.kpoints[ik]
             ψnk_real = ψnk_real_chunklocal[ichunk]
             ρ_loc = ρ_chunklocal[ichunk]
-
             kpt = basis.kpoints[ik]
+
             ifft!(ψnk_real, basis, kpt, ψ[ik][:, n])
             ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real)
         end
diff --git a/src/occupation.jl b/src/occupation.jl
index 304d033566..0e2793ed3f 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -29,9 +29,10 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [to_device(basis.architecture,
-        filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature))
-        for εk in eigenvalues]
+    map(eigenvalues) do εk
+        occ = filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature)
+        to_device(basis.architecture, occ)
+    end
 end
 
 function compute_fermi_level(basis::PlaneWaveBasis{T}, eigenvalues; temperature) where {T}
diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index 3f601c09ac..22de626674 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -49,8 +49,8 @@ end
 
 @timing "KerkerMixing" function mix_density(mixing::KerkerMixing, basis::PlaneWaveBasis,
                                             δF; kwargs...)
-    T  = eltype(δF)
-    G² = map(G -> norm2(G), G_vectors_cart(basis))
+    T      = eltype(δF)
+    G²     = map(norm2, G_vectors_cart(basis))
     kTF    = T.(mixing.kTF)
     ΔDOS_Ω = T.(mixing.ΔDOS_Ω)
 
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 99493c8892..9667cb4468 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -1,5 +1,3 @@
-using GPUArraysCore
-
 """
 Hartree term: for a decaying potential V the energy would be
 
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 3a171aa8ec..f7aa4dc3ab 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -102,9 +102,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
         end
         pot / sqrt(model.unit_cell_volume)
     end
-    enforce_real!(basis, pot_fourier)  # Symmetrize Fourier coeffs to have real iFFT
 
-    # Offload potential values to a device (like a GPU) and do the FFT
+    enforce_real!(basis, pot_fourier)  # Symmetrize Fourier coeffs to have real iFFT
     pot_real = irfft(basis, to_device(basis.architecture, pot_fourier))
 
     TermAtomicLocal(pot_real)
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 921d8dc405..e9451afdf1 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -15,8 +15,8 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
     isempty(psp_groups) && return TermNoop()
     ops = map(basis.kpoints) do kpt
         P = build_projection_vectors_(basis, kpt, psps, psp_positions)
-        D = build_projection_coefficients_(T, psps, psp_positions, architecture=basis.architecture)
-        NonlocalOperator(basis, kpt, P, D)
+        D = build_projection_coefficients_(T, psps, psp_positions)
+        NonlocalOperator(basis, kpt, P, to_device(basis.architecture, D))
     end
     TermAtomicNonlocal(ops)
 end
@@ -92,7 +92,7 @@ end
 # The ordering of the projector indices is (A,l,m,i), where A is running over all
 # atoms, l, m are AM quantum numbers and i is running over all projectors for a
 # given l. The matrix is block-diagonal with non-zeros only if A, l and m agree.
-function build_projection_coefficients_(T, psps, psp_positions; architecture=CPU())
+function build_projection_coefficients_(T, psps, psp_positions)
     # TODO In the current version the proj_coeffs still has a lot of zeros.
     #      One could improve this by storing the blocks as a list or in a
     #      BlockDiagonal data structure
@@ -108,7 +108,7 @@ function build_projection_coefficients_(T, psps, psp_positions; architecture=CPU
     end  # psp, r
     @assert count == n_proj
 
-    to_device(architecture, proj_coeffs)
+    proj_coeffs
 end
 
 # Builds the projection coefficient matrix for a single atom
diff --git a/test/PlaneWaveBasis.jl b/test/PlaneWaveBasis.jl
index 72f5bd407f..78a41337fd 100644
--- a/test/PlaneWaveBasis.jl
+++ b/test/PlaneWaveBasis.jl
@@ -11,7 +11,7 @@ function test_pw_cutoffs(testcase, Ecut, fft_size)
 
     for (ik, kpt) in enumerate(basis.kpoints)
         for G in G_vectors(basis, kpt)
-            @test sum(abs2,model.recip_lattice * (kpt.coordinate + G)) ≤ 2 * Ecut
+            @test sum(abs2, model.recip_lattice * (kpt.coordinate + G)) ≤ 2 * Ecut
         end
     end
 end

From 147c81dfe143460604ea3c8d76cf1b8d3e77e22b Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 21 Nov 2022 10:41:12 +0100
Subject: [PATCH 61/69] Remove architecture argument from G_vectors + infer
 type in kinetic_energy

---
 src/PlaneWaveBasis.jl        |  8 +++-----
 src/eigen/preconditioners.jl |  7 +------
 src/terms/kinetic.jl         | 10 ++++++----
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 4cd59e08a4..defb20271e 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -200,7 +200,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    Gs = G_vectors(architecture, fft_size)
+    Gs = to_device(architecture, G_vectors(fft_size))
     (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans!(similar(Gs, Complex{T}, fft_size))
 
     # Normalization constants
@@ -349,15 +349,13 @@ end
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
 """
-G_vectors(fft_size::Union{Tuple,AbstractVector}) = G_vectors(CPU(), fft_size)
-function G_vectors(architecture::AbstractArchitecture, fft_size::Union{Tuple,AbstractVector})
+function G_vectors(fft_size::Union{Tuple,AbstractVector})
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
     start = .- cld.(fft_size .- 1, 2)
     stop  = fld.(fft_size .- 1, 2)
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
-    Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
-    to_device(architecture, Gs)
+    [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
 end
 
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index 95bc8e19bb..4eb8ef96be 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -38,12 +38,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     # TODO Annoying that one has to recompute the kinetic energies here. Perhaps
     #      it's better to pass a HamiltonianBlock directly and read the computed values.
     kinetic_term = only(kinetic_term)
-    scaling = kinetic_term.scaling_factor
-    blowup = kinetic_term.blowup     # blowup for energy cut-off smearing
-    Ecut = basis.Ecut
-    kin = map(Gplusk_vectors_cart(basis, kpt)) do q
-        scaling * norm2(q) /2 * blowup(norm(q), Ecut)
-    end
+    kin = kinetic_energy(kinetic_term, basis.Ecut, Gplusk_vectors_cart(basis, kpt))
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
 
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index e07ba51313..6ceb2a7f0d 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -19,18 +19,20 @@ struct TermKinetic <: Term
     kinetic_energies::Vector{<:AbstractVector}
 end
 function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor, blowup) where {T}
-
-    kinetic_energies = [kinetic_energy(Gplusk_vectors_cart(basis, kpt), scaling_factor,
-                                        blowup, basis.Ecut, T)
+    kinetic_energies = [kinetic_energy(blowup, scaling_factor, basis.Ecut,
+                                       Gplusk_vectors_cart(basis, kpt))
                         for kpt in basis.kpoints]
     TermKinetic(T(scaling_factor), kinetic_energies)
 end
 
-function kinetic_energy(q, scaling_factor, blowup, Ecut, ::Type{T}) where {T}
+function kinetic_energy(blowup, scaling_factor, Ecut, q::AbstractArray{Vec3{T}}) where {T}
     map(q) do qk
         T(scaling_factor) * norm2(qk) / 2 * blowup(norm(qk), Ecut)
     end
 end
+function kinetic_energy(kin::Kinetic, Ecut, q)
+    kinetic_energy(kin.blowup, kin.scaling_factor, Ecut, q)
+end
 
 @timing "ene_ops: kinetic" function ene_ops(term::TermKinetic, basis::PlaneWaveBasis{T},
                                             ψ, occupation; kwargs...) where {T}

From 8a40af5c97404b0e47f078e04afff949222a658e Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 21 Nov 2022 14:31:09 +0100
Subject: [PATCH 62/69] Inline _closure_matmatmul

---
 src/Model.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Model.jl b/src/Model.jl
index d06fd3ee95..b2f6fc4ecb 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -312,7 +312,7 @@ s_cart = L s_red = L A_red r_red = L A_red L⁻¹ r_cart, thus A_cart = L A_red
 Examples of matrices are the symmetries in real space (W)
 Examples of comatrices are the symmetries in reciprocal space (S)
 =#
-_closure_matmatmul(M, Minv) = mat -> M * mat * Minv
+@inline _closure_matmatmul(M, Minv) = mat -> M * mat * Minv
 
 matrix_red_to_cart(model::Model)   = _closure_matmatmul(model.lattice,      model.inv_lattice)
 matrix_cart_to_red(model::Model)   = _closure_matmatmul(model.inv_lattice,  model.lattice)

From 5ee546bd87658ba98c0675a35bb7b0d13c68e4b2 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Mon, 21 Nov 2022 16:38:22 +0100
Subject: [PATCH 63/69] Rename closures and small changes to ortho_qr and
 build_kpoints

---
 src/Model.jl          | 24 ++++++++++++------------
 src/PlaneWaveBasis.jl |  4 ++--
 src/common/ortho.jl   | 11 +----------
 src/fft.jl            |  5 +++--
 4 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/src/Model.jl b/src/Model.jl
index b2f6fc4ecb..9aa9c4cc66 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -283,14 +283,14 @@ inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red
 For each of the function there is a one-argument version (returning a function to do the
 transformation) and a two-argument version applying the transformation to a passed vector.
 =#
-@inline _closure_matmul(mat) = vec -> mat * vec
+@inline _gen_matmul(mat) = vec -> mat * vec
 
-vector_red_to_cart(model::Model)       = _closure_matmul(model.lattice)
-vector_cart_to_red(model::Model)       = _closure_matmul(model.inv_lattice)
-covector_red_to_cart(model::Model)     = _closure_matmul(model.inv_lattice')
-covector_cart_to_red(model::Model)     = _closure_matmul(model.lattice')
-recip_vector_red_to_cart(model::Model) = _closure_matmul(model.recip_lattice)
-recip_vector_cart_to_red(model::Model) = _closure_matmul(model.inv_recip_lattice)
+vector_red_to_cart(model::Model)       = _gen_matmul(model.lattice)
+vector_cart_to_red(model::Model)       = _gen_matmul(model.inv_lattice)
+covector_red_to_cart(model::Model)     = _gen_matmul(model.inv_lattice')
+covector_cart_to_red(model::Model)     = _gen_matmul(model.lattice')
+recip_vector_red_to_cart(model::Model) = _gen_matmul(model.recip_lattice)
+recip_vector_cart_to_red(model::Model) = _gen_matmul(model.inv_recip_lattice)
 
 vector_red_to_cart(model::Model, vec)       = vector_red_to_cart(model)(vec)
 vector_cart_to_red(model::Model, vec)       = vector_cart_to_red(model)(vec)
@@ -312,12 +312,12 @@ s_cart = L s_red = L A_red r_red = L A_red L⁻¹ r_cart, thus A_cart = L A_red
 Examples of matrices are the symmetries in real space (W)
 Examples of comatrices are the symmetries in reciprocal space (S)
 =#
-@inline _closure_matmatmul(M, Minv) = mat -> M * mat * Minv
+@inline _gen_matmatmul(M, Minv) = mat -> M * mat * Minv
 
-matrix_red_to_cart(model::Model)   = _closure_matmatmul(model.lattice,      model.inv_lattice)
-matrix_cart_to_red(model::Model)   = _closure_matmatmul(model.inv_lattice,  model.lattice)
-comatrix_red_to_cart(model::Model) = _closure_matmatmul(model.inv_lattice', model.lattice')
-comatrix_cart_to_red(model::Model) = _closure_matmatmul(model.lattice',     model.inv_lattice')
+matrix_red_to_cart(model::Model)   = _gen_matmatmul(model.lattice,      model.inv_lattice)
+matrix_cart_to_red(model::Model)   = _gen_matmatmul(model.inv_lattice,  model.lattice)
+comatrix_red_to_cart(model::Model) = _gen_matmatmul(model.inv_lattice', model.lattice')
+comatrix_cart_to_red(model::Model) = _gen_matmatmul(model.lattice',     model.inv_lattice')
 
 matrix_red_to_cart(model::Model, Ared)    = matrix_red_to_cart(model)(Ared)
 matrix_cart_to_red(model::Model, Acart)   = matrix_cart_to_red(model)(Acart)
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index defb20271e..b5b85d43ec 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -120,7 +120,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
 @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut;
                                variational=true,
-                               architecture::AbstractArchitecture=CPU()) where {T}
+                               architecture::AbstractArchitecture) where {T}
     kpoints_per_spin = [Kpoint[] for _ in 1:model.n_spin_components]
     for k in kcoords
         k = Vec3{T}(k)  # rationals are sloooow
@@ -148,7 +148,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 end
 function build_kpoints(basis::PlaneWaveBasis, kcoords)
     build_kpoints(basis.model, basis.fft_size, kcoords, basis.Ecut;
-                  variational=basis.variational, architecture = basis.architecture)
+                  variational=basis.variational, basis.architecture)
 end
 
 # Lowest-level constructor, should not be called directly.
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index 8685961e1c..ae3e0b3b76 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,11 +1,2 @@
 # Orthonormalize
-@timing function ortho_qr(φk::ArrayType) where {ArrayType <: AbstractArray}
-    Q = convert(ArrayType, qr(φk).Q)
-    # CUDA bug: after the convert line, when φk is m*n rectangular matrix with m > n,
-    # Q is not cropped ie only the first size(φk, 2) columns should be kept
-    # See https://github.com/JuliaGPU/CUDA.jl/pull/1662
-    Q[:, 1:size(φk, 2)]
-end
-
-# CPU specialisation to go a bit faster (skip the slicing)
-@timing ortho_qr(φk::ArrayType) where {ArrayType <: Array} = Array(qr(φk).Q)
+@timing ortho_qr(φk::ArrayType) where {ArrayType <: AbstractArray} = convert(ArrayType, qr(φk).Q)
diff --git a/src/fft.jl b/src/fft.jl
index 559d518710..587e5e4f5f 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -158,7 +158,8 @@ represented in the spherical basis sets, `supersampling` should be at least `2`.
 If `factors` is not empty, ensure that the resulting fft_size contains all the factors
 """
 function compute_fft_size(model::Model{T}, Ecut, kcoords=nothing;
-                          ensure_smallprimes=true, algorithm=:fast, factors=1, kwargs...) where {T}
+                          ensure_smallprimes=true, algorithm=:fast, factors=1,
+                          architecture = CPU(), kwargs...) where {T}
     if algorithm == :fast
         Glims = compute_Glims_fast(model.lattice, Ecut; kwargs...)
     elseif algorithm == :precise
@@ -171,7 +172,7 @@ function compute_fft_size(model::Model{T}, Ecut, kcoords=nothing;
         # fft_size needs to be final at k-point construction time
         Glims_temp    = compute_Glims_fast(model.lattice, Ecut; kwargs...)
         fft_size_temp = Tuple{Int, Int, Int}(2 .* Glims_temp .+ 1)
-        kpoints_temp  = build_kpoints(model, fft_size_temp, kcoords, Ecut)
+        kpoints_temp  = build_kpoints(model, fft_size_temp, kcoords, Ecut; architecture)
 
         Glims = compute_Glims_precise(model.lattice, Ecut, kpoints_temp; kwargs...)
     else

From b7b611a279383330ce61131c61b00dc0e89710fd Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Mon, 21 Nov 2022 17:40:07 +0100
Subject: [PATCH 64/69] Hard-code CPU architecture

---
 src/fft.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fft.jl b/src/fft.jl
index 587e5e4f5f..81155b2ddd 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -159,7 +159,7 @@ If `factors` is not empty, ensure that the resulting fft_size contains all the f
 """
 function compute_fft_size(model::Model{T}, Ecut, kcoords=nothing;
                           ensure_smallprimes=true, algorithm=:fast, factors=1,
-                          architecture = CPU(), kwargs...) where {T}
+                          kwargs...) where {T}
     if algorithm == :fast
         Glims = compute_Glims_fast(model.lattice, Ecut; kwargs...)
     elseif algorithm == :precise
@@ -172,7 +172,7 @@ function compute_fft_size(model::Model{T}, Ecut, kcoords=nothing;
         # fft_size needs to be final at k-point construction time
         Glims_temp    = compute_Glims_fast(model.lattice, Ecut; kwargs...)
         fft_size_temp = Tuple{Int, Int, Int}(2 .* Glims_temp .+ 1)
-        kpoints_temp  = build_kpoints(model, fft_size_temp, kcoords, Ecut; architecture)
+        kpoints_temp  = build_kpoints(model, fft_size_temp, kcoords, Ecut; architecture=CPU())
 
         Glims = compute_Glims_precise(model.lattice, Ecut, kpoints_temp; kwargs...)
     else

From cf3fbf8ad76e4d2bfca5c776678ffde66815abff Mon Sep 17 00:00:00 2001
From: Guillaume Vigne <guillaume.vigne007@gmail.com>
Date: Tue, 22 Nov 2022 15:20:26 +0100
Subject: [PATCH 65/69] Minor nits: broadcast norm2, more type inference

---
 src/PlaneWaveBasis.jl | 15 +++++++--------
 src/scf/chi0models.jl |  2 +-
 src/scf/mixing.jl     |  2 +-
 src/terms/hartree.jl  |  7 +++----
 src/terms/xc.jl       |  4 ++--
 5 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index b5b85d43ec..a36e96cb7a 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -15,7 +15,7 @@ Discretization information for ``k``-point-dependent quantities such as orbitals
 More generally, a ``k``-point is a block of the Hamiltonian;
 eg collinear spin is treated by doubling the number of kpoints.
 """
-struct Kpoint{T <: Real, GT <: AbstractArray}
+struct Kpoint{T <: Real, GT <: AbstractVector{Vec3{Int}}}
     spin::Int                     # Spin component can be 1 or 2 as index into what is
     #                             # returned by the `spin_components` function
     coordinate::Vec3{T}           # Fractional coordinate of k-point
@@ -39,10 +39,9 @@ Normalization conventions:
 
 `ifft` and `fft` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT, T_G_vectors, T_r_vectors, T_kpt_G_vecs} <: AbstractBasis{
-    T
-} where {VT <: Real, T_G_vectors <: AbstractArray{Vec3{Int}}, T_r_vectors <: AbstractArray,
-         T_kpt_G_vecs <: AbstractVector{Vec3{Int}}}
+struct PlaneWaveBasis{T, VT, T_G_vectors, T_r_vectors, T_kpt_G_vecs} <: AbstractBasis{T} where
+    {VT <: Real, T_G_vectors <: AbstractArray{Vec3{Int}},
+    T_r_vectors <: AbstractArray{Vec3}, T_kpt_G_vecs <: AbstractVector{Vec3{Int}}}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 
@@ -141,7 +140,7 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
             push!(kpoints_per_spin[iσ],
-                  Kpoint{T,typeof(Gvecs_k)}(iσ, k, mapping, mapping_inv, Gvecs_k))
+                  Kpoint(iσ, k, mapping, mapping_inv, Gvecs_k))
         end
     end
     vcat(kpoints_per_spin...)  # put all spin up first, then all spin down
@@ -399,7 +398,7 @@ end
 The list of ``G + k`` vectors, in reduced coordinates.
 """
 function Gplusk_vectors(basis::PlaneWaveBasis, kpt::Kpoint)
-    coordinate = kpt.coordinate  # Avoid closure on kpt (not isbits)
+    coordinate = kpt.coordinate  # Accelerator: avoid closure on kpt (not isbits)
     map(G -> G + coordinate, G_vectors(basis, kpt))
 end
 
@@ -450,7 +449,7 @@ Returns nothing if outside the range of valid wave vectors.
     end
 end
 
-@inline function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{<:Integer})
+function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{<:Integer})
     index_G_vectors(basis.fft_size, G)
 end
 
diff --git a/src/scf/chi0models.jl b/src/scf/chi0models.jl
index c3d5175ec5..1c478053fe 100644
--- a/src/scf/chi0models.jl
+++ b/src/scf/chi0models.jl
@@ -55,7 +55,7 @@ function (χ0::DielectricModel)(basis; kwargs...)
     C0  = 1 - εr
     iszero(C0) && return nothing  # Will yield no contribution
 
-    Gsq = [norm2(G) for G in G_vectors_cart(basis)]
+    Gsq = norm2.(G_vectors_cart(basis))
     apply_sqrtL = identity
     if χ0.localization != identity
         sqrtL = sqrt.(χ0.localization.(r_vectors(basis)))
diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index 22de626674..82507016f3 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -50,7 +50,7 @@ end
 @timing "KerkerMixing" function mix_density(mixing::KerkerMixing, basis::PlaneWaveBasis,
                                             δF; kwargs...)
     T      = eltype(δF)
-    G²     = map(norm2, G_vectors_cart(basis))
+    G²     = norm2.(G_vectors_cart(basis))
     kTF    = T.(mixing.kTF)
     ΔDOS_Ω = T.(mixing.ΔDOS_Ω)
 
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 9667cb4468..81372ce5f1 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -30,10 +30,9 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     # Solving the Poisson equation ΔV = -4π ρ in Fourier space
     # is multiplying elementwise by 4π / |G|^2.
 
-    GPUArraysCore.allowscalar() do
-        poisson_green_coeffs = 4T(π) ./ [norm2(G) for G in G_vectors_cart(basis)]
-        poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
-    end
+    poisson_green_coeffs = 4T(π) ./ norm2.(G_vectors_cart(basis))
+    GPUArraysCore.@allowscalar poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
+
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
     poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs)
 
diff --git a/src/terms/xc.jl b/src/terms/xc.jl
index 0e4ccb9836..cc63325817 100644
--- a/src/terms/xc.jl
+++ b/src/terms/xc.jl
@@ -97,7 +97,7 @@ end
         end
         if haskey(terms, :Vl) && any(x -> abs(x) > term.potential_threshold, terms.Vl)
             @warn "Meta-GGAs with a Δρ term have not yet been thoroughly tested." maxlog=1
-            mG² = [-norm2(G) for G in G_vectors_cart(basis)]
+            mG² = -norm2.(G_vectors_cart(basis))
             Vl  = reshape(terms.Vl, n_spin, basis.fft_size...)
             Vl_fourier = fft(basis, Vl[s, :, :, :])
             # TODO: forcing real-valued ifft; should be enforced at creation of array
@@ -255,7 +255,7 @@ function LibxcDensities(basis, max_derivative::Integer, ρ, τ)
     # Compute Δρ
     if max_derivative > 1
         Δρ_real = similar(ρ_real, n_spin, basis.fft_size...)
-        mG² = [-norm2(G) for G in G_vectors_cart(basis)]
+        mG² = -norm2.(G_vectors_cart(basis))
         for σ = 1:n_spin
             # TODO: forcing real-valued ifft; should be enforced at creation of array
             Δρ_real[σ, :, :, :] .= irfft(basis, mG² .* @view ρ_fourier[σ, :, :, :];

From ff0d9b08105cf63b7bbfa6b85e226cc965ba2d10 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Tue, 22 Nov 2022 14:29:18 +0100
Subject: [PATCH 66/69] Update xc.jl

---
 src/terms/xc.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/terms/xc.jl b/src/terms/xc.jl
index cc63325817..2babe2ed9b 100644
--- a/src/terms/xc.jl
+++ b/src/terms/xc.jl
@@ -97,7 +97,7 @@ end
         end
         if haskey(terms, :Vl) && any(x -> abs(x) > term.potential_threshold, terms.Vl)
             @warn "Meta-GGAs with a Δρ term have not yet been thoroughly tested." maxlog=1
-            mG² = -norm2.(G_vectors_cart(basis))
+            mG² = .-norm2.(G_vectors_cart(basis))
             Vl  = reshape(terms.Vl, n_spin, basis.fft_size...)
             Vl_fourier = fft(basis, Vl[s, :, :, :])
             # TODO: forcing real-valued ifft; should be enforced at creation of array
@@ -255,7 +255,7 @@ function LibxcDensities(basis, max_derivative::Integer, ρ, τ)
     # Compute Δρ
     if max_derivative > 1
         Δρ_real = similar(ρ_real, n_spin, basis.fft_size...)
-        mG² = -norm2.(G_vectors_cart(basis))
+        mG² = .-norm2.(G_vectors_cart(basis))
         for σ = 1:n_spin
             # TODO: forcing real-valued ifft; should be enforced at creation of array
             Δρ_real[σ, :, :, :] .= irfft(basis, mG² .* @view ρ_fourier[σ, :, :, :];

From d7be3a67a4fdd8e23ea525d583928f7a2c39a3da Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Tue, 22 Nov 2022 14:42:50 +0100
Subject: [PATCH 67/69] Reformat basis

---
 src/PlaneWaveBasis.jl | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index a36e96cb7a..2f1d46909d 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -39,9 +39,13 @@ Normalization conventions:
 
 `ifft` and `fft` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT, T_G_vectors, T_r_vectors, T_kpt_G_vecs} <: AbstractBasis{T} where
-    {VT <: Real, T_G_vectors <: AbstractArray{Vec3{Int}},
-    T_r_vectors <: AbstractArray{Vec3}, T_kpt_G_vecs <: AbstractVector{Vec3{Int}}}
+struct PlaneWaveBasis{T,
+                      VT <: Real,
+                      T_G_vectors  <: AbstractArray{Vec3{Int}, 3},
+                      T_r_vectors  <: AbstractArray{Vec3{T},   3},
+                      T_kpt_G_vecs <: AbstractVector{Vec3{Int}}
+                     } <: AbstractBasis{T}
+
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 

From 482d3b2df5f798977f1f7531006ef629dc16d2d7 Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Tue, 22 Nov 2022 14:43:57 +0100
Subject: [PATCH 68/69] Docs concurrency

---
 .github/workflows/documentation.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml
index 2252d866de..75c71319c4 100644
--- a/.github/workflows/documentation.yaml
+++ b/.github/workflows/documentation.yaml
@@ -6,6 +6,11 @@ on:
     tags:
       - 'v*'
   pull_request:
+concurrency:
+  # Skip intermediate builds: always.
+  # Cancel intermediate builds: only if it is a pull request build.
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 
 jobs:
   docs:

From 70704871a782e36271afe88b63cee88149a5fa6a Mon Sep 17 00:00:00 2001
From: "Michael F. Herbst" <info@michael-herbst.com>
Date: Tue, 22 Nov 2022 14:47:48 +0100
Subject: [PATCH 69/69] Value type instead of real type.

---
 src/PlaneWaveBasis.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 2f1d46909d..41d443ca78 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -42,7 +42,7 @@ Normalization conventions:
 struct PlaneWaveBasis{T,
                       VT <: Real,
                       T_G_vectors  <: AbstractArray{Vec3{Int}, 3},
-                      T_r_vectors  <: AbstractArray{Vec3{T},   3},
+                      T_r_vectors  <: AbstractArray{Vec3{VT},  3},
                       T_kpt_G_vecs <: AbstractVector{Vec3{Int}}
                      } <: AbstractBasis{T}