diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 12c7cc62905c9..9e0ba87909001 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2066,31 +2066,35 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; } - Expected> - doJITPostProcessing(std::unique_ptr MB) const override { + Expected> doJITPostProcessing( + llvm::SmallVector> &&MB) const override { // TODO: We should try to avoid materialization but there seems to be no // good linker interface w/o file i/o. - SmallString<128> LinkerInputFilePath; - std::error_code EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", - "o", LinkerInputFilePath); - if (EC) - return Plugin::error(ErrorCode::HOST_IO, - "failed to create temporary file for linker"); - - // Write the file's contents to the output file. - Expected> OutputOrErr = - FileOutputBuffer::create(LinkerInputFilePath, MB->getBuffer().size()); - if (!OutputOrErr) - return OutputOrErr.takeError(); - std::unique_ptr Output = std::move(*OutputOrErr); - llvm::copy(MB->getBuffer(), Output->getBufferStart()); - if (Error E = Output->commit()) - return std::move(E); + llvm::SmallVector> InputFilenames; + for (auto &B : MB) { + SmallString<128> LinkerInputFilePath; + auto &Dest = InputFilenames.emplace_back(); + std::error_code EC = + sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "o", Dest); + if (EC) + return Plugin::error(ErrorCode::HOST_IO, + "failed to create temporary file for linker"); + + // Write the file's contents to the output file. + Expected> OutputOrErr = + FileOutputBuffer::create(Dest, B->getBuffer().size()); + if (!OutputOrErr) + return OutputOrErr.takeError(); + std::unique_ptr Output = std::move(*OutputOrErr); + llvm::copy(B->getBuffer(), Output->getBufferStart()); + if (Error E = Output->commit()) + return std::move(E); + } SmallString<128> LinkerOutputFilePath; - EC = sys::fs::createTemporaryFile("amdgpu-pre-link-jit", "so", - LinkerOutputFilePath); + std::error_code EC = sys::fs::createTemporaryFile( + "amdgpu-pre-link-jit", "so", LinkerOutputFilePath); if (EC) return Plugin::error(ErrorCode::HOST_IO, "failed to create temporary file for linker"); @@ -2105,15 +2109,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { "Using `%s` to link JITed amdgcn output.", LLDPath.c_str()); std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind(); - StringRef Args[] = {LLDPath, - "-flavor", - "gnu", - "--no-undefined", - "-shared", - MCPU, - "-o", - LinkerOutputFilePath.data(), - LinkerInputFilePath.data()}; + std::vector Args = { + LLDPath, "-flavor", "gnu", "--no-undefined", + "-shared", MCPU, "-o", LinkerOutputFilePath.data()}; + for (auto &N : InputFilenames) { + Args.push_back(N); + } std::string Error; int RC = sys::ExecuteAndWait(LLDPath, Args, std::nullopt, {}, 0, 0, &Error); @@ -2131,9 +2132,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (sys::fs::remove(LinkerOutputFilePath)) return Plugin::error(ErrorCode::HOST_IO, "failed to remove temporary output file for lld"); - if (sys::fs::remove(LinkerInputFilePath)) - return Plugin::error(ErrorCode::HOST_IO, - "failed to remove temporary input file for lld"); + for (auto &N : InputFilenames) { + if (sys::fs::remove(N)) + return Plugin::error(ErrorCode::HOST_IO, + "failed to remove temporary input file for lld"); + } return std::move(*BufferOrErr); } diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h index 8c530436a754b..1d6280a0af141 100644 --- a/offload/plugins-nextgen/common/include/JIT.h +++ b/offload/plugins-nextgen/common/include/JIT.h @@ -44,7 +44,7 @@ struct JITEngine { /// called. using PostProcessingFn = std::function>( - std::unique_ptr)>; + llvm::SmallVector> &&)>; JITEngine(Triple::ArchType TA); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 162b149ab483e..7824257d28e1f 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -934,9 +934,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual std::string getComputeUnitKind() const { return "unknown"; } /// Post processing after jit backend. The ownership of \p MB will be taken. - virtual Expected> - doJITPostProcessing(std::unique_ptr MB) const { - return std::move(MB); + virtual Expected> doJITPostProcessing( + llvm::SmallVector> &&MB) const { + if (MB.size() > 1) + return make_error( + error::ErrorCode::UNSUPPORTED, + "Plugin does not support linking multiple binaries"); + return std::move(MB[0]); } /// The minimum number of threads we use for a low-trip count combined loop. diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp index c82a06e36d8f9..835dcc0da2ec9 100644 --- a/offload/plugins-nextgen/common/src/JIT.cpp +++ b/offload/plugins-nextgen/common/src/JIT.cpp @@ -292,7 +292,9 @@ JITEngine::compile(const __tgt_device_image &Image, if (!ObjMBOrErr) return ObjMBOrErr.takeError(); - auto ImageMBOrErr = PostProcessing(std::move(*ObjMBOrErr)); + llvm::SmallVector> Buffers; + Buffers.push_back(std::move(*ObjMBOrErr)); + auto ImageMBOrErr = PostProcessing(std::move(Buffers)); if (!ImageMBOrErr) return ImageMBOrErr.takeError(); @@ -314,7 +316,8 @@ JITEngine::process(const __tgt_device_image &Image, target::plugin::GenericDeviceTy &Device) { const std::string &ComputeUnitKind = Device.getComputeUnitKind(); - PostProcessingFn PostProcessing = [&Device](std::unique_ptr MB) + PostProcessingFn PostProcessing = + [&Device](llvm::SmallVector> &&MB) -> Expected> { return Device.doJITPostProcessing(std::move(MB)); }; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 15193de6ae430..b916197bc5a6b 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -420,8 +420,14 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } - Expected> - doJITPostProcessing(std::unique_ptr MB) const override { + Expected> doJITPostProcessing( + llvm::SmallVector> &&MB) const override { + // TODO: This should be possible, just needs to be implemented + if (MB.size() > 1) + return make_error( + error::ErrorCode::UNIMPLEMENTED, + "CUDA plugin does not support linking multiple binaries"); + // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to // avoid the call to 'ptxas'. SmallString<128> PTXInputFilePath; @@ -433,11 +439,11 @@ struct CUDADeviceTy : public GenericDeviceTy { // Write the file's contents to the output file. Expected> OutputOrErr = - FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size()); + FileOutputBuffer::create(PTXInputFilePath, MB[0]->getBuffer().size()); if (!OutputOrErr) return OutputOrErr.takeError(); std::unique_ptr Output = std::move(*OutputOrErr); - llvm::copy(MB->getBuffer(), Output->getBufferStart()); + llvm::copy(MB[0]->getBuffer(), Output->getBufferStart()); if (Error E = Output->commit()) return std::move(E);