Skip to content

Commit 5bbefa9

Browse files
Implement FP16 mode
1 parent a2548d8 commit 5bbefa9

File tree

10 files changed

+722
-223
lines changed

10 files changed

+722
-223
lines changed

CMakeLists.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
44
set(CMAKE_CXX_STANDARD 20)
55
set(CMAKE_CUDA_STANDARD 20)
66

7-
project(TRT-NNScaler LANGUAGES CXX)
7+
project(TRT-NNScaler LANGUAGES CXX CUDA)
88

99
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
1010
if (CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)
11-
set(CMAKE_CUDA_ARCHITECTURES 53 62 72 87)
11+
set(CMAKE_CUDA_ARCHITECTURES 62 72 87)
1212
else ()
1313
set(CMAKE_CUDA_ARCHITECTURES 61 70 75 80 86 89 90)
1414
endif ()
@@ -50,10 +50,10 @@ endif ()
5050

5151
add_subdirectory(libyuv)
5252

53-
#add_library(reformat_cuda OBJECT reformat/reformat_cuda.h reformat/reformat.cu)
53+
add_library(reformat_cuda STATIC reformat/reformat_cuda.h reformat/reformat.cu)
5454

55-
add_library(reformat OBJECT reformat/reformat.h reformat/reformat_cuda.h reformat/reformat.cpp)
56-
target_link_libraries(reformat PUBLIC CUDA::cudart)
55+
add_library(reformat INTERFACE reformat/reformat.h reformat/reformat_cuda.h)
56+
target_link_libraries(reformat INTERFACE CUDA::cudart reformat_cuda)
5757

5858
set(SOURCE_FILES
5959
md_view.h

main.cpp

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <cstdlib>
1212

1313
#include "gflags/gflags.h"
14+
#include "cuda_fp16.h"
1415

1516
#include "nn-scaler.h"
1617
#include "logging.h"
@@ -20,8 +21,17 @@
2021
DEFINE_string(model_path, "models", "path to the folder to save model files");
2122

2223
InferenceSession *session = nullptr;
23-
pixel_importer_cpu *importer = nullptr;
24-
pixel_exporter_cpu_crop *exporter = nullptr;
24+
25+
int using_io = 0;
26+
27+
pixel_importer_cpu *importer_cpu = nullptr;
28+
pixel_exporter_cpu *exporter_cpu = nullptr;
29+
30+
pixel_importer_gpu<float> *importer_gpu = nullptr;
31+
pixel_exporter_gpu<float> *exporter_gpu = nullptr;
32+
33+
pixel_importer_gpu<half> *importer_gpu_fp16 = nullptr;
34+
pixel_exporter_gpu<half> *exporter_gpu_fp16 = nullptr;
2535

2636
static uint64_t total_processed = 0;
2737

@@ -88,9 +98,10 @@ static std::string handle_folder(const std::filesystem::path &input, chan &works
8898

8999
static Logger gLogger;
90100

91-
DEFINE_bool(fp16, false, "Use FP16 processing");
92-
DEFINE_bool(external, false, "Use external algorithms from cuDNN and cuBLAS");
93-
DEFINE_bool(low_mem, false, "Tweak configs to reduce memory consumption");
101+
DEFINE_bool(fp16, false, "use FP16 processing");
102+
DEFINE_bool(external, false, "use external algorithms from cuDNN and cuBLAS");
103+
DEFINE_bool(low_mem, false, "tweak configs to reduce memory consumption");
104+
DEFINE_string(reformatter, "auto", "reformatter used to import and export pixels: cpu, gpu, auto");
94105

95106
DECLARE_string(alpha);
96107
DEFINE_int32(tile_width, 512, "tile width");
@@ -154,7 +165,7 @@ void custom_prefix(std::ostream &s, const google::LogMessageInfo &l, void *) {
154165

155166
DECLARE_string(flagfile);
156167

157-
DEFINE_bool(cuda_lazy_load, true, "Enable CUDA lazying load.");
168+
DEFINE_bool(cuda_lazy_load, true, "enable CUDA lazying load.");
158169

159170
int32_t h_scale, w_scale;
160171

@@ -286,16 +297,34 @@ int wmain(int argc, wchar_t **wargv) {
286297
LOG(FATAL) << "different width and height scale ratio unimplemented.";
287298
}
288299

289-
if (FLAGS_fp16) {
290-
LOG(FATAL) << "FP16 mode unimplemented.";
291-
}
292-
293300
// ------------------------------
294301
// Import & Export
295302
auto max_size = size_t(max_width) * max_height;
296303

297-
importer = new pixel_importer_cpu(max_size, FLAGS_alpha != "ignore");
298-
exporter = new pixel_exporter_cpu_crop(h_scale * w_scale * max_size);
304+
if (FLAGS_reformatter == "auto") {
305+
FLAGS_reformatter = FLAGS_fp16 ? "gpu" : "cpu";
306+
}
307+
if (FLAGS_fp16 && FLAGS_reformatter == "cpu") {
308+
LOG(FATAL) << "CPU reformatter can not handle FP16.";
309+
}
310+
311+
if (FLAGS_reformatter == "cpu") {
312+
importer_cpu = new pixel_importer_cpu(max_size, FLAGS_alpha != "ignore");
313+
exporter_cpu = new pixel_exporter_cpu(h_scale * w_scale * max_size, FLAGS_alpha != "ignore");
314+
using_io = 0;
315+
} else if (FLAGS_reformatter == "gpu") {
316+
if (FLAGS_fp16) {
317+
importer_gpu_fp16 = new pixel_importer_gpu<half>(max_size, FLAGS_alpha != "ignore");
318+
exporter_gpu_fp16 = new pixel_exporter_gpu<half>(h_scale * w_scale * max_size, FLAGS_alpha != "ignore");
319+
using_io = 2;
320+
} else {
321+
importer_gpu = new pixel_importer_gpu<float>(max_size, FLAGS_alpha != "ignore");
322+
exporter_gpu = new pixel_exporter_gpu<float>(h_scale * w_scale * max_size, FLAGS_alpha != "ignore");
323+
using_io = 1;
324+
}
325+
} else {
326+
LOG(FATAL) << "Unknown reformatter.";
327+
}
299328

300329
chan works;
301330
std::thread pipeline(launch_pipeline, std::ref(works));

md_view.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,13 @@ void util_attrs copy(const md_view<T, DIMS> &dst, const md_view<T, DIMS> &src) {
442442
}
443443

444444
namespace detail {
445+
template<class T, class Memcpy>
446+
void util_attrs copy_impl(const md_uview<T, 1> &dst, const md_uview<const T, 1> &src, Memcpy cp) {
447+
for (int i = 0; i < dst.shape[0]; ++i) {
448+
cp(&dst.at(i), &src.at(i), sizeof(T));
449+
}
450+
}
451+
445452
template<class T, std::size_t DIMS, class Memcpy>
446453
void util_attrs copy_impl(const md_uview<T, DIMS> &dst, const md_uview<const T, DIMS> &src, Memcpy cp) {
447454
if (dst.at(0).is_contiguous() && src.at(0).is_contiguous()) {
@@ -455,13 +462,6 @@ void util_attrs copy_impl(const md_uview<T, DIMS> &dst, const md_uview<const T,
455462
}
456463
}
457464
}
458-
459-
template<class T, class Memcpy>
460-
void util_attrs copy_impl(const md_uview<T, 1> &dst, const md_uview<const T, 1> &src, Memcpy cp) {
461-
for (int i = 0; i < dst.shape[0]; ++i) {
462-
cp(dst.at(i).data, src.at(i).data, sizeof(T));
463-
}
464-
}
465465
}
466466

467467
template<class T, std::size_t DIMS, class Memcpy>

optimize.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,10 +118,11 @@ std::string OptimizationContext::optimize() {
118118

119119
network->getInput(0)->setName("input");
120120
network->getOutput(0)->setName("output");
121-
network->getInput(0)->setType(nvinfer1::DataType::kFLOAT);
122-
network->getOutput(0)->setType(nvinfer1::DataType::kFLOAT);
123121

124-
// auto ioDataType = config.use_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
122+
auto ioDataType = config.use_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
123+
network->getInput(0)->setType(ioDataType);
124+
network->getOutput(0)->setType(ioDataType);
125+
125126
auto height = config.input_height;
126127
auto width = config.input_width;
127128
auto batch = config.batch;

reformat/reformat.cpp

Lines changed: 0 additions & 54 deletions
This file was deleted.

0 commit comments

Comments (0)