Add alignment parameter: handle models that have an input shape alignment requirement

tongyuantongyu · tongyuantongyu · commit 1cafbbd4089c · 2023-02-14T23:41:54.000+08:00
diff --git a/main.cpp b/main.cpp
@@ -97,6 +97,7 @@ DEFINE_int32(tile_width, 512, "tile width");
 DEFINE_int32(tile_height, 512, "tile height");
 DEFINE_int32(tile_pad, 16, "tile pad border to reduce tile block discontinuity");
 DEFINE_int32(extend_grace, 0, "grace limit to not split another tile");
+DECLARE_int32(alignment);
 
 void verify_flags() {
   if (!exists(std::filesystem::path(FLAGS_model_path))) {
@@ -116,6 +117,11 @@ void verify_flags() {
     LOG(FATAL) << "Invalid tile extend grace.";
   }
 
+  if (FLAGS_alignment < 1 || FLAGS_tile_width % FLAGS_alignment != 0 || FLAGS_tile_height % FLAGS_alignment != 0
+      || FLAGS_tile_pad % FLAGS_alignment != 0 || FLAGS_extend_grace % FLAGS_alignment != 0) {
+    LOG(FATAL) << "Invalid tile alignment.";
+  }
+
   auto ext_count = std::count(FLAGS_extensions.begin(), FLAGS_extensions.end(), ',');
   exts.reserve(ext_count + 1);
   exts.emplace_back(FLAGS_extensions);
diff --git a/reformat/reformat.cpp b/reformat/reformat.cpp
@@ -30,6 +30,7 @@ std::string pixel_exporter_cpu_crop::fetch_color(md_view<const float, 3> src, cu
     return std::string("CUDA error: ") + cudaGetErrorName(err);
   }
 
+  current_buffer_shape = src.shape;
   return "";
 }
 
@@ -40,6 +41,10 @@ std::string pixel_exporter_cpu_crop::fetch_alpha(md_view<const float, 3> src, cu
     return "dimension too big";
   }
 
+  if (current_buffer_shape != src.shape) {
+    return "incompatible color buffer shape";
+  }
+
   auto err = cudaMemcpyAsync(buffer_alpha.get(), src.data, h * w * 4, cudaMemcpyDeviceToHost, stream);
   if (err != cudaSuccess) {
     return std::string("CUDA error: ") + cudaGetErrorName(err);
diff --git a/reformat/reformat.h b/reformat/reformat.h
@@ -37,6 +37,7 @@ struct pad_descriptor {
 class pixel_exporter_cpu_crop {
   std::unique_ptr<float[]> buffer{};
   std::unique_ptr<float[]> buffer_alpha{};
+  shape_t<3> current_buffer_shape;
   size_t max_size;
 
  public:
@@ -82,21 +83,23 @@ std::string pixel_importer_cpu::import_color(md_view<float, 3> dst,
                                              md_uview<const U, 3> src,
                                              cudaStream_t stream,
                                              float quant) {
-  if (dst.shape.slice<1, 2>() != src.shape.template slice<0, 2>()) {
-    return "dimension mismatch";
-  }
-
   auto [h, w, c] = src.shape;
+  auto [dc, dh, dw] = dst.shape;
 
-  if (h * w > max_size) {
+  if (dh * dw > max_size) {
     return "dimension too big";
   }
 
+  if (h > dh || w > dw) {
+    return "incompatible dimension";
+  }
+
   if (quant == 0.0) {
     quant = 1.0 / float(std::numeric_limits<U>::max());
   }
 
   md_view<float, 3> tmp{buffer.get(), dst.shape};
+  md_view<float, 2> tmp_alpha{buffer_alpha.get(), {h, w}};
 
   if (c == 3) {
     for (size_t y = 0; y < h; ++y) {
@@ -118,7 +121,6 @@ std::string pixel_importer_cpu::import_color(md_view<float, 3> dst,
       }
     }
     else {
-      md_view<float, 2> tmp_alpha{buffer_alpha.get(), {h, w}};
       for (size_t y = 0; y < h; ++y) {
         for (size_t x = 0; x < w; ++x) {
           tmp.at(0, y, x) = static_cast<float>(src.at(y, x, 2)) * quant;
@@ -133,6 +135,39 @@ std::string pixel_importer_cpu::import_color(md_view<float, 3> dst,
     assert(false);
   }
 
+  for (size_t y = h; y < dh; ++y) {
+    for (size_t x = 0; x < w; ++x) {
+      tmp.at(0, y, x) = tmp.at(0, h - 1, x);
+      tmp.at(1, y, x) = tmp.at(1, h - 1, x);
+      tmp.at(2, y, x) = tmp.at(2, h - 1, x);
+      if (c == 4 && buffer_alpha) {
+        tmp_alpha.at(y, x) = tmp_alpha.at(h - 1, x);
+      }
+    }
+  }
+
+  for (size_t y = 0; y < h; ++y) {
+    for (size_t x = w; x < dw; ++x) {
+      tmp.at(0, y, x) = tmp.at(0, y, w - 1);
+      tmp.at(1, y, x) = tmp.at(1, y, w - 1);
+      tmp.at(2, y, x) = tmp.at(2, y, w - 1);
+      if (c == 4 && buffer_alpha) {
+        tmp_alpha.at(y, x) = tmp_alpha.at(y, w - 1);
+      }
+    }
+  }
+
+  for (size_t y = h; y < dh; ++y) {
+    for (size_t x = w; x < dw; ++x) {
+      tmp.at(0, y, x) = tmp.at(0, h - 1, w - 1);
+      tmp.at(1, y, x) = tmp.at(1, h - 1, w - 1);
+      tmp.at(2, y, x) = tmp.at(2, h - 1, w - 1);
+      if (c == 4 && buffer_alpha) {
+        tmp_alpha.at(y, x) = tmp_alpha.at(h - 1, w - 1);
+      }
+    }
+  }
+
   auto err = cudaMemcpyAsync(dst.data, tmp.data, dst.size() * 4, cudaMemcpyHostToDevice, stream);
   if (err != cudaSuccess) {
     return std::string("CUDA error: ") + cudaGetErrorName(err);
@@ -148,8 +183,13 @@ std::string pixel_exporter_cpu_crop::export_data(md_uview<U, 3> dst, pad_descrip
   }
 
   auto [he, we, c] = dst.shape;
-  md_uview<float, 3> tmp = md_view<float, 3>{buffer.get(), {c, he, we}};
-  md_uview<float, 2> tmp_alpha = md_view<float, 2>{buffer_alpha.get(), {he, we}};
+  auto [_, hs, ws] = current_buffer_shape;
+  if (he > hs || we > ws) {
+    return "incompatible dimension";
+  }
+
+  md_uview<float, 3> tmp = md_view<float, 3>{buffer.get(), current_buffer_shape};
+  md_uview<float, 2> tmp_alpha = md_view<float, 2>{buffer_alpha.get(), current_buffer_shape.slice<1, 2>()};
 
   offset_t shrink = pad.pad / 2;
   offset_t hb = pad.top ? 0 : shrink;
diff --git a/workers.cpp b/workers.cpp
@@ -5,7 +5,6 @@
 #include <array>
 #include <memory>
 #include <iostream>
-#include <cmath>
 
 #include "nn-scaler.h"
 #include "reformat/reformat.h"
@@ -178,6 +177,12 @@ DECLARE_int32(tile_width);
 DECLARE_int32(tile_height);
 DECLARE_int32(tile_pad);
 DECLARE_int32(extend_grace);
+DEFINE_int32(alignment, 1, "model input alignment requirement");
+
+static offset_t align(offset_t n, size_t alignment) {  // Rounds n up to a multiple of alignment; assumes n >= 0 and alignment >= 1 (verify_flags rejects alignment < 1) -- TODO confirm offset_t signedness.
+  n += alignment - 1;  // Bias n so that any non-zero remainder carries it past the next multiple.
+  return n - (n % alignment);  // Drop the remainder, leaving a multiple of alignment that is >= the original n.
+}
 
 static void pixel_import_worker(ichan &in, ichan &out) {
   bool nn_alpha = FLAGS_alpha == "nn";
@@ -192,19 +197,23 @@ static void pixel_import_worker(ichan &in, ichan &out) {
 
     auto [h, w, c] = ctx.in_image.shape;
     auto process_alpha = nn_alpha && c == 4;
+    offset_t h_split = align(h, FLAGS_alignment), w_split = align(w, FLAGS_alignment);
 
     split_range<offset_t>(
-        h, FLAGS_tile_height, FLAGS_tile_pad, FLAGS_extend_grace,
-        [&, w = w](offset_t y, offset_t th, bool h_beg, bool h_end) {
+        h_split, FLAGS_tile_height, FLAGS_tile_pad, FLAGS_extend_grace,
+        [&, h = h, w = w](offset_t y, offset_t th, bool h_beg, bool h_end) {
           return split_range<offset_t>(
-              w, FLAGS_tile_width, FLAGS_tile_pad, FLAGS_extend_grace,
+              w_split, FLAGS_tile_width, FLAGS_tile_pad, FLAGS_extend_grace,
               [&](offset_t x, offset_t tw, bool w_beg, bool w_end) -> bool {
                 auto tile_start = hr_clock::now();
 
                 md_view<float, 3> input_tensor = {reinterpret_cast<float *>(session->input), {3, th, tw}};
-                importer->import_color(input_tensor,
-                                       ctx.in_image.slice<0>(y, y + th).slice<1>(x, x + tw),
+                auto ret = importer->import_color(input_tensor,
+                                       ctx.in_image.slice<0>(y, std::min(y + th, h)).slice<1>(x, std::min(x + tw, w)),
                                        session->stream);
+                if (!ret.empty()) {
+                  LOG(FATAL) << "Unexpected error importing pixel: " << ret;
+                }
 
                 WorkContextInternal tile_ctx{
                     .tile_start = tile_start,
@@ -244,7 +253,11 @@ static void pixel_import_worker(ichan &in, ichan &out) {
 
                 if (process_alpha) {
                   auto alpha_start = hr_clock::now();
-                  importer->import_alpha(input_tensor, session->stream);
+                  ret = importer->import_alpha(input_tensor, session->stream);
+                  if (!ret.empty()) {
+                    LOG(FATAL) << "Unexpected error importing pixel: " << ret;
+                  }
+
                   tile_ctx = {
                       .tile_start = alpha_start,
                       .y = y, .x = x, .th = th, .tw = tw,
@@ -331,11 +344,15 @@ static void pixel_export_worker(ichan &in, ichan &out) {
     auto ctx = std::move(*i);
     md_view<float, 3> output_tensor =
         {reinterpret_cast<float *>(session->output), {3, ctx.th * h_scale, ctx.tw * w_scale}};
+    std::string ret;
     if (ctx.is_alpha) {
-      exporter->fetch_alpha(output_tensor, session->stream);
+      ret = exporter->fetch_alpha(output_tensor, session->stream);
     }
     else {
-      exporter->fetch_color(output_tensor, session->stream);
+      ret = exporter->fetch_color(output_tensor, session->stream);
+    }
+    if (!ret.empty()) {
+      LOG(FATAL) << "Unexpected error fetching result pixel: " << ret;
     }
 
     auto err = cudaStreamSynchronize(session->stream);
@@ -358,11 +375,15 @@ static void pixel_export_worker(ichan &in, ichan &out) {
     }
 
     pad_descriptor pad_desc{FLAGS_tile_pad * h_scale, ctx.h_beg, ctx.h_end, ctx.w_beg, ctx.w_end};
+    auto [h, w, _] = ctx.out_image.shape;
     auto out_tile = ctx.out_image
-        .slice<0>(h_scale * ctx.y, h_scale * (ctx.y + ctx.th))
-        .slice<1>(w_scale * ctx.x, w_scale * (ctx.x + ctx.tw));
+        .slice<0>(h_scale * ctx.y, std::min(h_scale * (ctx.y + ctx.th), h))
+        .slice<1>(w_scale * ctx.x, std::min(w_scale * (ctx.x + ctx.tw), w));
     if (!ctx.has_alpha || ctx.is_alpha) {
-      exporter->export_data(out_tile, pad_desc);
+      ret = exporter->export_data(out_tile, pad_desc);
+      if (!ret.empty()) {
+        LOG(FATAL) << "Unexpected error exporting pixel: " << ret;
+      }
     }
 
     VLOG(1) << "Tile "

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@ std::string pixel_exporter_cpu_crop::fetch_color(md_view<const float, 3> src, cu`
`30`	`30`	`return std::string("CUDA error: ") + cudaGetErrorName(err);`
`31`	`31`	`}`
`32`	`32`
	`33`	`+ current_buffer_shape = src.shape;`
`33`	`34`	`return "";`
`34`	`35`	`}`
`35`	`36`
`@@ -40,6 +41,10 @@ std::string pixel_exporter_cpu_crop::fetch_alpha(md_view<const float, 3> src, cu`
`40`	`41`	`return "dimension too big";`
`41`	`42`	`}`
`42`	`43`
	`44`	`+ if (current_buffer_shape != src.shape) {`
	`45`	`+ return "incompatible color buffer shape";`
	`46`	`+ }`
	`47`	`+`
`43`	`48`	`auto err = cudaMemcpyAsync(buffer_alpha.get(), src.data, h * w * 4, cudaMemcpyDeviceToHost, stream);`
`44`	`49`	`if (err != cudaSuccess) {`
`45`	`50`	`return std::string("CUDA error: ") + cudaGetErrorName(err);`