Commit 376d6d2

Revert "Update function params and corresponding usages. " (#2596)
Revert "Update function params and corresponding usages." This reverts commit 9da7ad5.
1 parent 75fc571 commit 376d6d2

File tree

3 files changed: +16, -30 lines

torchao/experimental/kernels/cpu/aarch64/linear/groupwise_lowbit_weight/groupwise_lowbit_weight_lut.h

Lines changed: 4 additions & 26 deletions
@@ -44,17 +44,7 @@ chunked and interleaved during the packing process.
  * @param input Pointer to the source activation matrix (float32, row-major).
  */
 template <int mr_, int kr_, int sr_>
-inline void pack_activations(
-    float* output,
-    int m,
-    int k,
-    const float* input,
-    int mr,
-    int kr,
-    int sr) {
-  (void)mr; // unused
-  (void)kr; // unused
-  (void)sr; // unused
+inline void pack_activations(float* output, int m, int k, const float* input) {
   activation_packing::pack_activations<mr_, kr_, sr_>(output, m, k, input);
 }
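
For orientation, here is a minimal caller-side sketch of the reverted API. It reuses the kernel_api namespace alias and helper names from the test file below; the tile values and buffer names are illustrative, not taken from the kernel. The point of the revert is that the mr/kr/sr tiling factors are supplied only as template arguments.

// Sketch (assumed names and values): pack an m x k float activation matrix.
constexpr int mr_ = 1, kr_ = 32, sr_ = 8;  // illustrative tile sizes
std::vector<float> packed_activations(
    kernel_api::packed_activations_size(m, k, mr_, kr_, sr_));
kernel_api::pack_activations<mr_, kr_, sr_>(
    packed_activations.data(), m, k, source_activations.data());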

@@ -110,7 +100,7 @@ row-major).
  * @param bias Pointer to the bias vector (float32, row-major).
  */
 template <int weight_nbit_, int nr_, int kr_, int sr_>
-void pack_weights(
+void pack_weights_for_groupwise_lut_kernel(
     /*output*/
     void* packed_weights_ptr,
     /*inputs*/
@@ -123,14 +113,7 @@ void pack_weights(
     int lut_group_size,
     bool has_scales,
     bool has_bias,
-    const float* bias,
-    int nr,
-    int kr,
-    int sr) {
-  (void)nr; // unused
-  (void)kr; // unused
-  (void)sr; // unused
-
+    const float* bias) {
   weight_packing::pack_weights<weight_nbit_, nr_, kr_, sr_>(
       packed_weights_ptr,
       weight_qvals_indices,
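
Consolidating the test hunks further down, the reverted weight-packing call looks roughly like the sketch below; the elided middle arguments are left elided exactly as in the test, and kernel_api and test_case are names the test already uses.

// Sketch: nr_/kr_/sr_ are now template parameters only, and the bias pointer
// is again the final runtime argument.
kernel_api::pack_weights_for_groupwise_lut_kernel<weight_nbit_, nr_, kr_, sr_>(
    packed_weights.data(),
    test_case.weight_qval_indices.data(),
    test_case.weight_scales.data(),
    /* ...remaining inputs as in the test... */
    flat_lut_group_size,
    has_scales_,
    has_bias,
    test_case.bias.data());
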
@@ -207,12 +190,7 @@ inline void groupwise_lowbit_weight_lut_kernel_1x4x32(
  * @param k The K dimension (width) of the activation matrix.
  * @return The byte offset from the start of the buffer.
  */
-inline size_t
-packed_activations_offset(int m_idx, int k, int mr, int kr, int sr) {
-  (void)mr; // unused
-  (void)kr; // unused
-  (void)sr; // unused
-
+inline size_t packed_activations_offset(int m_idx, int k) {
   // For a simple padded row-major format, the offset is just m_idx * k.
   return sizeof(float) * m_idx * k;
 }
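
As a quick numeric check of the reverted helper (the kernel_api qualifier and the concrete numbers are illustrative): the offset now depends only on the row index and k.

// Row m_idx of a padded row-major buffer starts at sizeof(float) * m_idx * k.
// Example: m_idx = 4, k = 128  ->  4 * 4 * 128 = 2048 bytes.
size_t offset = kernel_api::packed_activations_offset(/*m_idx=*/4, /*k=*/128);
assert(offset == 2048);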

torchao/experimental/kernels/cpu/aarch64/tests/test_lut.cpp

Lines changed: 3 additions & 3 deletions
@@ -71,7 +71,7 @@ void test_groupwise_lowbit_lut_kernel(
   std::vector<float> packed_activations_buffer(
       kernel_api::packed_activations_size(m, k, mr_, kr_, sr_));
   kernel_api::pack_activations<mr_, kr_, sr_>(
-      packed_activations_buffer.data(), m, k, source_activations.data(), mr_, kr_, sr_);
+      packed_activations_buffer.data(), m, k, source_activations.data());
   // 3. Pack Weights
   std::vector<char> packed_weights(kernel_api::packed_weights_size(
       n,
@@ -84,7 +84,7 @@ void test_groupwise_lowbit_lut_kernel(
       kr_,
       sr_));
   kernel_api::
-      pack_weights<weight_nbit_, nr_, kr_, sr_>(
+      pack_weights_for_groupwise_lut_kernel<weight_nbit_, nr_, kr_, sr_>(
           packed_weights.data(),
           test_case.weight_qval_indices.data(),
           test_case.weight_scales.data(),
@@ -95,7 +95,7 @@ void test_groupwise_lowbit_lut_kernel(
           flat_lut_group_size,
           has_scales_,
           has_bias,
-          test_case.bias.data(), nr_, kr_, sr_);
+          test_case.bias.data());

   // 4. Run the kernel
   std::vector<float> output(m * n);

torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h

Lines changed: 9 additions & 1 deletion
@@ -640,10 +640,11 @@ struct groupwise_lowbit_weight_lut_test_case {
     const int total_weights = n * k;
     // Frequencies are controlled by their group sizes.
     assert(total_weights % scale_group_size == 0);
+    assert(total_weights % lut_group_size == 0);

     // The number of unique scales/LUTs is derived directly from their group size.
     const int num_scales = total_weights / scale_group_size;
-    const int num_luts = (total_weights + lut_group_size - 1) / lut_group_size;
+    const int num_luts = total_weights / lut_group_size;
     const int lut_size = 1 << weight_nbit;
     std::mt19937 gen(std::random_device{}());
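
A small worked example of the restored computation (values are illustrative): once both group sizes are asserted to divide total_weights, plain integer division is exact and the previous ceiling form is unnecessary.

const int total_weights = 8 * 128;                        // n * k = 1024
const int scale_group_size = 32, lut_group_size = 256;
assert(total_weights % scale_group_size == 0);            // 1024 % 32 == 0
assert(total_weights % lut_group_size == 0);              // 1024 % 256 == 0
const int num_scales = total_weights / scale_group_size;  // 32 unique scales
const int num_luts = total_weights / lut_group_size;      // 4 unique LUTs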

@@ -725,6 +726,9 @@ struct groupwise_lowbit_weight_lut_test_case {
         int weight_nbit, bool has_scales,
         bool has_bias, bool has_clamp) {

+    std::cout << "[Generator Info] Using 'Per-Group' model.\n"
+              << "  - Both scales and LUTs will switch every " << group_size << " weights." << std::endl;
+
     // Just call the decoupled generator with the same group size for both.
     return _generate_master(
         m, k, n,
@@ -744,6 +748,10 @@ struct groupwise_lowbit_weight_lut_test_case {
       int scale_group_size, int lut_group_size, int weight_nbit, bool has_scales,
       bool has_bias, bool has_clamp) {

+    std::cout << "[Generator Info] Using 'Decoupled Grouping' model.\n"
+              << "  - Scales will switch every " << scale_group_size << " weights.\n"
+              << "  - LUTs will switch every " << lut_group_size << " weights." << std::endl;
+
     return _generate_master(
         m, k, n,
         scale_group_size, lut_group_size,
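
To make the two logged generator models concrete, here is an illustrative comparison (the numbers are not taken from the test suite):

// 'Per-Group' model: one group size drives both scales and LUTs.
const int total_weights = 4 * 256;                              // 1024
const int group_size = 32;
const int per_group_scales = total_weights / group_size;        // 32
const int per_group_luts = total_weights / group_size;          // 32
// 'Decoupled Grouping' model: scales and LUTs switch independently.
const int scale_group_size = 32, lut_group_size = 256;
const int decoupled_scales = total_weights / scale_group_size;  // 32
const int decoupled_luts = total_weights / lut_group_size;      // 4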
