93 commits
d290f17
Initial changes
turneram Feb 5, 2025
9a43f78
Fix invalid graph
turneram Feb 6, 2025
2243f8c
Init
turneram Feb 10, 2025
c4480d5
Fix benchmarking fault
turneram Feb 14, 2025
2842082
Test
turneram Mar 4, 2025
df1e354
Fix sl>1
turneram Mar 5, 2025
1e4d065
Test
turneram Mar 5, 2025
ed53a45
Additional tests
turneram Mar 10, 2025
00930d2
Update mlir commit
turneram Mar 10, 2025
86885a2
Use int32 in test
turneram Mar 10, 2025
81a1fc1
Remove debug print
turneram Mar 14, 2025
9211184
Make concat_past_present work on k,v separately
turneram Mar 20, 2025
6e56851
Use migx ops instead of mlir fusion
turneram Mar 21, 2025
dc72adb
Use inputs(5)
turneram Mar 30, 2025
36b7fab
Update mlir commit hash
turneram Mar 31, 2025
5219097
Clean up and make prompt test use inputs that reflects e2e data
turneram Mar 31, 2025
ed35af2
Formatting
turneram Mar 31, 2025
f850e6b
Update tests
turneram Mar 31, 2025
c1ce286
Merge remote-tracking branch 'origin/develop' into mlir-kv-cache
turneram Mar 31, 2025
38951a7
Formatting
turneram Mar 31, 2025
362cd24
Add counter and supporting differing num_heads
turneram May 5, 2025
ed70b01
Merge remote-tracking branch 'origin/develop' into mlir-kv-cache
turneram May 5, 2025
4104106
Merge branch 'develop' into mlir-kv-cache
apwojcik May 12, 2025
9e0c1b9
Use greater and remove greater_or_equal
turneram May 13, 2025
c375b7e
Merge remote-tracking branch 'origin/mlir-kv-cache' into mlir-kv-cache
turneram May 13, 2025
1c968fe
Support new mlir causal masking and update tests
turneram May 22, 2025
eb1502a
Formatting
turneram May 22, 2025
e2484af
License stamp
turneram May 22, 2025
24e7c17
Merge remote-tracking branch 'origin/develop' into mlir-kv-cache
turneram May 22, 2025
1429641
Remove test with issue to see if error is specific to this case
turneram Jun 2, 2025
c5817d3
Same thing; next test
turneram Jun 3, 2025
13576b6
Add back tests and add trace_eval
turneram Jun 3, 2025
0bcc470
Remove trace_eval
turneram Jun 9, 2025
c5451d9
Merge branch 'develop' into mlir-kv-cache
apwojcik Jun 11, 2025
5b15541
fix group query attention tests compilation on Windows
apwojcik Jun 11, 2025
2e18f42
Add group op to use with fused attention
turneram Jun 11, 2025
9c47226
Rename tests
turneram Jun 11, 2025
00b5222
Merge branch 'mlir-kv-cache' of https://github.com/ROCm/AMDMIGraphX i…
turneram Jun 11, 2025
70ac554
Merge remote-tracking branch 'origin/develop' into mlir-kv-cache
turneram Jun 11, 2025
8096832
Remove kv_cache_attention prefuse op
turneram Jun 11, 2025
8932831
Update tests names within files
turneram Jun 11, 2025
1867ecf
Tidy
turneram Jun 12, 2025
a7ab512
Formatting
turneram Jun 12, 2025
b6ea84a
Formatting
turneram Jun 12, 2025
7fdc47c
Add group shape test
turneram Jun 12, 2025
f944bc9
Formatting
turneram Jun 12, 2025
1029825
Update set_fill_map to look for greater instead of greater_or_equal
turneram Jun 13, 2025
13bd253
fix cppcheck for Windows parts
apwojcik Jun 13, 2025
19d28bc
remove unused macros
apwojcik Jun 16, 2025
8c41244
Avoid calling as_standard on tuple_type
turneram Jun 16, 2025
6a464e7
Formatting
turneram Jun 16, 2025
779a436
Merge branch 'mlir-kv-cache' of https://github.com/ROCm/AMDMIGraphX i…
turneram Jun 16, 2025
ace7cbd
Merge remote-tracking branch 'origin/develop' into mlir-kv-cache
turneram Jun 16, 2025
9ee8f74
Add multi-output case to group_op test
turneram Jun 16, 2025
d55bd61
Formatting
turneram Jun 16, 2025
18a2228
Merge branch 'develop' into mlir-kv-cache
apwojcik Jun 17, 2025
c7e9fcb
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Jul 22, 2025
3da1159
Initial changes
turneram Sep 16, 2025
74cb33c
Formatting
turneram Sep 16, 2025
3302493
Update mlir commit
turneram Oct 7, 2025
f7ce9b2
Cleanup
turneram Oct 7, 2025
e9a1963
Formatting
turneram Oct 7, 2025
9c6d096
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Oct 7, 2025
e74aa13
Update matcher for fp32 softmax
turneram Oct 8, 2025
ed7e1b8
Formatting
turneram Oct 8, 2025
c00a45b
Cleanup and verify tests
turneram Oct 15, 2025
cfb3641
Formatting
turneram Oct 15, 2025
51891a4
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Oct 15, 2025
4e7cc40
Formatting
turneram Oct 15, 2025
e06f3cd
More tests
turneram Oct 15, 2025
88ff18f
Formatting
turneram Oct 15, 2025
33a3acd
Formatting
turneram Oct 15, 2025
e9d7257
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Oct 16, 2025
087c049
Add local attention support
turneram Oct 16, 2025
ec0747c
Formatting
turneram Oct 16, 2025
0eaa96c
Add onnx verify tests
turneram Oct 17, 2025
cc0a2fe
Formatting
turneram Oct 17, 2025
bd74ab9
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Oct 17, 2025
5aa7cf6
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Oct 22, 2025
d13f81c
Make generate on 7.0
turneram Oct 24, 2025
28aeebf
Fix typo
turneram Oct 24, 2025
a101ee8
Add eof newline
turneram Oct 24, 2025
7198169
Formatting
turneram Oct 24, 2025
63f111a
Licensing
turneram Oct 24, 2025
604a373
Fix some tests
turneram Oct 27, 2025
8ac0469
Formatting
turneram Oct 27, 2025
dcb3a4f
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Oct 27, 2025
fbe05d6
Fix fuse_attention tests
turneram Oct 27, 2025
8ccb3e9
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Oct 27, 2025
65367b3
Formatting
turneram Oct 27, 2025
39eb8c6
Tidy, cppcheck, and copilot
turneram Oct 28, 2025
91af07b
Formatting
turneram Oct 28, 2025
2fd814e
Merge remote-tracking branch 'origin/develop' into refactor-gqa
turneram Oct 28, 2025
2 changes: 1 addition & 1 deletion requirements.txt
@@ -29,4 +29,4 @@ pybind/pybind11@3e9dfa2866941655c56877882565e7577de6fc7b --build
msgpack/[email protected] -DMSGPACK_BUILD_TESTS=Off -DMSGPACK_BUILD_EXAMPLES=Off -DCMAKE_POLICY_VERSION_MINIMUM=3.5
[email protected] -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCm/composable_kernel@b7775add2d28251674d81e220cd4a857b90b997a -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCm/rocMLIR@fe6da4db4d6f0da8c74e28a0787cfbb4a026550a -DBUILD_FAT_LIBROCKCOMPILER=On -DLLVM_INCLUDE_TESTS=Off
ROCm/rocMLIR@0100c11941426b7ad6f0724d51025fa33d227821 -DBUILD_FAT_LIBROCKCOMPILER=On -DLLVM_INCLUDE_TESTS=Off
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -190,6 +190,7 @@ register_migraphx_ops(
ceil
clip
concat
concat_past_present
contiguous
convert
convolution
@@ -212,6 +213,7 @@
gather
gathernd
get_tuple_elem
gqa_rotary_embedding
greater
group_query_attention
group
179 changes: 178 additions & 1 deletion src/fuse_attention.cpp
@@ -209,14 +209,191 @@
}
};

struct find_kv_cache_attention
{
std::size_t* counter;

auto matcher() const
{
static const std::unordered_set<std::string> skip_set = {
"multibroadcast", "reshape", "unsqueeze"};

auto transpose1 = match::skip(match::name(skip_set))(match::name("transpose")(
match::arg(0)(match::skip(match::name(skip_set))(match::name("concat_past_present"))
.bind("pres_k"))));
auto gemm1 =
match::name("dot")(match::arg(0)(match::name("slice")), match::arg(1)(transpose1));
auto scale = match::name("mul")(match::any_arg(0, 1)(gemm1));
auto broadcasted_const = match::name("multibroadcast")(match::arg(0)(match::is_constant()));
auto attn_scores = match::any_of(scale, gemm1);
auto causal_mask =
match::name("where")(match::arg(0)(broadcasted_const), match::arg(2)(attn_scores));
auto greater = match::name("multibroadcast")(match::arg(0)(match::name("convert")(
match::arg(0)(match::name("greater")(match::arg(1)(match::any().bind("total_sl")))))));

Check warning on line 232 in src/fuse_attention.cpp (GitHub Actions / cppcheck): style: Too many nested parentheses can affect readability; consider using variables instead. [migraphx-MatcherNestedParentheses]
auto where = match::name("where")(match::arg(0)(greater),
match::arg(2)(match::any_of(causal_mask, scale, gemm1)));
auto softmax = match::skip(match::name("convert"))(
match::softmax_input(match::skip(match::name("convert"))(where)));
auto gemm2 = match::name("dot")(
match::arg(0)(softmax),
match::arg(1)(match::skip(match::name(skip_set))(match::name("concat_past_present"))
.bind("pres_v")));
auto transpose2 = match::name("transpose")(match::arg(0)(gemm2));
return match::name("reshape")(match::arg(0)(transpose2));
}
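// Rough sketch of the subgraph the matcher above targets (multibroadcast/reshape/unsqueeze
// hops are skipped via skip_set; the scale and causal-mask stages are optional):
//
//   pres_k = concat_past_present(...)                            // cached keys
//   scores = dot(slice(q), transpose(pres_k))
//   scores = mul(scores, scale)                                  // optional scaling
//   scores = where(multibroadcast(const), _, scores)             // optional causal mask
//   scores = where(multibroadcast(convert(greater(_, total_sl))), _, scores)
//   probs  = softmax(scores)
//   out    = dot(probs, concat_past_present(...))                // pres_v: cached values
//   result = reshape(transpose(out))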

std::string get_count() const { return std::to_string((*counter)++); }

std::unordered_map<instruction_ref, instruction_ref>
invert_map_ins(const std::unordered_map<instruction_ref, instruction_ref>& map_ins) const
{
std::unordered_map<instruction_ref, instruction_ref> inverse_map;
for(auto const& [key, value] : map_ins)
{
assert(not contains(inverse_map, value));
inverse_map[value] = key;
}
return inverse_map;
}

// Collect the instructions that form the attention subgraph: walk inputs backwards from
// `end`, keeping pointwise ops and ops in valid_attn_ops (stopping the walk at `start`),
// then return them sorted in program order so they can be fused as a single unit
std::vector<instruction_ref>
get_attn_instructions(module& m, instruction_ref start, instruction_ref end) const
{
std::queue<instruction_ref> inputs;
std::unordered_set<instruction_ref> inss;
inputs.push(end);

static const std::unordered_set<std::string> valid_attn_ops = {"softmax",
"broadcast",
"dot",
"slice",
"transpose",
"greater",
"convert",
"where",
"reshape",
"reduce_sum",
"reduce_max",
"broadcast",
"multibroadcast",
"@literal",
"unsqueeze"};

auto is_valid_attn_op = [&](auto i) {
return i->get_operator().attributes().get("pointwise", false) or
contains(valid_attn_ops, i->get_operator().name()) or i == start or i == end;
};

while(not inputs.empty())
{
auto current_inp = inputs.front();
inputs.pop();

if(is_valid_attn_op(current_inp) and inss.insert(current_inp).second and
current_inp != start)
{
for(auto i : current_inp->inputs())
{
inputs.push(i);
}
}
}
std::vector<instruction_ref> sorted_inss(inss.begin(), inss.end());
std::sort(
sorted_inss.begin(), sorted_inss.end(), [&](instruction_ref x, instruction_ref y) {
return std::distance(m.begin(), x) < std::distance(m.begin(), y);
});
return sorted_inss;
}

// Extract the matched attention subgraph into its own submodule and replace it in the
// parent module with a single "group" instruction tagged "kv_cache_attention"
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto total_sl = r.instructions["total_sl"];
auto reshape = r.result;

// Capture all instructions that are part of the attention op
auto attn_inss = get_attn_instructions(mpm.get_module(), total_sl, reshape);

// Add captured instructions to new submodule
module m_attn;
std::unordered_map<instruction_ref, instruction_ref> map_mm_to_mattn;
auto attn_outs = m_attn.fuse(attn_inss, &map_mm_to_mattn);

// Rewrite constant literals that are effectively 1-d values broadcast along the last
// axis (all strides zero except an innermost stride of 1) into a smaller literal plus
// an explicit multibroadcast
for(auto ins : iterator_for(m_attn))
{
if(ins->can_eval())
{
auto lit_s = ins->get_shape();
auto strides = lit_s.strides();
if(strides.size() == 4 and
std::all_of(
strides.begin(), strides.end() - 1, [](auto s) { return s == 0; }) and
strides.back() == 1)
{
auto new_lit = m_attn.add_literal(
literal{shape{lit_s.type(), {lit_s.lens().back()}}, ins->eval().data()});
m_attn.replace_instruction(
ins, make_op("multibroadcast", {{"out_lens", lit_s.lens()}}), {new_lit});
}
}
}
dead_code_elimination{}.apply(m_attn);

// Define outputs based on instructions that are used elsewhere in the graph
std::vector<instruction_ref> required_outputs;
std::copy_if(
attn_inss.begin(), attn_inss.end(), std::back_inserter(required_outputs), [&](auto i) {
return not std::all_of(i->outputs().begin(), i->outputs().end(), [&](auto o) {
return contains(attn_inss, o);
});
});

assert(not required_outputs.empty());

// Find corresponding output instructions in m_attn
std::vector<instruction_ref> m_attn_outputs;
std::transform(required_outputs.begin(),
required_outputs.end(),
std::back_inserter(m_attn_outputs),
[&](auto i) { return map_mm_to_mattn.at(i); });
m_attn.add_return({m_attn_outputs.back()});

// Define inputs to m_attn
auto map_mattn_to_mm = invert_map_ins(map_mm_to_mattn);
auto new_inputs = m_attn.get_inputs(map_mattn_to_mm);

module_ref mpm_attn = mpm.create_module("attn" + get_count(), std::move(m_attn));
mpm_attn->set_bypass();

// Construct group op with the attention module
auto group_ins =
mpm.get_module().insert_instruction(required_outputs.back(),
make_op("group", {{"tag", "kv_cache_attention"}}),
new_inputs,
{mpm_attn});

mpm.get_module().replace_instruction(required_outputs.back(), group_ins);
}
};

} // namespace

void fuse_attention::apply(module_pass_manager& mpm) const
{
std::size_t counter = 0;
match::find_matches(mpm, find_attention{.counter = &counter});

// Fuse kv-cache attention by default
match::find_matches(mpm, find_kv_cache_attention{.counter = &counter});
mpm.get_module().sort();
mpm.run_pass(dead_code_elimination{});

// Only fuse plain attention when requested
if(attn_enabled)
{
match::find_matches(mpm, find_attention{.counter = &counter});
mpm.get_module().sort();
mpm.run_pass(dead_code_elimination{});
}
}

} // namespace MIGRAPHX_INLINE_NS
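The updated apply() fuses kv-cache attention by default and only runs the plain find_attention matcher when attn_enabled is set. A minimal sketch of driving the pass in test style, assuming the run_passes helper from pass_manager.hpp; the helper name and flag wiring below are illustrative:

#include <migraphx/fuse_attention.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/program.hpp>

// Hypothetical helper: run attention fusion over a program, optionally also
// fusing plain (non-kv-cache) attention patterns.
void run_attention_fusion(migraphx::program& p, bool include_plain_attention)
{
    migraphx::run_passes(p,
                         {migraphx::fuse_attention{.attn_enabled = include_plain_attention},
                          migraphx::dead_code_elimination{}});
}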
2 changes: 2 additions & 0 deletions src/include/migraphx/fuse_attention.hpp
@@ -35,6 +35,8 @@ struct module_pass_manager;

struct MIGRAPHX_EXPORT fuse_attention
{
bool attn_enabled = false;

std::string name() const { return "fuse_attention"; }
void apply(module_pass_manager& mpm) const;
};
161 changes: 161 additions & 0 deletions src/include/migraphx/op/concat_past_present.hpp
@@ -0,0 +1,161 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_CONCAT_PAST_PRESENT_HPP
#define MIGRAPHX_GUARD_OPERATORS_CONCAT_PAST_PRESENT_HPP

#include <migraphx/check_shapes.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/gemm.hpp>
#include <migraphx/argument.hpp>
#include <fstream>
#include <iostream>
#include <iomanip>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {

struct cache_parameters
{
std::size_t batch_size = 0; // Batch size used by input
std::size_t sequence_length = 0; // Sequence length used by input
std::size_t head_size = 0; // Head size
std::size_t num_heads = 0; // num_heads = hidden_size / head_size
std::size_t seqlen_present_kv_cache = 0; // Sequence length of present kv-cache
};

struct concat_past_present
{
std::size_t kv_num_heads = 0;
std::size_t num_heads = 1;

template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.kv_num_heads, "kv_num_heads"), f(self.num_heads, "num_heads"));
}

std::string name() const { return "concat_past_present"; }

shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
return inputs.back();
}

template <class T>
void copy_data(T destination, const T source, std::size_t n) const
{
par_for(n, [&](auto i) { destination[i] = source[i]; });
}

template <typename T>
T concat_state_chunk(const T chunk,
const T present,
std::size_t present_buff_chunk_length,
std::size_t past_chunk_length,
std::size_t new_chunk_length,
std::ptrdiff_t i) const
{
T start = present + i * present_buff_chunk_length;
copy_data(start + past_chunk_length, chunk, new_chunk_length);
return start;
}

template <class T, class U>
void
update_cache(T past_key, const U seqlens_k, const T present_key, cache_parameters params) const
{
const std::size_t batch_size = params.batch_size;
const std::size_t sequence_length = params.sequence_length;
const std::size_t head_size = params.head_size;
const std::size_t past_buffer_sequence_length = params.seqlen_present_kv_cache;
const std::size_t present_buffer_sequence_length = past_buffer_sequence_length;

const bool is_prompt = sequence_length != 1;
const std::size_t packed_batch_stride =
(num_heads + 2 * kv_num_heads) * sequence_length * head_size;
const std::size_t kv_num_heads_factor = num_heads / kv_num_heads;
const std::size_t kv_input_chunk_length = sequence_length * head_size; // L x H
const std::size_t present_buff_chunk_length =
present_buffer_sequence_length * head_size; // T x H

const std::size_t loop_len = batch_size * num_heads;

par_for(loop_len, [&](const auto i) {
const std::size_t batch_index = i / num_heads;
const std::size_t head_index = i % num_heads;
const std::size_t past_seqlen =
sequence_length == 1 ? seqlens_k[batch_index] : past_buffer_sequence_length;
const std::size_t past_chunk_length = is_prompt ? 0 : past_seqlen * head_size;

auto current = present_key + packed_batch_stride * batch_index +
kv_input_chunk_length * (head_index / kv_num_heads_factor);
concat_state_chunk(current,
past_key,
present_buff_chunk_length,
past_chunk_length,
kv_input_chunk_length,
i / kv_num_heads_factor);
});
}
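// Worked example with illustrative numbers (not taken from a real model): for
// num_heads = 2, kv_num_heads = 1, head_size = 4, sequence_length = 1 and
// seqlens_k[batch] = 5, we get kv_num_heads_factor = 2, kv_input_chunk_length = 4 and
// past_chunk_length = 20, so both query heads map to the same kv-head chunk and the
// 4 new values are appended at element offset 20 of that chunk in the cache.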

argument compute(const shape& /* output_shape */, std::vector<argument> args) const
{
auto present = args[0];
auto seqlens = args[1];
auto past = args[2];
auto present_shape = present.get_shape();
const auto& present_lens = present_shape.lens();
const std::size_t batch_size = present_lens[0];
const std::size_t sequence_length = present_lens[2];
auto past_kv_shape = past.get_shape();
const auto& past_kv_lens = past_kv_shape.lens();
auto past_sequence_length = past_kv_lens[2];
std::size_t head_size = present_lens[3];

cache_parameters cache_params = {};
cache_params.batch_size = batch_size;
cache_params.sequence_length = sequence_length;
cache_params.head_size = head_size;
cache_params.num_heads = num_heads;
cache_params.seqlen_present_kv_cache = past_sequence_length;

visit_all(past, present)([&](auto past_kv, auto present_kv) {
visit_all(seqlens)([&](auto seqlens_kv) {
update_cache(past_kv.begin(), seqlens_kv.begin(), present_kv.begin(), cache_params);
});
});

return past;
}
};

} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
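For reference, a minimal sketch of constructing the operator above with make_op. The attribute names follow reflect() and the argument order follows compute() ({present, seqlens_k, past}); the head counts and helper name are illustrative assumptions:

#include <migraphx/module.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction_ref.hpp>

// Hypothetical helper: append the new key (or value) states to the past cache.
// The result has the shape of the past cache, since compute_shape() returns inputs.back().
migraphx::instruction_ref append_kv_cache(migraphx::module& m,
                                           migraphx::instruction_ref present,
                                           migraphx::instruction_ref seqlens_k,
                                           migraphx::instruction_ref past)
{
    return m.add_instruction(
        migraphx::make_op("concat_past_present", {{"kv_num_heads", 8}, {"num_heads", 32}}),
        present,
        seqlens_k,
        past);
}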