@@ -209,14 +209,193 @@ struct find_attention
     }
 };
 
+struct find_kv_cache_attention
+{
+    std::size_t* counter;
+
+    auto matcher() const
+    {
+        static const std::unordered_set<std::string> skip_set = {
+            "multibroadcast", "reshape", "unsqueeze"};
+
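+        // Matches the kv-cache attention subgraph: sliced queries multiplied with the
+        // transposed keys from concat_past_present, optionally scaled, masked by a broadcast
+        // causal mask and a sequence-length mask built from greater(..., total_sl), run
+        // through softmax, multiplied with the values from concat_past_present, and finished
+        // with a transpose + reshape.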
+        auto keys =
+            match::skip(match::name(skip_set))(match::name("concat_past_present")).bind("pres_k");
+        auto k_transpose =
+            match::skip(match::name(skip_set))(match::name("transpose")(match::arg(0)(keys)));
+        auto queries = match::name("slice");
+        auto gemm1 = match::name("dot")(match::arg(0)(queries), match::arg(1)(k_transpose));
+        auto scale = match::name("mul")(match::any_arg(0, 1)(gemm1));
+        auto broadcasted_const = match::name("multibroadcast")(match::arg(0)(match::is_constant()));
+        auto attn_scores = match::any_of(scale, gemm1);
+        auto causal_mask =
+            match::name("where")(match::arg(0)(broadcasted_const), match::arg(2)(attn_scores));
+        auto greater = match::name("greater")(match::arg(1)(match::any().bind("total_sl")));
+        auto conv_greater =
+            match::skip(match::name("unsqueeze"))(match::name("convert")(match::arg(0)(greater)));
+        auto bc_greater = match::name("multibroadcast")(match::arg(0)(conv_greater));
+        auto mask = match::name("where")(match::arg(0)(bc_greater),
+                                         match::arg(2)(match::any_of(causal_mask, scale, gemm1)));
+        auto attn_probabilities = match::skip(match::name("convert"))(
+            match::softmax_input(match::skip(match::name("convert"))(mask)));
+        auto values =
+            match::skip(match::name(skip_set))(match::name("concat_past_present")).bind("pres_v");
+        auto gemm2 = match::name("dot")(match::arg(0)(attn_probabilities), match::arg(1)(values));
+        auto transpose_out = match::name("transpose")(match::arg(0)(gemm2));
+        return match::name("reshape")(match::arg(0)(transpose_out));
+    }
+
+    std::string get_count() const { return std::to_string((*counter)++); }
+
+    std::unordered_map<instruction_ref, instruction_ref>
+    invert_map_ins(const std::unordered_map<instruction_ref, instruction_ref>& map_ins) const
+    {
+        std::unordered_map<instruction_ref, instruction_ref> inverse_map;
+        for(auto const& [key, value] : map_ins)
+        {
+            assert(not contains(inverse_map, value));
+            inverse_map[value] = key;
+        }
+        return inverse_map;
+    }
+
+    std::vector<instruction_ref>
+    get_attn_instructions(module& m, instruction_ref start, instruction_ref end) const
+    {
+        std::queue<instruction_ref> inputs;
+        std::unordered_set<instruction_ref> inss;
+        inputs.push(end);
+
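+        // Ops (in addition to any pointwise op) that are allowed to be absorbed into the
+        // fused attention submodule.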
+        static const std::unordered_set<std::string> valid_attn_ops = {"softmax",
+                                                                       "broadcast",
+                                                                       "dot",
+                                                                       "slice",
+                                                                       "transpose",
+                                                                       "greater",
+                                                                       "convert",
+                                                                       "where",
+                                                                       "reshape",
+                                                                       "reduce_sum",
+                                                                       "reduce_max",
+                                                                       "broadcast",
+                                                                       "multibroadcast",
+                                                                       "@literal",
+                                                                       "unsqueeze"};
+
+        auto is_valid_attn_op = [&](auto i) {
+            return i->get_operator().attributes().get("pointwise", false) or
+                   contains(valid_attn_ops, i->get_operator().name()) or i == start or i == end;
+        };
+
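+        // Walk the graph backwards from the output, collecting valid attention instructions;
+        // traversal stops at invalid ops and at the start instruction (the bound total_sl).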
+        while(not inputs.empty())
+        {
+            auto current_inp = inputs.front();
+            inputs.pop();
+
+            if(is_valid_attn_op(current_inp) and inss.insert(current_inp).second and
+               current_inp != start)
+            {
+                for(auto i : current_inp->inputs())
+                {
+                    inputs.push(i);
+                }
+            }
+        }
+        std::vector<instruction_ref> sorted_inss(inss.begin(), inss.end());
+        std::sort(
+            sorted_inss.begin(), sorted_inss.end(), [&](instruction_ref x, instruction_ref y) {
+                return std::distance(m.begin(), x) < std::distance(m.begin(), y);
+            });
+        return sorted_inss;
+    }
+
+    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
+    {
+        auto total_sl = r.instructions["total_sl"];
+        auto reshape = r.result;
+
+        // Capture all instructions that are part of the attention op
+        auto attn_inss = get_attn_instructions(mpm.get_module(), total_sl, reshape);
+
+        // Add captured instructions to a new submodule
+        module m_attn;
+        std::unordered_map<instruction_ref, instruction_ref> map_mm_to_mattn;
+        auto attn_outs = m_attn.fuse(attn_inss, &map_mm_to_mattn);
+
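+        // Literals broadcast along only the last axis (strides {0, 0, 0, 1}) are re-added as
+        // 1-D literals followed by an explicit multibroadcast so the submodule does not
+        // capture the full broadcasted tensor.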
+        for(auto ins : iterator_for(m_attn))
+        {
+            if(ins->can_eval())
+            {
+                auto lit_s = ins->get_shape();
+                auto strides = lit_s.strides();
+                if(strides.size() == 4 and
+                   std::all_of(
+                       strides.begin(), strides.end() - 1, [](auto s) { return s == 0; }) and
+                   strides.back() == 1)
+                {
+                    auto new_lit = m_attn.add_literal(
+                        literal{shape{lit_s.type(), {lit_s.lens().back()}}, ins->eval().data()});
+                    m_attn.replace_instruction(
+                        ins, make_op("multibroadcast", {{"out_lens", lit_s.lens()}}), {new_lit});
+                }
+            }
+        }
+        dead_code_elimination{}.apply(m_attn);
+
+        // Define outputs based on instructions that are used elsewhere in the graph
+        std::vector<instruction_ref> required_outputs;
+        std::copy_if(
+            attn_inss.begin(), attn_inss.end(), std::back_inserter(required_outputs), [&](auto i) {
+                return not std::all_of(i->outputs().begin(), i->outputs().end(), [&](auto o) {
+                    return contains(attn_inss, o);
+                });
+            });
+
+        assert(not required_outputs.empty());
+
+        // Find corresponding output instructions in m_attn
+        std::vector<instruction_ref> m_attn_outputs;
+        std::transform(required_outputs.begin(),
+                       required_outputs.end(),
+                       std::back_inserter(m_attn_outputs),
+                       [&](auto i) { return map_mm_to_mattn.at(i); });
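+        // Only the last required output (the final reshape) is returned from the submodule.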
+        m_attn.add_return({m_attn_outputs.back()});
+
+        // Define inputs to m_attn
+        auto map_mattn_to_mm = invert_map_ins(map_mm_to_mattn);
+        auto new_inputs = m_attn.get_inputs(map_mattn_to_mm);
+
+        module_ref mpm_attn = mpm.create_module("attn" + get_count(), std::move(m_attn));
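+        // Mark the submodule as bypass so subsequent module passes skip over it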
+        mpm_attn->set_bypass();
+
+        // Construct group op with the attention module
+        auto group_ins =
+            mpm.get_module().insert_instruction(required_outputs.back(),
+                                                make_op("group", {{"tag", "kv_cache_attention"}}),
+                                                new_inputs,
+                                                {mpm_attn});
+
+        mpm.get_module().replace_instruction(required_outputs.back(), group_ins);
+    }
+};
+
 } // namespace
 
 void fuse_attention::apply(module_pass_manager& mpm) const
 {
     std::size_t counter = 0;
-    match::find_matches(mpm, find_attention{.counter = &counter});
+
+    // Fuse kv-cache attention by default
+    match::find_matches(mpm, find_kv_cache_attention{.counter = &counter});
     mpm.get_module().sort();
     mpm.run_pass(dead_code_elimination{});
+
+    // Only fuse plain attention when requested
+    if(attn_enabled)
+    {
+        match::find_matches(mpm, find_attention{.counter = &counter});
+        mpm.get_module().sort();
+        mpm.run_pass(dead_code_elimination{});
+    }
 }
 
 } // namespace MIGRAPHX_INLINE_NS