diff --git a/src/opencl/cl_algo_registry.cpp b/src/opencl/cl_algo_registry.cpp
index 65726579b..8905e089b 100644
--- a/src/opencl/cl_algo_registry.cpp
+++ b/src/opencl/cl_algo_registry.cpp
@@ -30,6 +30,7 @@
 #include <core/registry.hpp>
 #include <core/top.hpp>
 
+#include <opencl/cl_m_extract_row.hpp>
 #include <opencl/cl_m_reduce.hpp>
 #include <opencl/cl_mxmT_masked.hpp>
 #include <opencl/cl_mxv.hpp>
@@ -37,6 +38,7 @@
 #include <opencl/cl_v_count_mf.hpp>
 #include <opencl/cl_v_eadd.hpp>
 #include <opencl/cl_v_eadd_fdb.hpp>
+#include <opencl/cl_v_emult.hpp>
 #include <opencl/cl_v_map.hpp>
 #include <opencl/cl_v_reduce.hpp>
 #include <opencl/cl_vxm.hpp>
@@ -93,6 +95,16 @@ namespace spla {
         g_registry->add(MAKE_KEY_CL_0("mxmT_masked", INT), std::make_shared<Algo_mxmT_masked_cl<T_INT>>());
         g_registry->add(MAKE_KEY_CL_0("mxmT_masked", UINT), std::make_shared<Algo_mxmT_masked_cl<T_UINT>>());
         g_registry->add(MAKE_KEY_CL_0("mxmT_masked", FLOAT), std::make_shared<Algo_mxmT_masked_cl<T_FLOAT>>());
+
+        // algorthm m_extract_row
+        g_registry->add(MAKE_KEY_CL_0("m_extract_row", INT), std::make_shared<Algo_m_extract_row_cl<T_INT>>());
+        g_registry->add(MAKE_KEY_CL_0("m_extract_row", UINT), std::make_shared<Algo_m_extract_row_cl<T_UINT>>());
+        g_registry->add(MAKE_KEY_CL_0("m_extract_row", FLOAT), std::make_shared<Algo_m_extract_row_cl<T_FLOAT>>());
+
+        // algorthm v_emult
+        g_registry->add(MAKE_KEY_CL_0("v_emult", INT), std::make_shared<Algo_v_emult_cl<T_INT>>());
+        g_registry->add(MAKE_KEY_CL_0("v_emult", UINT), std::make_shared<Algo_v_emult_cl<T_UINT>>());
+        g_registry->add(MAKE_KEY_CL_0("v_emult", FLOAT), std::make_shared<Algo_v_emult_cl<T_FLOAT>>());
     }
 
 }// namespace spla
diff --git a/src/opencl/cl_m_extract_row.hpp b/src/opencl/cl_m_extract_row.hpp
new file mode 100644
index 000000000..5a8f3b59b
--- /dev/null
+++ b/src/opencl/cl_m_extract_row.hpp
@@ -0,0 +1,120 @@
+/**********************************************************************************/
+/* This file is part of spla project                                              */
+/* https://github.com/SparseLinearAlgebra/spla                                    */
+/**********************************************************************************/
+/* MIT License                                                                    */
+/*                                                                                */
+/* Copyright (c) 2025 SparseLinearAlgebra                                         */
+/*                                                                                */
+/* Permission is hereby granted, free of charge, to any person obtaining a copy   */
+/* of this software and associated documentation files (the "Software"), to deal  */
+/* in the Software without restriction, including without limitation the rights   */
+/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell      */
+/* copies of the Software, and to permit persons to whom the Software is          */
+/* furnished to do so, subject to the following conditions:                       */
+/*                                                                                */
+/* The above copyright notice and this permission notice shall be included in all */
+/* copies or substantial portions of the Software.                                */
+/*                                                                                */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR     */
+/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,       */
+/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE    */
+/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER         */
+/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  */
+/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  */
+/* SOFTWARE.                                                                      */
+/**********************************************************************************/
+
+#ifndef SPLA_CL_M_EXTRACT_ROW_HPP
+#define SPLA_CL_M_EXTRACT_ROW_HPP
+
+#include <schedule/schedule_tasks.hpp>
+
+#include <core/dispatcher.hpp>
+#include <core/registry.hpp>
+#include <core/tmatrix.hpp>
+#include <core/top.hpp>
+#include <core/tscalar.hpp>
+#include <core/ttype.hpp>
+#include <core/tvector.hpp>
+
+#include <opencl/cl_counter.hpp>
+#include <opencl/cl_debug.hpp>
+#include <opencl/cl_formats.hpp>
+#include <opencl/cl_program_builder.hpp>
+#include <opencl/generated/auto_m_extract_row.hpp>
+
+namespace spla {
+
+    template<typename T>
+    class Algo_m_extract_row_cl final : public RegistryAlgo {
+    public:
+        ~Algo_m_extract_row_cl() override = default;
+
+        std::string get_name() override {
+            return "m_extract_row";
+        }
+
+        std::string get_description() override {
+            return "opencl extract row from matrix";
+        }
+
+        Status execute(const DispatchContext& ctx) override {
+            auto t = ctx.task.template cast_safe<ScheduleTask_m_extract_row>();
+
+            ref_ptr<TVector<T>> r        = t->r.template cast_safe<TVector<T>>();
+            ref_ptr<TMatrix<T>> M        = t->M.template cast_safe<TMatrix<T>>();
+            auto                op_apply = t->op_apply.template cast_safe<TOpUnary<T, T>>();
+
+            r->validate_wd(FormatVector::AccDense);
+            M->validate_rw(FormatMatrix::AccCsr);
+
+            auto* p_cl_r   = r->template get<CLDenseVec<T>>();
+            auto* p_cl_M   = M->template get<CLCsr<T>>();
+            auto* p_cl_acc = get_acc_cl();
+            auto& queue    = p_cl_acc->get_queue_default();
+
+            // get the row boundaries from M->Ap
+            uint       row_bounds[2];
+            cl::Buffer cl_row_bounds(p_cl_acc->get_context(),
+                                     CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                                     sizeof(row_bounds), row_bounds);
+
+            queue.enqueueCopyBuffer(p_cl_M->Ap, cl_row_bounds, t->index * sizeof(uint), 0, sizeof(row_bounds));
+            queue.finish();
+
+            std::shared_ptr<CLProgram> program;
+            ensure_kernel(op_apply, program);
+
+            auto kernel = program->make_kernel("extract_row");
+            kernel.setArg(0, p_cl_r->Ax);
+            kernel.setArg(1, p_cl_M->Ax);
+            kernel.setArg(2, p_cl_M->Aj);
+            kernel.setArg(3, row_bounds[1]);
+
+            // amount of elements in the row
+            const uint n = row_bounds[1] - row_bounds[0] - 1;
+
+            cl::NDRange global(p_cl_acc->get_default_wgs() * div_up_clamp(n, p_cl_acc->get_default_wgs(), 1u, 1024u));
+            cl::NDRange local(p_cl_acc->get_default_wgs());
+            queue.enqueueNDRangeKernel(kernel, cl::NDRange(row_bounds[0]), global, local);
+
+            return Status::Ok;
+        }
+
+    private:
+        void ensure_kernel(const ref_ptr<TOpUnary<T, T>>& op_apply, std::shared_ptr<CLProgram>& program) {
+            CLProgramBuilder program_builder;
+            program_builder
+                    .set_name("m_extract_row")
+                    .add_type("TYPE", get_ttype<T>().template as<Type>())
+                    .add_op("OP_APPLY", op_apply.template as<OpUnary>())
+                    .set_source(source_m_extract_row)
+                    .acquire();
+            program = program_builder.get_program();
+        }
+    };
+
+}// namespace spla
+
+#endif//SPLA_CL_M_EXTRACT_ROW_HPP
diff --git a/src/opencl/cl_v_emult.hpp b/src/opencl/cl_v_emult.hpp
new file mode 100644
index 000000000..1edb91b09
--- /dev/null
+++ b/src/opencl/cl_v_emult.hpp
@@ -0,0 +1,128 @@
+/**********************************************************************************/
+/* This file is part of spla project                                              */
+/* https://github.com/JetBrains-Research/spla                                     */
+/**********************************************************************************/
+/* MIT License                                                                    */
+/*                                                                                */
+/* Copyright (c) 2025 SparseLinearAlgebra                                         */
+/*                                                                                */
+/* Permission is hereby granted, free of charge, to any person obtaining a copy   */
+/* of this software and associated documentation files (the "Software"), to deal  */
+/* in the Software without restriction, including without limitation the rights   */
+/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell      */
+/* copies of the Software, and to permit persons to whom the Software is          */
+/* furnished to do so, subject to the following conditions:                       */
+/*                                                                                */
+/* The above copyright notice and this permission notice shall be included in all */
+/* copies or substantial portions of the Software.                                */
+/*                                                                                */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR     */
+/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,       */
+/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE    */
+/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER         */
+/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  */
+/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  */
+/* SOFTWARE.                                                                      */
+/**********************************************************************************/
+
+#ifndef SPLA_CL_VECTOR_EMULT_HPP
+#define SPLA_CL_VECTOR_EMULT_HPP
+
+#include <schedule/schedule_tasks.hpp>
+
+#include <core/dispatcher.hpp>
+#include <core/registry.hpp>
+#include <core/top.hpp>
+#include <core/tscalar.hpp>
+#include <core/ttype.hpp>
+#include <core/tvector.hpp>
+
+#include <opencl/cl_counter.hpp>
+#include <opencl/cl_fill.hpp>
+#include <opencl/cl_formats.hpp>
+#include <opencl/generated/auto_vector_emult.hpp>
+
+namespace spla {
+
+    template<typename T>
+    class Algo_v_emult_cl final : public RegistryAlgo {
+    public:
+        ~Algo_v_emult_cl() override = default;
+
+        std::string get_name() override {
+            return "v_emult";
+        }
+
+        std::string get_description() override {
+            return "parallel vector element-wise mult on opencl device";
+        }
+
+        Status execute(const DispatchContext& ctx) override {
+            auto                t = ctx.task.template cast_safe<ScheduleTask_v_emult>();
+            ref_ptr<TVector<T>> u = t->u.template cast_safe<TVector<T>>();
+            ref_ptr<TVector<T>> v = t->v.template cast_safe<TVector<T>>();
+
+            if (u->is_valid(FormatVector::AccDense) && v->is_valid(FormatVector::AccDense)) {
+                return execute_dn2dn(ctx);
+            }
+
+            return execute_dn2dn(ctx);
+        }
+
+    private:
+        Status execute_dn2dn(const DispatchContext& ctx) {
+            TIME_PROFILE_SCOPE("cl/vector_emult_dn2dn");
+
+            auto                        t  = ctx.task.template cast_safe<ScheduleTask_v_emult>();
+            ref_ptr<TVector<T>>         r  = t->r.template cast_safe<TVector<T>>();
+            ref_ptr<TVector<T>>         u  = t->u.template cast_safe<TVector<T>>();
+            ref_ptr<TVector<T>>         v  = t->v.template cast_safe<TVector<T>>();
+            ref_ptr<TOpBinary<T, T, T>> op = t->op.template cast_safe<TOpBinary<T, T, T>>();
+
+            std::shared_ptr<CLProgram> program;
+            if (!ensure_kernel(op, program)) return Status::CompilationError;
+
+            r->validate_wd(FormatVector::AccDense);
+            u->validate_rw(FormatVector::AccDense);
+            v->validate_rw(FormatVector::AccDense);
+
+            auto*       p_cl_r   = r->template get<CLDenseVec<T>>();
+            const auto* p_cl_u   = u->template get<CLDenseVec<T>>();
+            const auto* p_cl_v   = v->template get<CLDenseVec<T>>();
+            auto*       p_cl_acc = get_acc_cl();
+            auto&       queue    = p_cl_acc->get_queue_default();
+
+            const uint n = r->get_n_rows();
+
+            auto kernel = program->make_kernel("dense_to_dense");
+            kernel.setArg(0, p_cl_r->Ax);
+            kernel.setArg(1, p_cl_u->Ax);
+            kernel.setArg(2, p_cl_v->Ax);
+            kernel.setArg(3, n);
+            kernel.setArg(4, r->get_fill_value());
+
+            cl::NDRange global(p_cl_acc->get_default_wgs() * div_up_clamp(n, p_cl_acc->get_default_wgs(), 1u, 1024u));
+            cl::NDRange local(p_cl_acc->get_default_wgs());
+            queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local);
+
+            return Status::Ok;
+        }
+
+        bool ensure_kernel(const ref_ptr<TOpBinary<T, T, T>>& op, std::shared_ptr<CLProgram>& program) {
+            CLProgramBuilder program_builder;
+            program_builder
+                    .set_name("vector_emult")
+                    .add_type("TYPE", get_ttype<T>().template as<Type>())
+                    .add_op("OP_BINARY", op.template as<OpBinary>())
+                    .set_source(source_vector_emult)
+                    .acquire();
+
+            program = program_builder.get_program();
+
+            return true;
+        }
+    };
+
+}// namespace spla
+
+#endif
diff --git a/src/opencl/generated/auto_m_extract_row.hpp b/src/opencl/generated/auto_m_extract_row.hpp
new file mode 100644
index 000000000..3a83a96bc
--- /dev/null
+++ b/src/opencl/generated/auto_m_extract_row.hpp
@@ -0,0 +1,23 @@
+////////////////////////////////////////////////////////////////////
+// Copyright (c) 2021 - 2025 SparseLinearAlgebra
+// Autogenerated file, do not modify
+////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+static const char source_m_extract_row[] = R"(
+
+
+__kernel void extract_row(__global TYPE*       g_rx,
+                          __global const TYPE* g_Ax,
+                          __global const uint* g_Aj,
+                          const uint           n) {
+    const uint gid   = get_global_id(0);
+    const uint gsize = get_global_size(0);
+
+    for (uint i = gid; i < n; i += gsize) {
+        g_rx[g_Aj[i]] = OP_APPLY(g_Ax[i]);
+    }
+}
+
+)";
\ No newline at end of file
diff --git a/src/opencl/generated/auto_vector_emult.hpp b/src/opencl/generated/auto_vector_emult.hpp
new file mode 100644
index 000000000..598af5223
--- /dev/null
+++ b/src/opencl/generated/auto_vector_emult.hpp
@@ -0,0 +1,27 @@
+////////////////////////////////////////////////////////////////////
+// Copyright (c) 2021 - 2025 SparseLinearAlgebra
+// Autogenerated file, do not modify
+////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+static const char source_vector_emult[] = R"(
+
+
+
+__kernel void dense_to_dense(__global TYPE*       g_rx,
+                             __global const TYPE* g_ux,
+                             __global const TYPE* g_vx,
+                             const uint           n,
+                             const TYPE           fill_value) {
+    const uint gid   = get_global_id(0);
+    const uint gsize = get_global_size(0);
+
+    for (uint i = gid; i < n; i += gsize) {
+        TYPE u  = g_ux[i];
+        TYPE v  = g_vx[i];
+        g_rx[i] = u != fill_value && v != fill_value ? OP_BINARY(u, v) : fill_value;
+    }
+}
+
+)";
\ No newline at end of file
diff --git a/src/opencl/kernels/m_extract_row.cl b/src/opencl/kernels/m_extract_row.cl
new file mode 100644
index 000000000..eef00eec7
--- /dev/null
+++ b/src/opencl/kernels/m_extract_row.cl
@@ -0,0 +1,40 @@
+/**********************************************************************************/
+/* This file is part of spla project                                              */
+/* https://github.com/SparseLinearAlgebra/spla                                    */
+/**********************************************************************************/
+/* MIT License                                                                    */
+/*                                                                                */
+/* Copyright (c) 2025 SparseLinearAlgebra                                         */
+/*                                                                                */
+/* Permission is hereby granted, free of charge, to any person obtaining a copy   */
+/* of this software and associated documentation files (the "Software"), to deal  */
+/* in the Software without restriction, including without limitation the rights   */
+/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell      */
+/* copies of the Software, and to permit persons to whom the Software is          */
+/* furnished to do so, subject to the following conditions:                       */
+/*                                                                                */
+/* The above copyright notice and this permission notice shall be included in all */
+/* copies or substantial portions of the Software.                                */
+/*                                                                                */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR     */
+/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,       */
+/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE    */
+/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER         */
+/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  */
+/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  */
+/* SOFTWARE.                                                                      */
+/**********************************************************************************/
+
+#include "common_def.cl"
+
+__kernel void extract_row(__global TYPE*       g_rx,
+                          __global const TYPE* g_Ax,
+                          __global const uint* g_Aj,
+                          const uint           n) {
+    const uint gid   = get_global_id(0);
+    const uint gsize = get_global_size(0);
+
+    for (uint i = gid; i < n; i += gsize) {
+        g_rx[g_Aj[i]] = OP_APPLY(g_Ax[i]);
+    }
+}
diff --git a/src/opencl/kernels/vector_emult.cl b/src/opencl/kernels/vector_emult.cl
new file mode 100644
index 000000000..ae5873873
--- /dev/null
+++ b/src/opencl/kernels/vector_emult.cl
@@ -0,0 +1,44 @@
+
+/**********************************************************************************/
+/* This file is part of spla project                                              */
+/* https://github.com/SparseLinearAlgebra/spla                                    */
+/**********************************************************************************/
+/* MIT License                                                                    */
+/*                                                                                */
+/* Copyright (c) 2023 SparseLinearAlgebra                                         */
+/*                                                                                */
+/* Permission is hereby granted, free of charge, to any person obtaining a copy   */
+/* of this software and associated documentation files (the "Software"), to deal  */
+/* in the Software without restriction, including without limitation the rights   */
+/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell      */
+/* copies of the Software, and to permit persons to whom the Software is          */
+/* furnished to do so, subject to the following conditions:                       */
+/*                                                                                */
+/* The above copyright notice and this permission notice shall be included in all */
+/* copies or substantial portions of the Software.                                */
+/*                                                                                */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR     */
+/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,       */
+/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE    */
+/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER         */
+/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  */
+/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  */
+/* SOFTWARE.                                                                      */
+/**********************************************************************************/
+
+#include "common_def.cl"
+
+__kernel void dense_to_dense(__global TYPE*       g_rx,
+                             __global const TYPE* g_ux,
+                             __global const TYPE* g_vx,
+                             const uint           n,
+                             const TYPE           fill_value) {
+    const uint gid   = get_global_id(0);
+    const uint gsize = get_global_size(0);
+
+    for (uint i = gid; i < n; i += gsize) {
+        TYPE u  = g_ux[i];
+        TYPE v  = g_vx[i];
+        g_rx[i] = u != fill_value && v != fill_value ? OP_BINARY(u, v) : fill_value;
+    }
+}