diff --git a/docs/api_extra.rst b/docs/api_extra.rst
index 83b17e99..aee71ef5 100644
--- a/docs/api_extra.rst
+++ b/docs/api_extra.rst
@@ -644,8 +644,8 @@ N-dimensional array type
 ------------------------
 
 The following type can be used to exchange n-dimension arrays with frameworks
-like NumPy, PyTorch, Tensorflow, JAX, CuPy, and others. It requires an
-additional include directive:
+like NumPy, PyTorch, Tensorflow, JAX, CuPy, PaddlePaddle, and others. It
+requires an additional include directive:
 
 .. code-block:: cpp
 
@@ -1108,6 +1108,8 @@
 
    Builtin Python ``memoryview`` for CPU-resident data.
 
+.. cpp:class:: paddle
+
 Eigen convenience type aliases
 ------------------------------
 
diff --git a/docs/ndarray.rst b/docs/ndarray.rst
index e62fcb37..f6184e70 100644
--- a/docs/ndarray.rst
+++ b/docs/ndarray.rst
@@ -8,7 +8,8 @@ The ``nb::ndarray<..>`` class
 nanobind can exchange n-dimensional arrays (henceforth "**nd-arrays**") with
 popular array programming frameworks including `NumPy `__,
 `PyTorch `__, `TensorFlow `__,
-`JAX `__, and `CuPy `_. It
+`JAX `__, `CuPy `_, and
+`PaddlePaddle `__. It
 supports *zero-copy* exchange using two protocols:
 
 - The classic `buffer
@@ -275,6 +276,7 @@ desired Python type.
 - :cpp:class:`nb::tensorflow `: create a ``tensorflow.python.framework.ops.EagerTensor``.
 - :cpp:class:`nb::jax `: create a ``jaxlib.xla_extension.DeviceArray``.
 - :cpp:class:`nb::cupy `: create a ``cupy.ndarray``.
+- :cpp:class:`nb::paddle `: create a ``paddle.Tensor``.
 - No framework annotation. In this case, nanobind will create a raw Python
   ``dltensor`` `capsule `__ representing the
   `DLPack `__ metadata.
diff --git a/docs/porting.rst b/docs/porting.rst
index eb3b9061..74320d86 100644
--- a/docs/porting.rst
+++ b/docs/porting.rst
@@ -349,8 +349,8 @@ Removed features include:
 - ○ The NumPy array class (``py::array``) was removed in exchange for a more
   powerful alternative (:cpp:class:`nb::ndarray\<..\> `) that
   additionally supports CPU/GPU tensors produced by various frameworks
-  (NumPy, PyTorch, TensorFlow, JAX, etc.). Its API is not compatible with
-  pybind11, however.
+  (NumPy, PyTorch, TensorFlow, JAX, PaddlePaddle, etc.). Its API is not compatible
+  with pybind11, however.
 - ● Buffer protocol binding (``.def_buffer()``) was removed in favor of
   :cpp:class:`nb::ndarray\<..\> `.
 - ● Support for evaluating Python files was removed.
diff --git a/docs/why.rst b/docs/why.rst
index 5f58e2c6..3c849031 100644
--- a/docs/why.rst
+++ b/docs/why.rst
@@ -117,7 +117,8 @@ nanobind includes a number of quality-of-life improvements for developers:
   `buffer protocol `__ to achieve *zero-copy* CPU/GPU array exchange with
   frameworks like `NumPy `__, `PyTorch `__, `TensorFlow
-  `__, `JAX `__, etc. See
+  `__, `JAX `__,
+  `PaddlePaddle `__, etc. See
   the :ref:`section on n-dimensional arrays ` for details.
 
 - **Stable ABI**: nanobind can target Python's `stable ABI interface
diff --git a/include/nanobind/ndarray.h b/include/nanobind/ndarray.h
index f71dc7e5..d2677092 100644
--- a/include/nanobind/ndarray.h
+++ b/include/nanobind/ndarray.h
@@ -86,6 +86,7 @@ NB_FRAMEWORK(tensorflow, 3, "tensorflow.python.framework.ops.EagerTensor");
 NB_FRAMEWORK(jax, 4, "jaxlib.xla_extension.DeviceArray");
 NB_FRAMEWORK(cupy, 5, "cupy.ndarray");
 NB_FRAMEWORK(memview, 6, "memoryview");
+NB_FRAMEWORK(paddle, 7, "paddle.Tensor");
 
 NAMESPACE_BEGIN(device)
 NB_DEVICE(none, 0); NB_DEVICE(cpu, 1); NB_DEVICE(cuda, 2);
diff --git a/src/nb_ndarray.cpp b/src/nb_ndarray.cpp
index d84177a8..cb2f1a48 100644
--- a/src/nb_ndarray.cpp
+++ b/src/nb_ndarray.cpp
@@ -371,7 +371,9 @@ bool ndarray_check(PyObject *o) noexcept {
         // Tensorflow
         strcmp(tp_name, "tensorflow.python.framework.ops.EagerTensor") == 0 ||
         // Cupy
-        strcmp(tp_name, "cupy.ndarray") == 0;
+        strcmp(tp_name, "cupy.ndarray") == 0 ||
+        // PaddlePaddle
+        strcmp(tp_name, "paddle.Tensor") == 0;
 
     Py_DECREF(name);
     return result;
@@ -402,6 +404,8 @@ ndarray_handle *ndarray_import(PyObject *o, const ndarray_config *c,
             package = module_::import_("torch.utils.dlpack");
         else if (strncmp(module_name, "jaxlib", 6) == 0)
             package = module_::import_("jax.dlpack");
+        else if (strcmp(module_name, "paddle") == 0)
+            package = module_::import_("paddle.utils.dlpack");
 
         if (package.is_valid())
             capsule = package.attr("to_dlpack")(handle(o));
@@ -538,11 +542,16 @@ ndarray_handle *ndarray_import(PyObject *o, const ndarray_config *c,
         try {
             if (strcmp(module_name, "numpy") == 0 || strcmp(module_name, "cupy") == 0) {
                 converted = handle(o).attr("astype")(dtype, order);
-            } else if (strcmp(module_name, "torch") == 0) {
+            } else if (strcmp(module_name, "torch") == 0 || strcmp(module_name, "paddle") == 0) {
                 converted = handle(o).attr("to")(
-                    arg("dtype") = module_::import_("torch").attr(dtype));
-                if (c->order == 'C')
+                    arg("dtype") = module_::import_(module_name).attr(dtype));
+                if (c->order == 'C') {
+                    // paddle.Tensor.contiguous() operates on the tensor itself, so
+                    // call detach() first to obtain behavior similar to PyTorch
+                    if (strcmp(module_name, "paddle") == 0)
+                        converted = converted.attr("detach")();
                     converted = converted.attr("contiguous")();
+                }
             } else if (strncmp(module_name, "tensorflow.", 11) == 0) {
                 converted = module_::import_("tensorflow")
                                 .attr("cast")(handle(o), dtype);
@@ -793,6 +802,7 @@ PyObject *ndarray_export(ndarray_handle *th, int framework,
         case tensorflow::value: pkg_name = "tensorflow.experimental.dlpack"; break;
         case jax::value: pkg_name = "jax.dlpack"; break;
         case cupy::value: pkg_name = "cupy"; break;
+        case paddle::value: pkg_name = "paddle.utils.dlpack"; break;
         default: pkg_name = nullptr;
     }
 
@@ -808,7 +818,8 @@ PyObject *ndarray_export(ndarray_handle *th, int framework,
 
     if (copy) {
         const char* copy_str = "copy";
-        if (framework == pytorch::value)
+        if (framework == pytorch::value ||
+            framework == paddle::value)
             copy_str = "clone";
 
         try {
diff --git a/src/stubgen.py b/src/stubgen.py
index ae7c65e0..28b026f5 100755
--- a/src/stubgen.py
+++ b/src/stubgen.py
@@ -272,7 +272,7 @@ def __init__(
 
         # Precompile RE to extract nanobind nd-arrays
         self.ndarray_re = re.compile(
-            sep_before + r"(numpy\.ndarray|ndarray|torch\.Tensor)\[([^\]]*)\]"
+            sep_before + r"(numpy\.ndarray|ndarray|torch\.Tensor|paddle\.Tensor)\[([^\]]*)\]"
         )
 
         # Types which moved from typing.* to collections.abc in Python 3.9
diff --git a/tests/test_ndarray.cpp b/tests/test_ndarray.cpp
index c46ba9f3..7c8e6439 100644
--- a/tests/test_ndarray.cpp
+++ b/tests/test_ndarray.cpp
@@ -299,6 +299,19 @@ NB_MODULE(test_ndarray_ext, m) {
                           deleter);
     });
 
+    m.def("ret_paddle", []() {
+        float *f = new float[8] { 1, 2, 3, 4, 5, 6, 7, 8 };
+        size_t shape[2] = { 2, 4 };
+
+        nb::capsule deleter(f, [](void *data) noexcept {
+            destruct_count++;
+            delete[] (float *) data;
+        });
+
+        return nb::ndarray<nb::paddle, float, nb::shape<2, 4>>(f, 2, shape,
+                                                               deleter);
+    });
+
     m.def("ret_array_scalar", []() {
         float* f = new float[1] { 1 };
         size_t shape[1] = {};
diff --git a/tests/test_ndarray.py b/tests/test_ndarray.py
index b5fd9b07..49d87fca 100644
--- a/tests/test_ndarray.py
+++ b/tests/test_ndarray.py
@@ -18,6 +18,28 @@ def needs_torch(x):
 except:
     needs_torch = pytest.mark.skip(reason="PyTorch is required")
 
+try:
+    import paddle
+    def needs_paddle(x):
+        return x
+except:
+    needs_paddle = pytest.mark.skip(reason="paddle is required")
+
+try:
+    import tensorflow as tf
+    import tensorflow.config
+    def needs_tensorflow(x):
+        return x
+except:
+    needs_tensorflow = pytest.mark.skip(reason="TensorFlow is required")
+
+try:
+    import jax.numpy as jnp
+    def needs_jax(x):
+        return x
+except:
+    needs_jax = pytest.mark.skip(reason="JAX is required")
+
 try:
     import cupy as cp
     def needs_cupy(x):
@@ -553,7 +575,7 @@ def test35_view():
     x2 = x1 * (-1+2j)
     t.fill_view_5(x1)
     assert np.allclose(x1, x2)
-    x2 = -x2;
+    x2 = -x2
     t.fill_view_6(x1)
     assert np.allclose(x1, x2)
 
@@ -614,109 +636,109 @@ def test41_noninteger_stride():
     a = np.array([[1, 2, 3, 4, 0, 0], [5, 6, 7, 8, 0, 0]], dtype=np.float32)
     s = a[:, 0:4] # slice
     t.pass_float32(s)
-    assert t.get_stride(s, 0) == 6;
-    assert t.get_stride(s, 1) == 1;
+    assert t.get_stride(s, 0) == 6
+    assert t.get_stride(s, 1) == 1
     try:
         v = s.view(np.complex64)
     except:
         pytest.skip('your version of numpy is too old')
     t.pass_complex64(v)
-    assert t.get_stride(v, 0) == 3;
-    assert t.get_stride(v, 1) == 1;
+    assert t.get_stride(v, 0) == 3
+    assert t.get_stride(v, 1) == 1
 
     a = np.array([[1, 2, 3, 4, 0], [5, 6, 7, 8, 0]], dtype=np.float32)
     s = a[:, 0:4] # slice
     t.pass_float32(s)
-    assert t.get_stride(s, 0) == 5;
-    assert t.get_stride(s, 1) == 1;
+    assert t.get_stride(s, 0) == 5
+    assert t.get_stride(s, 1) == 1
     v = s.view(np.complex64)
     with pytest.raises(TypeError) as excinfo:
         t.pass_complex64(v)
     assert 'incompatible function arguments' in str(excinfo.value)
     with pytest.raises(TypeError) as excinfo:
-        t.get_stride(v, 0);
+        t.get_stride(v, 0)
     assert 'incompatible function arguments' in str(excinfo.value)
 
 @needs_numpy
 def test42_const_qualifiers_numpy():
     a = np.array([0, 0, 0, 3.14159, 0], dtype=np.float64)
-    assert t.check_rw_by_value(a);
-    assert a[1] == 1.414214;
-    assert t.check_rw_by_value_float64(a);
-    assert a[2] == 2.718282;
-    assert a[4] == 16.0;
-    assert t.check_ro_by_value_ro(a);
-    assert t.check_ro_by_value_const_float64(a);
+    assert t.check_rw_by_value(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_value_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_value_ro(a)
+    assert t.check_ro_by_value_const_float64(a)
     a.setflags(write=False)
-    assert t.check_ro_by_value_ro(a);
-    assert t.check_ro_by_value_const_float64(a);
-    assert a[0] == 0.0;
-    assert a[3] == 3.14159;
+    assert t.check_ro_by_value_ro(a)
+    assert t.check_ro_by_value_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
 
     a = np.array([0, 0, 0, 3.14159, 0], dtype=np.float64)
-    assert t.check_rw_by_const_ref(a);
-    assert a[1] == 1.414214;
-    assert t.check_rw_by_const_ref_float64(a);
-    assert a[2] == 2.718282;
-    assert a[4] == 16.0;
-    assert t.check_ro_by_const_ref_ro(a);
-    assert t.check_ro_by_const_ref_const_float64(a);
+    assert t.check_rw_by_const_ref(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_const_ref_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_const_ref_ro(a)
+    assert t.check_ro_by_const_ref_const_float64(a)
     a.setflags(write=False)
-    assert t.check_ro_by_const_ref_ro(a);
-    assert t.check_ro_by_const_ref_const_float64(a);
-    assert a[0] == 0.0;
-    assert a[3] == 3.14159;
+    assert t.check_ro_by_const_ref_ro(a)
+    assert t.check_ro_by_const_ref_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
 
     a = np.array([0, 0, 0, 3.14159, 0], dtype=np.float64)
-    assert t.check_rw_by_rvalue_ref(a);
-    assert a[1] == 1.414214;
-    assert t.check_rw_by_rvalue_ref_float64(a);
-    assert a[2] == 2.718282;
-    assert a[4] == 16.0;
-    assert t.check_ro_by_rvalue_ref_ro(a);
-    assert t.check_ro_by_rvalue_ref_const_float64(a);
+    assert t.check_rw_by_rvalue_ref(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_rvalue_ref_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_rvalue_ref_ro(a)
+    assert t.check_ro_by_rvalue_ref_const_float64(a)
     a.setflags(write=False)
-    assert t.check_ro_by_rvalue_ref_ro(a);
-    assert t.check_ro_by_rvalue_ref_const_float64(a);
-    assert a[0] == 0.0;
-    assert a[3] == 3.14159;
+    assert t.check_ro_by_rvalue_ref_ro(a)
+    assert t.check_ro_by_rvalue_ref_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
 
 @needs_torch
 def test43_const_qualifiers_pytorch():
     a = torch.tensor([0, 0, 0, 3.14159, 0], dtype=torch.float64)
-    assert t.check_rw_by_value(a);
-    assert a[1] == 1.414214;
-    assert t.check_rw_by_value_float64(a);
-    assert a[2] == 2.718282;
-    assert a[4] == 16.0;
-    assert t.check_ro_by_value_ro(a);
-    assert t.check_ro_by_value_const_float64(a);
-    assert a[0] == 0.0;
-    assert a[3] == 3.14159;
+    assert t.check_rw_by_value(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_value_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_value_ro(a)
+    assert t.check_ro_by_value_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
 
     a = torch.tensor([0, 0, 0, 3.14159, 0], dtype=torch.float64)
-    assert t.check_rw_by_const_ref(a);
-    assert a[1] == 1.414214;
-    assert t.check_rw_by_const_ref_float64(a);
-    assert a[2] == 2.718282;
-    assert a[4] == 16.0;
-    assert t.check_ro_by_const_ref_ro(a);
-    assert t.check_ro_by_const_ref_const_float64(a);
-    assert a[0] == 0.0;
-    assert a[3] == 3.14159;
+    assert t.check_rw_by_const_ref(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_const_ref_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_const_ref_ro(a)
+    assert t.check_ro_by_const_ref_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
 
     a = torch.tensor([0, 0, 0, 3.14159, 0], dtype=torch.float64)
-    assert t.check_rw_by_rvalue_ref(a);
-    assert a[1] == 1.414214;
-    assert t.check_rw_by_rvalue_ref_float64(a);
-    assert a[2] == 2.718282;
-    assert a[4] == 16.0;
-    assert t.check_ro_by_rvalue_ref_ro(a);
-    assert t.check_ro_by_rvalue_ref_const_float64(a);
-    assert a[0] == 0.0;
-    assert a[3] == 3.14159;
+    assert t.check_rw_by_rvalue_ref(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_rvalue_ref_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_rvalue_ref_ro(a)
+    assert t.check_ro_by_rvalue_ref_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
 
 @needs_cupy
@@ -851,3 +873,136 @@ def test52_accept_np_both_true_contig():
 def test53_issue_930():
     wrapper = t.Wrapper(np.ones(3, dtype=np.float32))
     assert wrapper.value[0] == 1
+
+@needs_paddle
+def test54_return_paddle():
+    try:
+        c = paddle.zeros([3, 5]).cpu()
+    except:
+        pytest.skip('paddle is missing')
+    collect()
+    dc = t.destruct_count()
+    x = t.ret_paddle()
+    assert x.shape == [2, 4]
+    assert paddle.all(x == paddle.to_tensor([[1, 2, 3, 4], [5, 6, 7, 8]], place='cpu', dtype='float32'))
+    del x
+    collect()
+    assert t.destruct_count() - dc == 1
+
+@needs_paddle
+@pytest.mark.filterwarnings
+def test55_force_contig_paddle():
+    a = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], place='cpu', dtype='float32')
+    b = t.make_contig(a)
+    assert b is a
+    a = a.T
+    b = t.make_contig(a)
+    assert b is not a
+    assert paddle.all(b == a)
+
+@needs_paddle
+@pytest.mark.filterwarnings
+def test56_constrain_order_paddle():
+    try:
+        c = paddle.zeros([3, 5]).cpu()
+        c.__dlpack__()
+    except:
+        pytest.skip('paddle is missing')
+
+    f = c.t().contiguous().t()
+    assert t.check_order(c) == 'C'
+    assert t.check_order(f) == 'F'
+    assert t.check_order(c[:, 2:5]) == '?'
+    assert t.check_order(f[1:3, :]) == '?'
+    assert t.check_device(c) == 'cpu'
+    if paddle.device.cuda.device_count() >= 1:
+        assert t.check_device(paddle.zeros([3, 5])) == 'cuda'
+
+@needs_paddle
+def test57_implicit_conversion_paddle():
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        try:
+            c = paddle.zeros([3, 5]).cpu()
+            c.__dlpack__()
+        except:
+            pytest.skip('paddle is missing')
+
+        t.implicit(paddle.zeros([2, 2], dtype=paddle.int32))
+        t.implicit(paddle.zeros([2, 2, 10], dtype=paddle.float32)[:, :, 4])
+        t.implicit(paddle.zeros([2, 2, 10], dtype=paddle.int32)[:, :, 4])
+
+        with pytest.raises(TypeError) as excinfo:
+            t.noimplicit(paddle.zeros([2, 2], dtype=paddle.int32))
+
+        with pytest.raises(TypeError) as excinfo:
+            t.noimplicit(paddle.zeros([2, 2, 10], dtype=paddle.float32)[:, :, 4])
+
+@needs_paddle
+def test58_single_and_empty_dimension_paddle():
+    a = paddle.ones((1,100,1025), dtype=paddle.float32)
+    t.noop_3d_c_contig(a)
+    a = paddle.ones((100,1,1025), dtype=paddle.float32)
+    t.noop_3d_c_contig(a)
+    a = paddle.ones((0,100,1025), dtype=paddle.float32)
+    t.noop_3d_c_contig(a)
+    a = paddle.ones((100,0,1025), dtype=paddle.float32)
+    t.noop_3d_c_contig(a)
+    a = paddle.ones((100,1025,0), dtype=paddle.float32)
+    t.noop_3d_c_contig(a)
+    a = paddle.ones((100,0,0), dtype=paddle.float32)
+    t.noop_3d_c_contig(a)
+    a = paddle.ones((0,0,0), dtype=paddle.float32)
+    t.noop_3d_c_contig(a)
+
+# See PR #162
+@needs_paddle
+def test59_single_and_empty_dimension_fortran_order_paddle():
+    # This idiom creates a paddle 2D tensor in column major (aka, 'F') ordering
+    a = paddle.ones((0,100), dtype=paddle.float32).t().contiguous().t()
+    t.noop_2d_f_contig(a)
+    a = paddle.ones((100,0), dtype=paddle.float32).t().contiguous().t()
+    t.noop_2d_f_contig(a)
+    a = paddle.ones((1,100), dtype=paddle.float32).t().contiguous().t()
+    t.noop_2d_f_contig(a)
+    a = paddle.ones((100,1), dtype=paddle.float32).t().contiguous().t()
+    t.noop_2d_f_contig(a)
+
+@needs_paddle
+def test60_check_paddle():
+    assert t.check(paddle.zeros((1)).cpu())
+
+@needs_paddle
+def test61_const_qualifiers_paddle():
+    a = paddle.to_tensor([0, 0, 0, 3.14159, 0], dtype=paddle.float64, place='cpu')
+    assert t.check_rw_by_value(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_value_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_value_ro(a)
+    assert t.check_ro_by_value_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
+
+    a = paddle.to_tensor([0, 0, 0, 3.14159, 0], dtype=paddle.float64, place='cpu')
+    assert t.check_rw_by_const_ref(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_const_ref_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_const_ref_ro(a)
+    assert t.check_ro_by_const_ref_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
+
+    a = paddle.to_tensor([0, 0, 0, 3.14159, 0], dtype=paddle.float64, place='cpu')
+    assert t.check_rw_by_rvalue_ref(a)
+    assert a[1] == 1.414214
+    assert t.check_rw_by_rvalue_ref_float64(a)
+    assert a[2] == 2.718282
+    assert a[4] == 16.0
+    assert t.check_ro_by_rvalue_ref_ro(a)
+    assert t.check_ro_by_rvalue_ref_const_float64(a)
+    assert a[0] == 0.0
+    assert a[3] == 3.14159
diff --git a/tests/test_ndarray_ext.pyi.ref b/tests/test_ndarray_ext.pyi.ref
index 476d78c9..fc43e266 100644
--- a/tests/test_ndarray_ext.pyi.ref
+++ b/tests/test_ndarray_ext.pyi.ref
@@ -115,6 +115,8 @@ def ret_numpy_const() -> Annotated[NDArray[numpy.float32], dict(shape=(2, 4), wr
 
 def ret_pytorch() -> Annotated[NDArray[numpy.float32], dict(shape=(2, 4))]: ...
 
+def ret_paddle() -> Annotated[NDArray[numpy.float32], dict(shape=(2, 4))]: ...
+
 def ret_array_scalar() -> NDArray[numpy.float32]: ...
 
 def noop_3d_c_contig(arg: Annotated[NDArray[numpy.float32], dict(shape=(None, None, None), order='C')], /) -> None: ...
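
For readers skimming this patch, the following minimal sketch shows how the new ``nb::paddle`` framework annotation would be used from binding code. It is modeled on the ``ret_paddle`` test above; the module name ``paddle_demo`` and both function names are illustrative only and are not part of the patch.

.. code-block:: cpp

   #include <nanobind/nanobind.h>
   #include <nanobind/ndarray.h>

   namespace nb = nanobind;

   NB_MODULE(paddle_demo, m) {
       // Returning nb::ndarray<nb::paddle, ...> causes nanobind to hand the
       // data back to Python as a paddle.Tensor (via paddle.utils.dlpack).
       m.def("ones_2x4", []() {
           float *data = new float[8] { 1, 1, 1, 1, 1, 1, 1, 1 };
           size_t shape[2] = { 2, 4 };

           // Capsule that frees the allocation once the tensor is released
           nb::capsule owner(data, [](void *p) noexcept {
               delete[] (float *) p;
           });

           return nb::ndarray<nb::paddle, float, nb::shape<2, 4>>(data, 2, shape, owner);
       });

       // Accepting paddle.Tensor arguments also works, since
       // ndarray_check()/ndarray_import() now recognize the "paddle.Tensor"
       // type name and import it through paddle.utils.dlpack.
       m.def("total", [](nb::ndarray<float, nb::ndim<2>, nb::device::cpu> a) {
           float accum = 0.f;
           for (size_t i = 0; i < a.shape(0); ++i)
               for (size_t j = 0; j < a.shape(1); ++j)
                   accum += a(i, j);
           return accum;
       });
   }

On the Python side, ``ones_2x4()`` would then return a ``paddle.Tensor``, and passing a CPU-resident ``paddle.Tensor`` to ``total()`` would exchange the data zero-copy, matching the behavior exercised by the new tests.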