From 7d09e44ba71de85da8037ca343aeae3165fd90b0 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 15 Feb 2018 16:01:32 -0500 Subject: [PATCH 01/25] Fix slicing with negative stride. --- pycuda/gpuarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 16970a2c..bf9af239 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -857,7 +857,7 @@ def __getitem__(self, index): array_stride = self.strides[array_axis] - new_shape.append((stop-start-1)//idx_stride+1) + new_shape.append((abs(stop-start)-1)//abs(idx_stride)+1) new_strides.append(idx_stride*array_stride) new_offset += array_stride*start From 27bcf764fe86c8d6f48313380dd54e2b7ac684ad Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 19 Feb 2018 11:56:27 -0500 Subject: [PATCH 02/25] Commit only DeferredSourceModule support without changing calling behavior of users of get_elwise_module or get_elwise_range_module, to test backwards compatibility and performance. --- pycuda/deferred.py | 484 ++++++++++++++++++++++++++++++++++++++++++ pycuda/elementwise.py | 469 +++++++++++++++++++++++++++++++++------- 2 files changed, 879 insertions(+), 74 deletions(-) create mode 100644 pycuda/deferred.py diff --git a/pycuda/deferred.py b/pycuda/deferred.py new file mode 100644 index 00000000..9699daa2 --- /dev/null +++ b/pycuda/deferred.py @@ -0,0 +1,484 @@ +""" +This exports a "deferred" implementation of SourceModule, where compilation +is delayed until call-time. Several methods, like get_function(), return +"deferred" values that are also only evaluated at call-time. +""" + +from pycuda.tools import context_dependent_memoize +from pycuda.compiler import compile, SourceModule +import pycuda.driver + +import re + +class DeferredSource(object): + ''' + Source generator that supports user-directed indentation, nesting + ``DeferredSource`` objects, indentation-aware string interpolation, + and deferred generation. + Use ``+=`` or ``add()`` to add source fragments as strings or + other ``DeferredSource`` objects, ``indent()`` or ``dedent()`` to + change base indentation, and ``__call__`` or ``generate()`` to + generate source. 
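+
+    A minimal sketch of the intended usage (the kernel fragment shown
+    is illustrative only)::
+
+        src = DeferredSource()
+        src += "__global__ void f(float *x)"
+        src += "{"
+        src.indent()
+        src += "x[threadIdx.x] = 0.0f;"
+        src.dedent()
+        src += "}"
+        code = src.generate()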
+ + ''' + def __init__(self, subsources=None, base_indent=0, indent_step=2): + self.base_indent = base_indent + self.indent_step = indent_step + if subsources is None: + subsources = [] + self.subsources = subsources + + def __str__(self): + return self.generate() + + def __repr__(self): + return repr(self.__str__()) + + def __call__(self, indent=0, indent_first=True): + return self.generate(indent, indent_first) + + def generate(self, indent=0, indent_first=True, get_list=False): + if get_list: + retval = [] + else: + retval = '' + do_indent = not indent_first + for subindent, strip_space, subsource, format_dict in self.subsources: + if do_indent: + newindent = self.base_indent + indent + subindent + else: + newindent = 0 + do_indent = True + if isinstance(subsource, DeferredSource): + retval = retval + subsource.generate(indent=(indent + subindent), get_list=get_list) + continue + lines = subsource.split("\n") + regex_space = re.compile(r"^(\s*)(.*?)(\s*)$") + regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])") + minstrip = None + newlines = [] + for line in lines: + linelen = len(line) + space_match = regex_space.match(line) + end_leading_space = space_match.end(1) + begin_trailing_space = space_match.start(3) + if strip_space: + if linelen == end_leading_space: + # all space, ignore + continue + if minstrip is None or end_leading_space < minstrip: + minstrip = end_leading_space + if not format_dict: + newlines.append(line) + continue + newlinelist = None + newline = '' + curpos = 0 + matches = list(regex_format.finditer(line, end_leading_space)) + nummatches = len(matches) + for match in matches: + formatchar = match.group(2) + name = match.group(1) + matchstart = match.start() + matchend = match.end() + repl = format_dict.get(name, None) + if repl is None: + continue + if (isinstance(repl, DeferredSource) and + nummatches == 1 and + matchstart == end_leading_space): + # only one replacement, and only spaces preceding + space = space_match.group(1) + newlinelist = [ space + x + for x in repl.generate(get_list=True) ] + else: + newline = newline + line[curpos:matchstart] + newline = newline + (('%' + formatchar) % (repl,)) + curpos = matchend + if newlinelist is None: + newline = newline + line[curpos:] + newlines.append(newline) + else: + newlines.extend(newlinelist) + newlines.append(line[curpos:]) + indentstr = ' ' * (indent + subindent) + for i, line in enumerate(newlines): + line = indentstr + line[minstrip:] + newlines[i] = line + if get_list: + retval += newlines + else: + retval = retval + "\n".join(newlines) + "\n" + return retval + + def indent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent += indent_step + return self + + def dedent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent -= indent_step + return self + + def format_dict(self, format_dict): + for subsource in self.subsources: + subsource[3] = format_dict + return self + + def add(self, other, strip_space=True, format_dict=None): + self.subsources.append([self.base_indent, strip_space, other, format_dict]) + return self + + def __iadd__(self, other): + self.add(other) + return self + + def __add__(self, other): + newgen = DeferredSource(subsources=self.subsources, + base_indent=self.base_indent, + indent_step=self.indent_step) + newgen.add(other) + return newgen + +class DeferredVal(object): + ''' + This is an object that serves as a proxy to an as-yet undetermined + object, which is only known at the time 
when either ``_set_val()`` + or ``_eval()`` is called. Any calls to methods listed in the class + attribute ``_deferred_method_dict`` are queued until then, at which + point the queued method calls are executed in order immediately on + the new object. + This class must be subclassed, and the class attribute + ``_deferred_method_dict`` must contain a mapping from defer-able method + names to either ``DeferredVal``, None (same as ``DeferredVal``), or a + subclass, which when instantiated, will be assigned (with ``_set_val()``) + the return value of the method. + There are two ways to set the proxied object. One is to set it + explicitly with ``_set_val(val)``. The other is to override the method + ``_evalbase()`` which should return the new object, and will be called + by ``_eval()``. + ''' + __unimpl = object() + _deferred_method_dict = None # must be set by subclass + + def __init__(self): + self._val_available = False + self._val = None + self._deferred_method_calls = [] + + def __repr__(self): + return self._repr(0) + + def _repr(self, indent): + indentstr = " " * indent + retstrs = [] + retstrs.append("%s" % (self.__class__,)) + for dmc in self._deferred_method_calls: + (name, args, kwargs, retval) = dmc + retstrs.append(" method %s" % (repr(name),)) + for arg in args: + if isinstance(arg, DeferredVal): + retstrs.append(" deferred arg (id=%s)" % (id(arg),)) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" arg %s" % (repr(arg),)) + for kwname, arg in kwargs.items(): + if isinstance(arg, DeferredVal): + retstrs.append(" deferred kwarg %s (id=%s)" % (repr(kwname), id(arg))) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" kwarg %s=%s" % (kwname, repr(arg),)) + retstrs.append(" deferred retval (id=%s)" % (id(retval),)) + return "\n".join([(indentstr + retstr) for retstr in retstrs]) + + def _set_val(self, val): + self._val = val + self._val_available = True + self._eval_methods() + return val + + def _evalbase(self): + raise NotImplementedError() + + def _eval_list(self, vals): + newvals = [] + for val in vals: + if isinstance(val, DeferredVal): + val = val._eval() + newvals.append(val) + return newvals + + def _eval_dict(self, valsdict): + newvalsdict = {} + for name, val in valsdict.items(): + if isinstance(val, DeferredVal): + val = val._eval() + newvalsdict[name] = val + return newvalsdict + + def _eval_methods(self): + assert(self._val_available) + val = self._val + for op in self._deferred_method_calls: + (methodname, methodargs, methodkwargs, deferredretval) = op + methodargs = self._eval_list(methodargs) + methodkwargs = self._eval_dict(methodkwargs) + retval = getattr(val, methodname)(*methodargs, **methodkwargs) + deferredretval._set_val(retval) + self._deferred_method_calls = [] + + def _eval(self): + if not self._val_available: + self._val = self._evalbase() + self._val_available = True + self._eval_methods() + return self._val + + def _get_deferred_func(self, _name, _retval): + def _deferred_func(*args, **kwargs): + if not self._val_available: + self._deferred_method_calls.append((_name, args, kwargs, _retval)) + return _retval + args = self._eval_list(args) + kwargs = self._eval_dict(kwargs) + return getattr(self._val, _name)(*newargs, **newkwargs) + _deferred_func.__name__ = _name + ".deferred" + return _deferred_func + + def __getattr__(self, name): + if self.__class__._deferred_method_dict is None: + raise Exception("DeferredVal must be subclassed and the class attribute _deferred_method_dict must be set to a valid dictionary!") + if 
self._val_available:
+            return getattr(self._val, name)
+        deferredclass = self.__class__._deferred_method_dict.get(name, self.__unimpl)
+        if deferredclass is not self.__unimpl:
+            if deferredclass is None:
+                deferredclass = DeferredVal
+            retval = deferredclass()
+            return self._get_deferred_func(name, retval)
+        raise AttributeError("no such attribute (yet): '%s'" % (name,))
+
+# we allow all math operators to be deferred
+_mathops = (
+        '__add__', '__sub__', '__mul__', '__floordiv__', '__mod__',
+        '__divmod__', '__pow__', '__lshift__', '__rshift__', '__and__',
+        '__xor__', '__or__', '__div__', '__truediv__', '__radd__', '__rsub__',
+        '__rmul__', '__rdiv__', '__rtruediv__', '__rfloordiv__', '__rmod__',
+        '__rdivmod__', '__rpow__', '__rlshift__', '__rrshift__', '__rand__',
+        '__rxor__', '__ror__', '__iadd__', '__isub__', '__imul__', '__idiv__',
+        '__itruediv__', '__ifloordiv__', '__imod__', '__ipow__', '__ilshift__',
+        '__irshift__', '__iand__', '__ixor__', '__ior__', '__pos__', '__abs__',
+        '__invert__', '__complex__', '__int__', '__long__', '__float__',
+        '__oct__', '__hex__', '__index__', '__coerce__')
+class DeferredNumeric(DeferredVal):
+    pass
+DeferredNumeric._deferred_method_dict = dict((x, DeferredNumeric)
+                                             for x in _mathops)
+def _get_deferred_attr_func(_name):
+    def _deferred_func(self, *args, **kwargs):
+        return self.__getattr__(_name)(*args, **kwargs)
+    _deferred_func.__name__ = _name
+    return _deferred_func
+for name in _mathops:
+    setattr(DeferredNumeric, name, _get_deferred_attr_func(name))
+
+class DeferredModuleVal(DeferredVal):
+    _deferred_method_dict = {}
+    def __init__(self, sourcemodule, methodstr, name):
+        super(DeferredModuleVal, self).__init__()
+        self._sourcemodule = sourcemodule
+        self._methodstr = methodstr
+        self._name = name
+
+    def _evalbase(self):
+        return getattr(self._sourcemodule.module, self._methodstr)(self._name)
+
+class DeferredTexRef(DeferredModuleVal):
+    _deferred_method_dict = {
+        "set_array": None,
+        "set_address": DeferredNumeric,
+        "set_address_2d": None,
+        "set_format": None,
+        "set_address_mode": None,
+        "set_flags": None,
+        "get_address": DeferredNumeric,
+        "get_flags": DeferredNumeric,
+    }
+
+class DeferredFunction(object):
+    '''
+    This class is a pseudo-replacement for ``pycuda.driver.Function``,
+    but takes a ``DeferredSourceModule`` and a function name as arguments,
+    and queues any call to ``prepare()`` until call-time, at which point
+    it calls out to the ``DeferredSourceModule`` object to create the
+    actual Function before preparing (if necessary) and calling the
+    underlying kernel.  NOTE: you may now send the actual ``GPUArrays``
+    as arguments, rather than their ``.gpudata`` members; this can be
+    helpful when creating kernels dynamically.
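+
+    A minimal sketch of the intended call pattern (the module, kernel,
+    and argument names are illustrative only)::
+
+        func = deferred_module.get_function("kernel")
+        func.prepare("PP" + numpy.dtype(numpy.uintp).char)
+        func.prepared_call(grid, block, x_gpu, y_gpu, n)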
+ ''' + def __init__(self, modulelazy, funcname): + self._modulelazy = modulelazy + self._funcname = funcname + self._prepare_args = None + + def get_unimplemented(_methodname): + def _unimplemented(self, _methodname=_methodname, *args, **kwargs): + raise NotImplementedError("%s does not implement method '%s'" % (type(self), _methodname,)) + return _unimplemented + + for meth_name in ["set_block_shape", "set_shared_size", + "param_set_size", "param_set", "param_seti", "param_setf", + "param_setv", "param_set_texref", + "launch", "launch_grid", "launch_grid_async"]: + setattr(self, meth_name, get_unimplemented(meth_name)) + + def _fix_texrefs(self, kwargs): + texrefs = kwargs.get('texrefs', None) + if texrefs is not None: + newtexrefs = [] + for texref in texrefs: + if isinstance(texref, DeferredVal): + texref = texref._eval() + newtexrefs.append(texref) + kwargs['texrefs'] = newtexrefs + + def __call__(self, *args, **kwargs): + func = self._modulelazy.create_function(self._funcname, args) + self._fix_texrefs(kwargs) + return func.__call__(*args, **kwargs) + + def param_set_texref(self, *args, **kwargs): + raise NotImplementedError() + + def prepare(self, *args, **kwargs): + self._prepare_args = (args, kwargs) + return self + + def _do_delayed_prepare(self, func): + if self._prepare_args is None: + raise Exception("prepared_*_call() requires that prepare() be called first") + (prepare_args, prepare_kwargs) = self._prepare_args + self._fix_texrefs(prepare_kwargs) + func.prepare(*prepare_args, **prepare_kwargs) + + def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): + grid = funcmethodargs[0] + block = funcmethodargs[1] + func = self._modulelazy._delayed_get_function(self._funcname, funcargs, grid, block) + self._do_delayed_prepare(func) + newfuncargs = [ getattr(arg, 'gpudata', arg) for arg in funcargs ] + fullargs = list(funcmethodargs) + fullargs.extend(newfuncargs) + return getattr(func, funcmethodstr)(*fullargs, **funckwargs) + + def prepared_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_call', (grid, block), args, kwargs) + + def prepared_timed_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_timed_call', (grid, block), args, kwargs) + + def prepared_async_call(self, grid, block, stream, *args, **kwargs): + return self._generic_prepared_call('prepared_async_call', (grid, block, stream), args, kwargs) + +@context_dependent_memoize +def _delayed_compile_aux(source, compileargs): + # re-convert any tuples to lists + newcompileargs = [] + for i, arg in enumerate(compileargs): + if isinstance(arg, tuple): + arg = list(arg) + newcompileargs.append(arg) + cubin = compile(source, *newcompileargs) + + from pycuda.driver import module_from_buffer + return module_from_buffer(cubin) + +class DeferredSourceModule(SourceModule): + ''' + This is an abstract specialization of SourceModule which allows the + delay of creating the actual kernel source until call-time, at which + point the actual arguments are available and their characteristics can + be used to create specific kernels. + To support this, ``get_function()`` returns a ``DeferredFunction`` + object which queues any calls to ``DeferredFunction.prepare()`` and + saves them until future calls to ``DeferredFunction.__call__()`` or + ``DeferredFunction.prepared_*_call()``. NOTE: you may now send actual + ``GPUArrays`` to these functions rather their ``.gpudata`` members; + this can be helpful when creating dynamic kernels. 
+ Likewise, ``get_global()``, ``get_texref()`` and ``get_surfref()`` + return proxy objects that can be stored by ``DeferredFunction.prepare()`` + and will only be evaluated at call-time. + This class must be subclassed and the function ``create_source(self, + grid, block, *args)`` must be overridden, returning the kernel source + (or ``DeferredSource`` object) that should be compiled. ``grid``, + ``block``, and ``*args`` are the same arguments that were sent to the + ``DeferredFunction`` call functions above. + The function ``create_key(self, grid, block, *args)`` is always + called before ``create_source`` and the key returned (if not None) is + used to cache any compiled functions. + ''' + _cache = {} + + def __init__(self, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._arch = arch + # tuples below are so _compileargs can be used as a hash key + if options is not None: + options = tuple(options) + include_dirs = tuple(include_dirs) + self._compileargs = (nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + def _delayed_compile(self, source): + self._check_arch(self._arch) + + self.module = _delayed_compile_aux(source, self._compileargs) + return self.module + + def create_key(self, grid, block, *funcargs): + return None + + def create_source(self, grid, block, *funcargs): + raise NotImplementedError("create_source must be overridden!") + + def _delayed_get_function(self, funcname, funcargs, grid, block): + ''' + If ``create_key()`` returns non-None, then it is used as the key + to cache compiled functions. Otherwise the return value of + ``create_source()`` is used as the key. + ''' + context = pycuda.driver.Context.get_current() + funccache = DeferredSourceModule._cache.get(context, None) + if funccache is None: + funccache = self._cache[context] = {} + key = self.create_key(grid, block, *funcargs) + funckey = (funcname, key) + if key is None or funckey not in funccache: + source = self.create_source(grid, block, *funcargs) + if isinstance(source, DeferredSource): + source = source.generate() + if key is None: + funckey = (funcname, source) + func = funccache.get(funckey, None) + if func is None: + module = self._delayed_compile(source) + func = module.get_function(funcname) + funccache[funckey] = func + return func + + def get_function(self, name): + return DeferredFunction(self, name) + + def get_global(self, name): + raise NotImplementedError("Deferred globals in element-wise kernels not supported yet") + + def get_texref(self, name): + return DeferredTexRef(self, 'get_texref', name) + + def get_surfref(self, name): + raise NotImplementedError("Deferred surfaces in element-wise kernels not supported yet") + diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index feab0a6b..f0ce0803 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -36,92 +36,413 @@ import numpy as np from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method +from pycuda.deferred import DeferredSourceModule, DeferredSource + +class ElementwiseSourceModule(DeferredSourceModule): + ''' + This is a ``DeferredSourceModule`` which is backwards-compatible with the + original ``get_elwise_module`` and ``get_elwise_range_module`` (using + ``do_range=True``). However, this class delays the compilation of + kernels until call-time. 
If you send actual ``GPUArray`` arguments + (instead of their ``.gpudata`` members) when calling the methods + supported by the return value of ``get_function()``, then you get: + * support for array-specific flat indices (i.e. for input array ``z``, + you can index it as ``z[z_i]`` in addition to the old-style ``z[i]``) + * support for non-contiguous (and arbitrarily-strided) arrays, but + only if you use the array-specific indices above. + Array-specific flat indices only really work if all the arrays using them + are the same shape. This shape is also used to optimize index + calculations. By default, the shape is taken from the first argument + that is specified as a pointer/array, but you can override this by + sending ``shape_arg_index=N`` where ``N`` is the zero-based index of the + kernel argument whose shape should be used. + ''' + def __init__(self, arguments, operation, + name="kernel", preamble="", loop_prep="", after_loop="", + do_range=False, shape_arg_index=None, + **compilekwargs): + super(ElementwiseSourceModule, self).__init__(**compilekwargs) + self._do_range = do_range + self._shape_arg_index = shape_arg_index + self._init_args = (arguments, operation, + name, preamble, loop_prep, after_loop) + + def create_key(self, grid, block, *args): + (arguments, operation, + funcname, preamble, loop_prep, after_loop) = self._init_args + shape_arg_index = self._shape_arg_index + + # 'args' is the list of actual parameters being sent to the kernel + # 'arguments' is the list of argument descriptors (VectorArg, ScalarArg) + + arraypairs = [] + contigmatch = True + arrayspecificinds = True + shape = None + size = None + order = None + for i, argpair in enumerate(zip(args, arguments)): + arg, arg_descr = argpair + if isinstance(arg_descr, VectorArg): + # is a GPUArray/DeviceAllocation + arraypairs.append(argpair) + if not arrayspecificinds: + continue + if not hasattr(arg, 'shape'): + # At least one array argument is probably sent as a + # GPUArray.gpudata rather than the GPUArray itself, + # so disable array-specific indices -- caller is on + # their own. + arrayspecificinds = False + continue + curshape = arg.shape + cursize = arg.size + curorder = 'N' + if arg.flags.f_contiguous: + curorder = 'F' + elif arg.flags.c_contiguous: + curorder = 'C' + if shape is None: + shape = curshape + size = cursize + order = curorder + elif curorder == 'N' or order != curorder: + contigmatch = False + elif shape_arg_index is None and shape != curshape: + raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index; found shapes %s and %s" % (shape, curshape)) + if shape_arg_index == i: + shape = curshape + + self._contigmatch = contigmatch + self._arraypairs = arraypairs + self._arrayspecificinds = arrayspecificinds + + key = repr(self._init_args) + + if contigmatch: + return key + + # Arrays are not contiguous or different order + + if grid[1] != 1 or block[1] != 1 or block[2] != 1: + raise Exception("Grid (%s) and block (%s) specifications should have all '1' except in the first element" % (grid, block)) + + ndim = len(shape) + numthreads = block[0] + shape = np.array(shape) + + # Use index of minimum stride in first array as a hint on how to + # order the traversal of dimensions. We could probably do something + # smarter, like tranposing/reshaping arrays if possible to maximize + # performance, but that is probably best done in a pre-processing step. 
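+        # (For example, a C-contiguous 3-D array has its smallest
+        # stride in the last dimension, so argmin picks index 2 and the
+        # dimensions are traversed in reverse; an F-contiguous array
+        # has its smallest stride first and keeps the forward order.)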
+ # Note that this could mess up custom indexing that assumes a + # particular traversal order, but in that case one should probably + # ensure that inputs have the same order, and explicitly send + # shape_arg_index to turn this off. + do_reverse = False + if (shape_arg_index is None and + np.argmin(np.abs(arraypairs[0][0].strides)) > ndim // 2): + print "traversing dimensions in reverse order" + # traverse dimensions in reverse order + do_reverse = True + if do_reverse: + shape = shape[::-1] + block_step = np.array(shape) + tmp = numthreads + for dimnum in range(ndim): + newstep = tmp % block_step[dimnum] + tmp = tmp // block_step[dimnum] + block_step[dimnum] = newstep + arrayarginfos = [] + for arg, arg_descr in arraypairs: + if do_reverse: + elemstrides = np.array(arg.strides[::-1]) // arg.itemsize + else: + elemstrides = np.array(arg.strides) // arg.itemsize + dimelemstrides = elemstrides * shape + blockelemstrides = elemstrides * block_step + arrayarginfos.append( + (arg_descr.name, tuple(elemstrides), tuple(dimelemstrides), tuple(blockelemstrides)) + ) + + self._arrayarginfos = arrayarginfos + self._ndim = ndim + self._numthreads = numthreads + self._shape = shape + self._block_step = block_step + + key = (self._init_args, grid, block, tuple(self._arrayarginfos)) + + return key + + def create_source(self, grid, block, *args): + # Precondition: create_key() must have been run with the same arguments + + (arguments, operation, + funcname, preamble, loop_prep, after_loop) = self._init_args + + contigmatch = self._contigmatch + + if contigmatch: + arraypairs = self._arraypairs + arrayspecificinds = self._arrayspecificinds + + indtype = 'unsigned' + if self._do_range: + indtype = 'long' + + # All arrays are contiguous and same order (or we don't know and + # it's up to the caller to make sure it works) + if arrayspecificinds: + for arg, arg_descr in arraypairs: + preamble = preamble + """ + #define %s_i i + """ % (arg_descr.name,) + if self._do_range: + loop_body = """ + if (step < 0) + { + for (i = start + (cta_start + tid)*step; + i > stop; i += total_threads*step) + { + %(operation)s; + } + } + else + { + for (i = start + (cta_start + tid)*step; + i < stop; i += total_threads*step) + { + %(operation)s; + } + } + """ % { + "operation": operation, + } + else: + loop_body = """ + for (i = cta_start + tid; i < n; i += total_threads) + { + %(operation)s; + } + """ % { + "operation": operation, + } + + return """ + #include + + %(preamble)s + + __global__ void %(name)s(%(arguments)s) + { + unsigned tid = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned cta_start = blockDim.x*blockIdx.x; + + %(indtype)s i; + + %(loop_prep)s; + + %(loop_body)s; + + %(after_loop)s; + } + """ % { + "arguments": ", ".join(arg.declarator() for arg in arguments), + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "loop_body": loop_body, + "indtype": indtype, + } + + # Arrays are not contiguous or different order + + arrayarginfos = self._arrayarginfos + ndim = self._ndim + numthreads = self._numthreads + shape = self._shape + block_step = self._block_step + + arraynames = [ x[0] for x in arrayarginfos ] + + defines = DeferredSource() + for dimnum in range(ndim): + defines += """ + #define SHAPE_%d %d + #define BLOCK_STEP_%d %d + """ % (dimnum, shape[dimnum], + dimnum, block_step[dimnum]) + for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + basename = "%s_%d" % (name, dimnum) + defines += """ + #define ELEMSTRIDE_%s_%d 
%d + #define DIMELEMSTRIDE_%s_%d %d + #define BLOCKELEMSTRIDE_%s_%d %d + """ % (name, dimnum, elemstrides[dimnum], + name, dimnum, dimelemstrides[dimnum], + name, dimnum, blockelemstrides[dimnum]) + + decls = DeferredSource() + decls += """ + unsigned GLOBAL_i = cta_start + tid; + """ + for name in arraynames: + decls += """ + long %s_i = 0; + """ % (name,) + for dimnum in range(ndim): + decls += """ + long INDEX_%d; + """ % (dimnum,) + + loop_inds_calc = DeferredSource() + loop_inds_calc += """ + unsigned int TMP_GLOBAL_i = GLOBAL_i; + """ + for dimnum in range(ndim): + loop_inds_calc += """ + INDEX_%d = TMP_GLOBAL_i %% SHAPE_%d; + TMP_GLOBAL_i = TMP_GLOBAL_i / SHAPE_%d; + """ % (dimnum, dimnum, + dimnum) + + for name in arraynames: + loop_inds_calc += """ + %s_i += INDEX_%d * ELEMSTRIDE_%s_%d; + """ % (name, dimnum, name, dimnum) + + loop_inds_inc = DeferredSource() + for dimnum in range(ndim): + loop_inds_inc += """ + INDEX_%d += BLOCK_STEP_%d; + """ % (dimnum, dimnum) + for name in arraynames: + loop_inds_inc += """ + %s_i += BLOCKELEMSTRIDE_%s_%d; + """ % (name, name, dimnum) + if dimnum < ndim - 1: + loop_inds_inc += """ + if (INDEX_%d > SHAPE_%d) { + """ % (dimnum, dimnum) + loop_inds_inc.indent() + loop_inds_inc += """ + INDEX_%d -= SHAPE_%d; + INDEX_%d ++; + """ % (dimnum, dimnum, + dimnum + 1) + for name in arraynames: + loop_inds_inc += """ + %s_i -= DIMELEMSTRIDE_%s_%d; + """ % (name, name, dimnum) + loop_inds_inc.dedent() + loop_inds_inc += """ + } + """ + + loop_body = DeferredSource() + if self._do_range: + loop_body.add(""" + if (step < 0) + { + for (/*void*/; GLOBAL_i > stop; GLOBAL_i += total_threads*step) + { + %(operation)s; + + %(loop_inds_inc)s; + } + } + else + { + for (/*void*/; GLOBAL_i < stop; GLOBAL_i += total_threads*step) + { + %(operation)s; + + %(loop_inds_inc)s; + } + } + """, format_dict={ + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) + else: + loop_body.add(""" + for (/*void*/; GLOBAL_i < n; GLOBAL_i += total_threads) + { + %(operation)s; + %(loop_inds_inc)s; + } + """, format_dict={ + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) -def get_elwise_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include + source = DeferredSource() - %(preamble)s + source.add(""" + #include + #include - __global__ void %(name)s(%(arguments)s) - { + %(defines)s - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - unsigned i; + %(preamble)s - %(loop_prep)s; + __global__ void %(name)s(%(arguments)s) + { - for (i = cta_start + tid; i < n; i += total_threads) - { - %(operation)s; - } + unsigned tid = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned cta_start = blockDim.x*blockIdx.x; - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) + %(decls)s + %(loop_prep)s; -def get_elwise_range_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include - - %(preamble)s - - __global__ void %(name)s(%(arguments)s) - { - unsigned tid = threadIdx.x; - unsigned total_threads = 
gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - long i; - - %(loop_prep)s; - - if (step < 0) - { - for (i = start + (cta_start + tid)*step; - i > stop; i += total_threads*step) - { - %(operation)s; - } - } - else - { - for (i = start + (cta_start + tid)*step; - i < stop; i += total_threads*step) - { - %(operation)s; + %(loop_inds_calc)s; + + %(loop_body)s; + + %(after_loop)s; } - } - - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) + """, format_dict={ + "arguments": ", ".join(arg.declarator() for arg in arguments), + "operation": operation, + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "defines": defines, + "decls": decls, + "loop_inds_calc": loop_inds_calc, + "loop_body": loop_body, + }) + + return source + +def get_elwise_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + shape_arg_index=None): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + shape_arg_index=shape_arg_index) + +def get_elwise_range_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + shape_arg_index=None): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + do_range=True, + shape_arg_index=shape_arg_index) def get_elwise_kernel_and_types(arguments, operation, name="kernel", keep=False, options=None, use_range=False, **kwargs): From be4014c5eb894897a31fea880a30a6e3c3cbe9b0 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 19 Feb 2018 12:24:20 -0500 Subject: [PATCH 03/25] Smarter _new_like_me that handles discontiguous input. Have copy() use it too. 
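
With order="K" (now the default), _new_like_me first flips any
negatively-strided axes, then tries to preserve the source array's
stride pattern in the new allocation, falling back to contiguous C or
F order when the original strides cannot be reproduced safely; copy()
now goes through the same path. A rough sketch of the intent (array
names and shapes are illustrative only):

    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    a = gpuarray.to_gpu(np.zeros((4, 4), np.float32))
    b = a[:, ::2]   # non-contiguous view
    c = b.copy()    # new array laid out like b, filled via _memcpy_discontig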
--- pycuda/gpuarray.py | 119 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 103 insertions(+), 16 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index bf9af239..cb78f5c6 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -246,13 +246,14 @@ def set_async(self, ary, stream=None): return self.set(ary, async=True, stream=stream) def get(self, ary=None, pagelocked=False, async=False, stream=None): + slicer = None if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) - strides = _compact_strides(self) + self, strides, slicer = _compact_positive_strides(self) ary = _as_strided(ary, strides=strides) else: if self.size != ary.size: @@ -269,13 +270,15 @@ def get(self, ary=None, pagelocked=False, async=False, stream=None): if self.size: _memcpy_discontig(ary, self, async=async, stream=stream) + if slicer: + ary = ary[slicer] return ary def get_async(self, stream=None, ary=None): return self.get(ary=ary, async=True, stream=stream) - def copy(self): - new = GPUArray(self.shape, self.dtype, self.allocator) + def copy(self, order="K"): + new = self._new_like_me(order=order) _memcpy_discontig(new, self) return new @@ -375,15 +378,52 @@ def _div(self, other, out, stream=None): return out - def _new_like_me(self, dtype=None, order="C"): - strides = None + def _new_like_me(self, dtype=None, order="K"): + slicer, selflist = _flip_negative_strides((self,)) + self = selflist[0] + ndim = self.ndim + shape = self.shape + mystrides = self.strides + mydtype = self.dtype + myitemsize = mydtype.itemsize if dtype is None: - dtype = self.dtype - if dtype == self.dtype: - strides = self.strides - - return self.__class__(self.shape, dtype, - allocator=self.allocator, strides=strides, order=order) + dtype = mydtype + else: + dtype = np.dtype(dtype) + itemsize = dtype.itemsize + if order == "K": + if self.flags.c_contiguous: + order = "C" + elif self.flags.f_contiguous: + order = "F" + if order == "C": + newstrides = _c_contiguous_strides(itemsize, shape) + elif order == "F": + newstrides = _f_contiguous_strides(itemsize, shape) + else: + maxstride = mystrides[0] + maxstrideind = 0 + for i in range(1, ndim): + curstride = mystrides[i] + if curstride > maxstride: + maxstrideind = i + maxstride = curstride + mymaxoffset = (maxstride / myitemsize) * shape[maxstrideind] + if mymaxoffset <= self.size: + # it's probably safe to just allocate and pass strides + # XXX (do we need to calculate full offset for [-1,-1,-1...]?) 
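+                # scale the existing strides from the old itemsize
+                # to the new dtype's itemsize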
+ newstrides = tuple((x // myitemsize) * itemsize for x in mystrides) + else: + # just punt and choose something close + if ndim > 1 and maxstrideind == 0: + newstrides = _c_contiguous_strides(itemsize, shape) + else: + newstrides = _f_contiguous_strides(itemsize, shape) + retval = self.__class__(shape, dtype, + allocator=self.allocator, strides=newstrides) + if slicer: + retval = retval[slicer] + return retval # operators --------------------------------------------------------------- def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): @@ -1012,15 +1052,21 @@ def conj(self): def to_gpu(ary, allocator=drv.mem_alloc): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, slicer = _compact_positive_strides(ary) + result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set(ary) + if slicer: + result = result[slicer] return result def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, slicer = _compact_positive_strides(ary) + result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set_async(ary, stream) + if slicer: + result = result[slicer] return result @@ -1180,8 +1226,41 @@ class Info(Record): # }}} -def _compact_strides(a): - # Compute strides to have same order as self, but packed +def _flip_negative_strides(arrays): + # If arrays have negative strides, flip them. Return a list + # ``(slicer, arrays)`` where ``slicer`` is a tuple of slice objects + # used to flip the arrays (or ``None`` if there was no flipping), + # and ``arrays`` is the list of flipped arrays. + # NOTE: Every input array A must have the same value for the following + # expression: np.sign(A.strides) + # NOTE: ``slicer`` is its own inverse, so ``A[slicer][slicer] == A`` + if isinstance(arrays, GPUArray): + raise TypeError("_flip_negative_strides expects a list of GPUArrays") + slicer = None + ndim = arrays[0].ndim + shape = arrays[0].shape + for t in zip(range(ndim), *[np.sign(x.strides) for x in arrays]): + axis = t[0] + stride_sign = t[1] + if len(arrays) > 1: + if not np.all(t[2:] == stride_sign): + raise ValueError("found differing signs in dimension %d: %s" % (axis, t[1:])) + if stride_sign == -1: + if slicer is None: + slicer = [slice(None)] * ndim + slicer[axis] = slice(None, None, -1) + if slicer is not None: + slicer = tuple(slicer) + arrays = [x[slicer] for x in arrays] + return slicer, arrays + + +def _compact_positive_strides(a): + # Flip ``a``'s axes if there are any negative strides, then compute + # strides to have same order as a, but packed. Return flipped ``a`` + # and packed strides. + slicer, alist = _flip_negative_strides((a,)) + a = alist[0] info = sorted( (a.strides[axis], a.shape[axis], axis) for axis in range(len(a.shape))) @@ -1191,7 +1270,7 @@ def _compact_strides(a): for _, dim, axis in info: strides[axis] = stride stride *= dim - return strides + return a, strides, slicer def _memcpy_discontig(dst, src, async=False, stream=None): @@ -1216,6 +1295,14 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] 
= src return + dst_gpudata = 0 + src_gpudata = 0 + if isinstance(src, GPUArray): + src_gpudata = src.gpudata + if isinstance(dst, GPUArray): + dst_gpudata = dst.gpudata + src, dst = _flip_negative_strides((src, dst))[1] + if src.flags.forc and dst.flags.forc: shape = [src.size] src_strides = dst_strides = [src.dtype.itemsize] From afc32518a26e142ccb69aa8062b93f1f12946c9c Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 10:36:31 -0500 Subject: [PATCH 04/25] Make sure key is hashable. --- pycuda/deferred.py | 7 +++++-- pycuda/elementwise.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 9699daa2..f8436702 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -417,8 +417,11 @@ class DeferredSourceModule(SourceModule): ``block``, and ``*args`` are the same arguments that were sent to the ``DeferredFunction`` call functions above. The function ``create_key(self, grid, block, *args)`` is always - called before ``create_source`` and the key returned (if not None) is - used to cache any compiled functions. + called before ``create_source`` and the key returned is used to cache + any compiled functions. Default return value of ``create_key()`` is + None, which means to use the function name and generated source as the + key. The return value of ``create_key()`` must be usable as a hash + key. ''' _cache = {} diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index f0ce0803..d0c559f7 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -64,7 +64,7 @@ def __init__(self, arguments, operation, super(ElementwiseSourceModule, self).__init__(**compilekwargs) self._do_range = do_range self._shape_arg_index = shape_arg_index - self._init_args = (arguments, operation, + self._init_args = (tuple(arguments), operation, name, preamble, loop_prep, after_loop) def create_key(self, grid, block, *args): From 9cb80f75537ae699c1b77445d96efc53a873c583 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 11:24:01 -0500 Subject: [PATCH 05/25] Allow existing kernel calls to use non-contiguous arrays by sending actual GPUArrays and using ARRAY_i indices (rather than just i). 
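
Kernel bodies switch from the shared flat index to per-array indices;
for example, the axpbyz operation changes from

    z[i] = a*x[i] + b*y[i]

to

    z[z_i] = a*x[x_i] + b*y[y_i]

and call sites now pass the GPUArrays themselves (e.g. self rather
than self.gpudata) so that ElementwiseSourceModule can generate
stride-aware index arithmetic for non-contiguous inputs.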
--- pycuda/cumath.py | 10 +-- pycuda/curandom.py | 2 +- pycuda/elementwise.py | 73 ++++++++++----------- pycuda/gpuarray.py | 146 +++++++++++++++++++++--------------------- 4 files changed, 116 insertions(+), 115 deletions(-) diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd6..1a8dcb5b 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -42,7 +42,7 @@ def f(array, stream_or_out=None, **kwargs): func = elementwise.get_unary_func_kernel(func_name, array.dtype) func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + array, out, array.mem_size) return out return f @@ -77,7 +77,7 @@ def fmod(arg, mod, stream=None): func = elementwise.get_fmod_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + arg, mod, result, arg.mem_size) return result @@ -94,7 +94,7 @@ def frexp(arg, stream=None): func = elementwise.get_frexp_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + arg, sig, expt, arg.mem_size) return sig, expt @@ -111,7 +111,7 @@ def ldexp(significand, exponent, stream=None): func = elementwise.get_ldexp_kernel() func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, + significand, exponent, result, significand.mem_size) return result @@ -129,7 +129,7 @@ def modf(arg, stream=None): func = elementwise.get_modf_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg, intpart, fracpart, arg.mem_size) return fracpart, intpart diff --git a/pycuda/curandom.py b/pycuda/curandom.py index e1c68428..d31ee01a 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -240,7 +240,7 @@ def rand(shape, dtype=np.float32, stream=None): raise NotImplementedError; func.prepared_async_call(result._grid, result._block, stream, - result.gpudata, np.random.randint(2**31-1), result.size) + result, np.random.randint(2**31-1), result.size) return result diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index d0c559f7..6471af65 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -117,9 +117,8 @@ def create_key(self, grid, block, *args): self._arraypairs = arraypairs self._arrayspecificinds = arrayspecificinds - key = repr(self._init_args) - if contigmatch: + key = repr(self._init_args) return key # Arrays are not contiguous or different order @@ -526,7 +525,7 @@ def __call__(self, *args, **kwargs): "deal with non-contiguous arrays") vectors.append(arg) - invocation_args.append(arg.gpudata) + invocation_args.append(arg) else: invocation_args.append(arg) @@ -573,9 +572,9 @@ def get_take_kernel(dtype, idx_dtype, vec_count=1): "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) for i in range(vec_count)) body = ( - ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + ("%(idx_tp)s src_idx = idx[idx_i];\n" % ctx) + "\n".join( - "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) + "dest%d[dest%d_i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i, i) for i in range(vec_count))) mod = get_elwise_module(args, body, "take", preamble=preamble) @@ -618,11 +617,12 @@ def get_copy_insn(i): return ("dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)) - body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + body = (("%(idx_tp)s src_idx = gmem_src_idx[gmem_src_idx_i];\n" + "%(idx_tp)s 
dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx) + "\n".join(get_copy_insn(i) for i in range(vec_count))) - mod = get_elwise_module(args, body, "take_put", preamble=preamble) + mod = get_elwise_module(args, body, "take_put", + preamble=preamble, shape_arg_index=0) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] @@ -652,11 +652,12 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1): ] + [ScalarArg(np.intp, "n")] body = ( - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) + "%(idx_tp)s dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx + + "\n".join("dest%d[dest_idx] = src%d[src%d_i];" % (i, i, i) for i in range(vec_count))) - func = get_elwise_module(args, body, "put").get_function("put") + func = get_elwise_module(args, body, "put", + shape_arg_index=0).get_function("put") func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) return func @@ -668,7 +669,7 @@ def get_copy_kernel(dtype_dest, dtype_src): "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, - "dest[i] = src[i]", + "dest[dest_i] = src[src_i]", "copy") @@ -700,13 +701,13 @@ def get_linear_combination_kernel(summand_descriptors, args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) - summands.append("a%d*x%d[i]" % (i, i)) + summands.append("a%d*x%d[x%d_i]" % (i, i, i)) args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, - "z[i] = " + " + ".join(summands), + "z[z_i] = " + " + ".join(summands), "linear_combination", preamble="\n".join(preamble), loop_prep=";\n".join(loop_prep)) @@ -727,7 +728,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = a*x[i] + b*y[i]", + "z[z_i] = a*x[x_i] + b*y[y_i]", "axpbyz") @@ -738,7 +739,7 @@ def get_axpbz_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z) }, - "z[i] = a * x[i] + b", + "z[z_i] = a * x[x_i] + b", "axpb") @@ -750,7 +751,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = x[i] %s y[i]" % operator, + "z[z_i] = x[x_i] %s y[y_i]" % operator, "multiply") @@ -761,7 +762,7 @@ def get_rdivide_elwise_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = y / x[i]", + "z[z_i] = y / x[x_i]", "divide_r") @@ -773,7 +774,7 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, func+"_kernel") @@ -785,7 +786,7 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y)" % func, + "z[z_i] = %s(x[x_i], y)" % func, func+"_kernel") @@ -809,7 +810,7 @@ def get_fill_kernel(dtype): "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = a", + "z[z_i] = a", "fill") @@ -819,7 +820,7 @@ def get_reverse_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = y[n-1-i]", + "z[z_i] = y[n-1-y_i]", "reverse") @@ -830,7 +831,7 @@ def get_real_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = real(y[i])", + "z[z_i] = real(y[y_i])", "real") @@ -841,7 +842,7 @@ def 
get_imag_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = imag(y[i])", + "z[z_i] = imag(y[y_i])", "imag") @@ -851,7 +852,7 @@ def get_conj_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = pycuda::conj(y[i])", + "z[z_i] = pycuda::conj(y[y_i])", "conj") @@ -861,7 +862,7 @@ def get_arange_kernel(dtype): "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = start + i*step", + "z[z_i] = start + z_i*step", "arange") @@ -876,7 +877,7 @@ def get_pow_kernel(dtype): "%(tp)s value, %(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = %s(y[i], value)" % func, + "z[z_i] = %s(y[y_i], value)" % func, "pow_method") @@ -893,7 +894,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, "pow_method") @@ -901,7 +902,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): def get_fmod_kernel(): return get_elwise_kernel( "float *arg, float *mod, float *z", - "z[i] = fmod(arg[i], mod[i])", + "z[z_i] = fmod(arg[arg_i], mod[mod_i])", "fmod_kernel") @@ -909,7 +910,7 @@ def get_fmod_kernel(): def get_modf_kernel(): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", - "fracpart[i] = modf(x[i], &intpart[i])", + "fracpart[fracpart_i] = modf(x[x_i], &intpart[intpart_i])", "modf_kernel") @@ -919,8 +920,8 @@ def get_frexp_kernel(): "float *x, float *significand, float *exponent", """ int expt = 0; - significand[i] = frexp(x[i], &expt); - exponent[i] = expt; + significand[significand_i] = frexp(x[x_i], &expt); + exponent[exponent_i] = expt; """, "frexp_kernel") @@ -929,7 +930,7 @@ def get_frexp_kernel(): def get_ldexp_kernel(): return get_elwise_kernel( "float *sig, float *expt, float *z", - "z[i] = ldexp(sig[i], int(expt[i]))", + "z[z_i] = ldexp(sig[sig_i], int(expt[expt_i]))", "ldexp_kernel") @@ -943,7 +944,7 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, - "z[i] = %s(y[i])" % func_name, + "z[z_i] = %s(y[y_i])" % func_name, "%s_kernel" % func_name) @@ -955,7 +956,7 @@ def get_if_positive_kernel(crit_dtype, dtype): VectorArg(dtype, "else_"), VectorArg(dtype, "result"), ], - "result[i] = crit[i] > 0 ? then_[i] : else_[i]", + "result[result_i] = crit[crit_i] > 0 ? 
then_[then__i] : else_[else__i]", "if_positive") @@ -967,5 +968,5 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_a": dtype_to_ctype(dtype_x), }, - "y[i] = x[i] %s a" % operator, + "y[y_i] = x[x_i] %s a" % operator, "scalarop_kernel") diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index cb78f5c6..b4d72e2b 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -134,7 +134,7 @@ def func(self, other): self.dtype, other.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result @@ -143,7 +143,7 @@ def func(self, other): func = elementwise.get_scalar_op_kernel( self.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, + self, other, result, self.mem_size) return result @@ -300,47 +300,47 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc or not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: add_timer(3*self.size, func.prepared_timed_call(self._grid, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size)) + selffac, self, otherfac, other, + out, self.mem_size)) else: func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size) + selffac, self, otherfac, other, + out, self.mem_size) return out def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, - other, out.gpudata, self.mem_size) + selffac, self, + other, out, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out @@ -350,31 +350,31 @@ def _rdiv_scalar(self, other, out, stream=None): y = n / self """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, 
self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc or not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out @@ -554,7 +554,7 @@ def fill(self, value, stream=None): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + value, self, self.mem_size) return self @@ -640,7 +640,7 @@ def __abs__(self): out_dtype=out_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + self, result, self.mem_size) return result @@ -651,9 +651,9 @@ def _pow(self, other, new): """ if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc or not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") assert self.shape == other.shape @@ -666,14 +666,14 @@ def _pow(self, other, new): self.dtype, other.dtype, result.dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result else: - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if new: result = self._new_like_me() @@ -681,7 +681,7 @@ def _pow(self, other, new): result = self func = elementwise.get_pow_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, + other, self, result, self.mem_size) return result @@ -713,23 +713,23 @@ def reverse(self, stream=None): as one-dimensional. 
""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result def astype(self, dtype, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if dtype == self.dtype: return self.copy() @@ -738,7 +738,7 @@ def astype(self, dtype, stream=None): func = elementwise.get_copy_kernel(dtype, self.dtype) func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, + result, self, self.mem_size) return result @@ -750,9 +750,9 @@ def reshape(self, *shape, **kwargs): order = kwargs.pop("order", "C") # TODO: add more error-checking, perhaps - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) @@ -978,7 +978,7 @@ def real(self): func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -989,9 +989,9 @@ def real(self): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) @@ -1003,7 +1003,7 @@ def imag(self): func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -1013,9 +1013,9 @@ def imag(self): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if self.flags.f_contiguous: order = "F" @@ -1025,7 +1025,7 @@ def conj(self): func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -1219,7 +1219,7 @@ class Info(Record): func = elementwise.get_arange_kernel(dtype) func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size) + result, start, step, size) return result @@ -1244,7 +1244,7 @@ def _flip_negative_strides(arrays): stride_sign = t[1] if len(arrays) > 1: if not np.all(t[2:] == stride_sign): - raise ValueError("found differing signs in dimension %d: %s" % (axis, t[1:])) + raise ValueError("found differing signs in strides for 
dimension %d (strides for all arrays: %s)" % (axis, [x.strides for x in arrays])) if stride_sign == -1: if slicer is None: slicer = [slice(None)] * ndim @@ -1431,7 +1431,7 @@ def take(a, indices, out=None, stream=None): a.bind_to_texref_ext(tex_src[0], allow_double_hack=True, allow_complex_hack=True) func.prepared_async_call(out._grid, out._block, stream, - indices.gpudata, out.gpudata, indices.size) + indices, out, indices.size) return out @@ -1473,8 +1473,8 @@ def make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(tex_src[i], allow_double_hack=True) func.prepared_async_call(indices._grid, indices._block, stream, - indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + indices, + *([o for o in out[chunk_slice]] + [indices.size])) return out @@ -1538,8 +1538,8 @@ def make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(src_tr, allow_double_hack=True) func.prepared_async_call(src_indices._grid, src_indices._block, stream, - dest_indices.gpudata, src_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + dest_indices, src_indices, + *([o for o in out[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size])) @@ -1583,9 +1583,9 @@ def make_func_for_chunk_size(chunk_size): func = make_func_for_chunk_size(vec_count-start_i) func.prepared_async_call(dest_indices._grid, dest_indices._block, stream, - dest_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [i.gpudata for i in arrays[chunk_slice]] + dest_indices, + *([o for o in out[chunk_slice]] + + [i for i in arrays[chunk_slice]] + [dest_indices.size])) return out @@ -1637,7 +1637,7 @@ def if_positive(criterion, then_, else_, out=None, stream=None): out = empty_like(then_) func.prepared_async_call(criterion._grid, criterion._block, stream, - criterion.gpudata, then_.gpudata, else_.gpudata, out.gpudata, + criterion, then_, else_, out, criterion.size) return out @@ -1652,14 +1652,14 @@ def f(a, b, out=None, stream=None): a.dtype, b.dtype, out.dtype, use_scalar=False) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b.gpudata, out.gpudata, a.size) + a, b, out, a.size) elif isinstance(a, GPUArray): if out is None: out = empty_like(a) func = elementwise.get_binary_minmax_kernel(which, a.dtype, a.dtype, out.dtype, use_scalar=True) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b, out.gpudata, a.size) + a, b, out, a.size) else: # assuming b is a GPUArray if out is None: out = empty_like(b) @@ -1667,7 +1667,7 @@ def f(a, b, out=None, stream=None): b.dtype, b.dtype, out.dtype, use_scalar=True) # NOTE: we switch the order of a and b here! func.prepared_async_call(b._grid, b._block, stream, - b.gpudata, a, out.gpudata, b.size) + b, a, out, b.size) return out return f From b38c75fec1c9ad01307e1ea1e91dac2920a499d5 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 13:44:34 -0500 Subject: [PATCH 06/25] Fix variable names. 
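Before this fix, `_deferred_func` evaluated any deferred arguments into `args` and `kwargs`, but then dispatched the real call with the stale names `newargs`/`newkwargs`, which are never defined in that scope, so every replayed method call died with a NameError. A simplified sketch of the intended dispatch pattern (illustrative names, not the full DeferredVal machinery):

    class _Deferred(object):
        def __init__(self):
            self._val = None              # the real object, once known

        def _eval(self):
            return self._val

        def _make_method(self, name):
            def _deferred_func(*args, **kwargs):
                # Arguments may themselves be deferred and must be
                # evaluated first...
                args = [a._eval() if isinstance(a, _Deferred) else a
                        for a in args]
                kwargs = dict((k, v._eval() if isinstance(v, _Deferred) else v)
                              for k, v in kwargs.items())
                # ...then the dispatch has to use the *evaluated* names
                # (the old code referenced undefined 'newargs'/'newkwargs').
                return getattr(self._val, name)(*args, **kwargs)
            return _deferred_func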
--- pycuda/deferred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index f8436702..104ca60b 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -244,7 +244,7 @@ def _deferred_func(*args, **kwargs): return _retval args = self._eval_list(args) kwargs = self._eval_dict(kwargs) - return getattr(self._val, _name)(*newargs, **newkwargs) + return getattr(self._val, _name)(*args, **kwargs) _deferred_func.__name__ = _name + ".deferred" return _deferred_func From 1f6486ba7afbdef52d2752bf85f509646c98349c Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 13:45:16 -0500 Subject: [PATCH 07/25] Non-contiguous is OK now. --- pycuda/gpuarray.py | 72 ++-------------------------------------------- 1 file changed, 3 insertions(+), 69 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index b4d72e2b..e2bb3e54 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -118,17 +118,9 @@ def splay(n, dev=None): def _make_binary_op(operator): def func(self, other): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if isinstance(other, GPUArray): assert self.shape == other.shape - if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, @@ -300,9 +292,6 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - #if not self.flags.forc or not other.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) @@ -320,10 +309,6 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, selffac, self, @@ -332,10 +317,6 @@ def _axpbz(self, selffac, other, out, stream=None): return out def _elwise_multiply(self, other, out, stream=None): - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, @@ -350,10 +331,6 @@ def _rdiv_scalar(self, other, out, stream=None): y = n / self """ - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, self, other, @@ -364,10 +341,6 @@ def _rdiv_scalar(self, other, out, stream=None): def _div(self, other, out, stream=None): """Divides an array by another array.""" - #if not self.flags.forc or not other.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, 
other.dtype, @@ -651,10 +624,6 @@ def _pow(self, other, new): """ if isinstance(other, GPUArray): - #if not self.flags.forc or not other.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - assert self.shape == other.shape if new: @@ -671,10 +640,6 @@ def _pow(self, other, new): return result else: - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - if new: result = self._new_like_me() else: @@ -713,10 +678,6 @@ def reverse(self, stream=None): as one-dimensional. """ - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) @@ -727,10 +688,6 @@ def reverse(self, stream=None): return result def astype(self, dtype, stream=None): - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - if dtype == self.dtype: return self.copy() @@ -750,9 +707,6 @@ def reshape(self, *shape, **kwargs): order = kwargs.pop("order", "C") # TODO: add more error-checking, perhaps - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) @@ -970,11 +924,7 @@ def real(self): if issubclass(dtype.type, np.complexfloating): from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, @@ -989,17 +939,9 @@ def real(self): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, @@ -1013,15 +955,7 @@ def imag(self): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(order=order) + result = self._new_like_me() func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, From edcc44af0dd5d8bc65ffe5672ee9c0e63438162c Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 14:36:40 -0500 Subject: [PATCH 08/25] Forgot to remove non-contiguity check. 
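The contiguity check in ElementwiseKernel.__call__ predates the stride-aware code generation in ElementwiseSourceModule and is now dead weight: the generated kernels index every vector argument through per-dimension element strides instead of assuming one flat contiguous run. The mapping the kernel performs with its INDEX_d / ELEMSTRIDE_name_d variables is roughly the following (NumPy sketch for illustration, not the generated code itself):

    import numpy as np

    def element_offset(flat_i, shape, strides, itemsize):
        # Decompose a flat C-order element index into per-dimension
        # indices, then weight each index by that array's element
        # stride (byte stride divided by itemsize).
        elemstrides = [s // itemsize for s in strides]
        offset = 0
        for dim in reversed(range(len(shape))):
            flat_i, idx = divmod(flat_i, shape[dim])
            offset += idx * elemstrides[dim]
        return offset

    base = np.arange(12)                  # underlying buffer
    a = base.reshape(3, 4)[:, ::2]        # non-contiguous view, shape (3, 2)
    off = element_offset(5, a.shape, a.strides, a.itemsize)
    assert base[off] == a.flat[5]         # both are 10

Because all arguments are walked with the same logical index, arrays of different layouts (C order, Fortran order, sliced views) can be mixed in a single kernel call.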
--- pycuda/elementwise.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 6471af65..572e0f0b 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -520,10 +520,6 @@ def __call__(self, *args, **kwargs): for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("elementwise kernel cannot " - "deal with non-contiguous arrays") - vectors.append(arg) invocation_args.append(arg) else: From e23c943366db0e1487be3d3cd0e77a183acae42a Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 15:26:47 -0500 Subject: [PATCH 09/25] Allow setting scalars. --- pycuda/gpuarray.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index e2bb3e54..bf094dde 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -912,7 +912,11 @@ def __getitem__(self, index): strides=tuple(new_strides)) def __setitem__(self, index, value): - _memcpy_discontig(self[index], value) + if isinstance(value, GPUArray) or isinstance(value, np.ndarray): + return _memcpy_discontig(self[index], value) + + # Let's assume it's a scalar + self[index].fill(value) # }}} @@ -1229,12 +1233,6 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] = src return - dst_gpudata = 0 - src_gpudata = 0 - if isinstance(src, GPUArray): - src_gpudata = src.gpudata - if isinstance(dst, GPUArray): - dst_gpudata = dst.gpudata src, dst = _flip_negative_strides((src, dst))[1] if src.flags.forc and dst.flags.forc: From 9cfaf9736e05a2637b3795820f55ebbc7bba9fdd Mon Sep 17 00:00:00 2001 From: "Gregory R. Lee" Date: Wed, 6 Dec 2017 17:38:14 -0500 Subject: [PATCH 10/25] fix: update signature of gpuarray.reshape to match the GPUArray method --- pycuda/gpuarray.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index bf094dde..a714f572 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -671,7 +671,7 @@ def __ipow__(self, other): """ return self._pow(other,new=False) - + def reverse(self, stream=None): """Return this array in reversed order. The array is treated @@ -1542,13 +1542,13 @@ def transpose(a, axes=None): return a.transpose(axes) -def reshape(a, shape): +def reshape(a, *shape, **kwargs): """Gives a new shape to an array without changing its data. .. versionadded:: 2015.2 """ - return a.reshape(shape) + return a.reshape(*shape, **kwargs) # }}} From 080ec5973765ca3156a1c7d83877d1f3758e6739 Mon Sep 17 00:00:00 2001 From: Emanuel Rietveld Date: Tue, 20 Feb 2018 08:04:30 +0900 Subject: [PATCH 11/25] Add get_texref() to ElementwiseKernel --- pycuda/elementwise.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 572e0f0b..563612b5 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -469,7 +469,7 @@ def get_elwise_kernel_and_types(arguments, operation, func = mod.get_function(name) func.prepare("".join(arg.struct_char for arg in arguments)) - return func, arguments + return mod, func, arguments def get_elwise_kernel(arguments, operation, @@ -477,7 +477,7 @@ def get_elwise_kernel(arguments, operation, """Return a L{pycuda.driver.Function} that performs the same scalar operation on one or several vectors. 
""" - func, arguments = get_elwise_kernel_and_types( + mod, func, arguments = get_elwise_kernel_and_types( arguments, operation, name, keep, options, **kwargs) return func @@ -491,9 +491,13 @@ def __init__(self, arguments, operation, self.gen_kwargs.update(dict(keep=keep, options=options, name=name, operation=operation, arguments=arguments)) + def get_texref(self, name, use_range=False): + mod, knl, arguments = self.generate_stride_kernel_and_types(use_range=use_range) + return mod.get_texref(name) + @memoize_method def generate_stride_kernel_and_types(self, use_range): - knl, arguments = get_elwise_kernel_and_types(use_range=use_range, + mod, knl, arguments = get_elwise_kernel_and_types(use_range=use_range, **self.gen_kwargs) assert [i for i, arg in enumerate(arguments) @@ -501,7 +505,7 @@ def generate_stride_kernel_and_types(self, use_range): "ElementwiseKernel can only be used with functions that " \ "have at least one vector argument" - return knl, arguments + return mod, knl, arguments def __call__(self, *args, **kwargs): vectors = [] @@ -515,7 +519,7 @@ def __call__(self, *args, **kwargs): + ", ".join(six.iterkeys(kwargs))) invocation_args = [] - func, arguments = self.generate_stride_kernel_and_types( + mod, func, arguments = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None) for arg, arg_descr in zip(args, arguments): From a7cb9829ae5bae730f789dbef3aaffd17623504e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 26 Feb 2018 23:46:52 -0600 Subject: [PATCH 12/25] Update bpl-subset, possibly including pypy support --- bpl-subset | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bpl-subset b/bpl-subset index e7c5f513..7ea98ed3 160000 --- a/bpl-subset +++ b/bpl-subset @@ -1 +1 @@ -Subproject commit e7c5f5131daca6298b5e8aa48d06e7ecffec2ffa +Subproject commit 7ea98ed300b63876f78746afda0d7240b478f355 From fb10ffd5b821ba553a83c153b6ebf51ddeab8db2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 26 Feb 2018 23:48:17 -0600 Subject: [PATCH 13/25] Make characterize.platform_bits work with Pypy (patch by Emanuel Rietveld) --- pycuda/characterize.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pycuda/characterize.py b/pycuda/characterize.py index 1c54af7d..2206e588 100644 --- a/pycuda/characterize.py +++ b/pycuda/characterize.py @@ -6,7 +6,11 @@ def platform_bits(): - return tuple.__itemsize__ * 8 + import sys + if sys.maxsize > 2**32: + return 64 + else: + return 32 def has_stack(): From 71ec966f8c97d09469ee7b735cbc8b4c0437d449 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 27 Feb 2018 00:31:12 -0600 Subject: [PATCH 14/25] Fix pytest script-based test invocation --- test/test_cumath.py | 2 +- test/test_driver.py | 2 +- test/test_gpuarray.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_cumath.py b/test/test_cumath.py index 874ccb64..272ab5ac 100644 --- a/test/test_cumath.py +++ b/test/test_cumath.py @@ -242,5 +242,5 @@ def test_unary_func_kwargs(self): if len(sys.argv) > 1: exec (sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) diff --git a/test/test_driver.py b/test/test_driver.py index f88a1d67..038cf64f 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -961,5 +961,5 @@ def test_import_pyopencl_before_pycuda(): if len(sys.argv) > 1: exec (sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 
93cc954f..e38e2fda 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1153,5 +1153,5 @@ def test_zeros_like_etc(self): if len(sys.argv) > 1: exec (sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) From 542cffa2bcfaa915770c2a1fd94071d1288656d7 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 22 Feb 2018 15:14:24 -0500 Subject: [PATCH 15/25] Fix DeferredFunction.__call__, and change modulelazy to deferredmod. --- pycuda/deferred.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 104ca60b..1c79ace0 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -319,8 +319,8 @@ class DeferredFunction(object): rather than their ``.gpudata`` members; this can be helpful to dynamically create kernels. ''' - def __init__(self, modulelazy, funcname): - self._modulelazy = modulelazy + def __init__(self, deferredmod, funcname): + self._deferredmod = deferredmod self._funcname = funcname self._prepare_args = None @@ -346,7 +346,7 @@ def _fix_texrefs(self, kwargs): kwargs['texrefs'] = newtexrefs def __call__(self, *args, **kwargs): - func = self._modulelazy.create_function(self._funcname, args) + func = self._deferredmod._delayed_get_function(self._funcname, args) self._fix_texrefs(kwargs) return func.__call__(*args, **kwargs) @@ -367,7 +367,7 @@ def _do_delayed_prepare(self, func): def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): grid = funcmethodargs[0] block = funcmethodargs[1] - func = self._modulelazy._delayed_get_function(self._funcname, funcargs, grid, block) + func = self._deferredmod._delayed_get_function(self._funcname, funcargs, grid, block) self._do_delayed_prepare(func) newfuncargs = [ getattr(arg, 'gpudata', arg) for arg in funcargs ] fullargs = list(funcmethodargs) From 8d278fbf85f46b825323823b825f3f30d3d21a40 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:34:39 -0500 Subject: [PATCH 16/25] Make sure 'texrefs' keyword arg is re-evaluated every time. 
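With deferred compilation, the compiled module (and therefore its texture references) can differ from one launch to the next, so a DeferredVal texref resolved once and memoized would keep pointing into whichever module happened to be compiled first. `_fix_texrefs` now resolves through `_evalbase()`, which bypasses the cache, and it copies the keyword dict before rewriting it so the caller's original 'texrefs' entries stay deferred for the next call; callers must therefore use the dict it returns rather than the one they passed in. The caching pitfall in miniature (hypothetical names, not the actual DeferredVal API):

    class DeferredTexRef(object):
        def __init__(self, resolve):
            self._resolve = resolve   # looks the texref up in the
            self._cached = None       # currently compiled module

        def _eval(self):
            # memoized: hands back a texref from a stale module if the
            # kernel was recompiled for new argument shapes
            if self._cached is None:
                self._cached = self._resolve()
            return self._cached

        def _evalbase(self):
            # uncached: safe to call once per launch
            return self._resolve()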
(Function may change based on kernel call arguments) --- pycuda/deferred.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 1c79ace0..2673e9db 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -338,16 +338,20 @@ def _unimplemented(self, _methodname=_methodname, *args, **kwargs): def _fix_texrefs(self, kwargs): texrefs = kwargs.get('texrefs', None) if texrefs is not None: + kwargs = kwargs.copy() newtexrefs = [] for texref in texrefs: if isinstance(texref, DeferredVal): - texref = texref._eval() + # don't use _eval() as the cached value may cause + # problems when this function is called again + texref = texref._evalbase() newtexrefs.append(texref) kwargs['texrefs'] = newtexrefs + return kwargs def __call__(self, *args, **kwargs): func = self._deferredmod._delayed_get_function(self._funcname, args) - self._fix_texrefs(kwargs) + kwargs = self._fix_texrefs(kwargs) return func.__call__(*args, **kwargs) def param_set_texref(self, *args, **kwargs): @@ -361,7 +365,7 @@ def _do_delayed_prepare(self, func): if self._prepare_args is None: raise Exception("prepared_*_call() requires that prepare() be called first") (prepare_args, prepare_kwargs) = self._prepare_args - self._fix_texrefs(prepare_kwargs) + kwargs = self._fix_texrefs(prepare_kwargs) func.prepare(*prepare_args, **prepare_kwargs) def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): From 5a4a2fae1fb23041a610fa8ebf7feb326636648b Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:37:17 -0500 Subject: [PATCH 17/25] Store module in cache too. --- pycuda/deferred.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 2673e9db..bdf3ffeb 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -470,12 +470,12 @@ def _delayed_get_function(self, funcname, funcargs, grid, block): source = source.generate() if key is None: funckey = (funcname, source) - func = funccache.get(funckey, None) - if func is None: + modfunc = funccache.get(funckey, None) + if modfunc is None: module = self._delayed_compile(source) func = module.get_function(funcname) - funccache[funckey] = func - return func + modfunc = funccache[funckey] = (module, func) + return modfunc[1] def get_function(self, name): return DeferredFunction(self, name) From 05a5400136adeb41c88cad8c3814639a49b89bc1 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:38:14 -0500 Subject: [PATCH 18/25] Send grid and block to _delayed_get_function --- pycuda/deferred.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index bdf3ffeb..c7ff9893 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -350,7 +350,11 @@ def _fix_texrefs(self, kwargs): return kwargs def __call__(self, *args, **kwargs): - func = self._deferredmod._delayed_get_function(self._funcname, args) + block = kwargs.get('block', None) + if block is None or not isinstance(block, tuple) or len(block) != 3: + raise ValueError("keyword argument 'block' is required, and must be a 3-tuple of integers") + grid = kwargs.get('grid', (1,1)) + func = self._deferredmod._delayed_get_function(self._funcname, args, grid, block) kwargs = self._fix_texrefs(kwargs) return func.__call__(*args, **kwargs) From 11108fee0d9a6b3c519861a08a9084132dccb653 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:38:43 -0500 Subject: [PATCH 19/25] Fix comment. 
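The DeferredVal docstring still named `_eval()` as the point where queued method calls are replayed; the replay actually happens when `_set_val()` or `_evalbase()` resolves the proxied object, `_eval()` being only the memoizing wrapper around `_evalbase()`. For illustration, assuming a method such as `append` were registered in `_deferred_method_dict`:

    dv = DeferredVal()
    dv.append(3)           # no real object yet: the call is queued
    dv._set_val([1, 2])    # queued calls replay; the list is now [1, 2, 3]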
--- pycuda/deferred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index c7ff9893..e0ca5258 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -147,7 +147,7 @@ class DeferredVal(object): ''' This is an object that serves as a proxy to an as-yet undetermined object, which is only known at the time when either ``_set_val()`` - or ``_eval()`` is called. Any calls to methods listed in the class + or ``_evalbase()`` is called. Any calls to methods listed in the class attribute ``_deferred_method_dict`` are queued until then, at which point the queued method calls are executed in order immediately on the new object. From 1a6228c6fa031e922b8ede9730662840522ef568 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:41:40 -0500 Subject: [PATCH 20/25] Add debug option to ElementwiseSourceModule. --- pycuda/elementwise.py | 71 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 563612b5..7d610034 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -60,12 +60,14 @@ class ElementwiseSourceModule(DeferredSourceModule): def __init__(self, arguments, operation, name="kernel", preamble="", loop_prep="", after_loop="", do_range=False, shape_arg_index=None, + debug=False, **compilekwargs): super(ElementwiseSourceModule, self).__init__(**compilekwargs) self._do_range = do_range self._shape_arg_index = shape_arg_index self._init_args = (tuple(arguments), operation, name, preamble, loop_prep, after_loop) + self._debug = debug def create_key(self, grid, block, *args): (arguments, operation, @@ -268,6 +270,12 @@ def create_source(self, grid, block, *args): arraynames = [ x[0] for x in arrayarginfos ] defines = DeferredSource() + decls = DeferredSource() + loop_preop = DeferredSource() + loop_inds_calc = DeferredSource() + loop_inds_inc = DeferredSource() + loop_body = DeferredSource() + for dimnum in range(ndim): defines += """ #define SHAPE_%d %d @@ -284,7 +292,6 @@ def create_source(self, grid, block, *args): name, dimnum, dimelemstrides[dimnum], name, dimnum, blockelemstrides[dimnum]) - decls = DeferredSource() decls += """ unsigned GLOBAL_i = cta_start + tid; """ @@ -297,7 +304,6 @@ def create_source(self, grid, block, *args): long INDEX_%d; """ % (dimnum,) - loop_inds_calc = DeferredSource() loop_inds_calc += """ unsigned int TMP_GLOBAL_i = GLOBAL_i; """ @@ -313,7 +319,6 @@ def create_source(self, grid, block, *args): %s_i += INDEX_%d * ELEMSTRIDE_%s_%d; """ % (name, dimnum, name, dimnum) - loop_inds_inc = DeferredSource() for dimnum in range(ndim): loop_inds_inc += """ INDEX_%d += BLOCK_STEP_%d; @@ -341,13 +346,57 @@ def create_source(self, grid, block, *args): } """ - loop_body = DeferredSource() + if self._debug: + preamble += """ + #include + """ + loop_inds_calc += """ + if (cta_start == 0 && tid == 0) { + """ + loop_inds_calc.indent() + loop_inds_calc += r""" + printf("=======================\n"); + printf("CALLING FUNC %s\n"); + printf("N=%%u\n", (unsigned int)n); + """ % (funcname,) + for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + loop_inds_calc += r""" + printf("(%s) %s: ptr=0x%%lx maxoffset(elems)=%s\n", (unsigned long)%s); + """ % (funcname, name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + loop_inds_calc.dedent() + loop_inds_calc += """ + } + """ + indtest = DeferredSource() + for name in arraynames: + indtest += r""" + if (%s_i > %s || %s_i < 0) 
{ + """ % (name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + indtest.indent() + indtest += r""" + printf("cta_start=%%d tid=%%d GLOBAL_i=%%d %s_i=%%d\n", cta_start, tid, GLOBAL_i, %s_i); + break; + """ % (name, name) + indtest.dedent() + indtest += """ + } + """ + loop_preop = indtest + loop_preop + after_loop += r""" + if (cta_start == 0 && tid == 0) { + printf("DONE CALLING FUNC %s\n"); + printf("-----------------------\n"); + } + """ % (funcname,) + if self._do_range: loop_body.add(""" if (step < 0) { for (/*void*/; GLOBAL_i > stop; GLOBAL_i += total_threads*step) { + %(loop_preop)s; + %(operation)s; %(loop_inds_inc)s; @@ -357,12 +406,15 @@ def create_source(self, grid, block, *args): { for (/*void*/; GLOBAL_i < stop; GLOBAL_i += total_threads*step) { + %(loop_preop)s; + %(operation)s; %(loop_inds_inc)s; } } """, format_dict={ + "loop_preop": loop_preop, "operation": operation, "loop_inds_inc": loop_inds_inc, }) @@ -370,11 +422,14 @@ def create_source(self, grid, block, *args): loop_body.add(""" for (/*void*/; GLOBAL_i < n; GLOBAL_i += total_threads) { + %(loop_preop)s; + %(operation)s; %(loop_inds_inc)s; } """, format_dict={ + "loop_preop": loop_preop, "operation": operation, "loop_inds_inc": loop_inds_inc, }) @@ -425,23 +480,23 @@ def create_source(self, grid, block, *args): def get_elwise_module(arguments, operation, name="kernel", keep=False, options=None, preamble="", loop_prep="", after_loop="", - shape_arg_index=None): + **kwargs): return ElementwiseSourceModule(arguments, operation, name=name, preamble=preamble, loop_prep=loop_prep, after_loop=after_loop, keep=keep, options=options, - shape_arg_index=shape_arg_index) + **kwargs) def get_elwise_range_module(arguments, operation, name="kernel", keep=False, options=None, preamble="", loop_prep="", after_loop="", - shape_arg_index=None): + **kwargs): return ElementwiseSourceModule(arguments, operation, name=name, preamble=preamble, loop_prep=loop_prep, after_loop=after_loop, keep=keep, options=options, do_range=True, - shape_arg_index=shape_arg_index) + **kwargs) def get_elwise_kernel_and_types(arguments, operation, name="kernel", keep=False, options=None, use_range=False, **kwargs): From b7436983b05bbf30343637890dc2f4065d963f5e Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:44:50 -0500 Subject: [PATCH 21/25] Fix index calculation (found using _debug!) --- pycuda/elementwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 7d610034..5732b2e0 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -339,8 +339,8 @@ def create_source(self, grid, block, *args): dimnum + 1) for name in arraynames: loop_inds_inc += """ - %s_i -= DIMELEMSTRIDE_%s_%d; - """ % (name, name, dimnum) + %s_i += ELEMSTRIDE_%s_%d - DIMELEMSTRIDE_%s_%d; + """ % (name, name, dimnum + 1, name, dimnum) loop_inds_inc.dedent() loop_inds_inc += """ } From 7e57a7286f9c9e28094087bddd416ac24a47dd0a Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:46:10 -0500 Subject: [PATCH 22/25] Add shape to the key (so it needs to remain a tuple). 
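The generated source bakes the logical shape into its SHAPE_d macros, but `create_key` both destroyed the hashable form (by overwriting `shape` with an ndarray) and left shape out of the key, so two calls agreeing on grid, block, and the per-array stride info could in principle be handed a kernel compiled for a different shape. The shape therefore stays a tuple (ndarrays are unhashable) and a separate `shapearr` carries the stride arithmetic. The hashability constraint is easy to demonstrate:

    import numpy as np

    cache = {}
    cache[((3, 4), (256, 1, 1))] = "compiled kernel"   # tuple key: fine
    try:
        cache[(np.array((3, 4)), (256, 1, 1))] = "oops"
    except TypeError as exc:
        print(exc)   # unhashable type: 'numpy.ndarray'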
--- pycuda/elementwise.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 5732b2e0..aca0d903 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -130,7 +130,6 @@ def create_key(self, grid, block, *args): ndim = len(shape) numthreads = block[0] - shape = np.array(shape) # Use index of minimum stride in first array as a hint on how to # order the traversal of dimensions. We could probably do something @@ -148,7 +147,8 @@ def create_key(self, grid, block, *args): do_reverse = True if do_reverse: shape = shape[::-1] - block_step = np.array(shape) + shapearr = np.array(shape) + block_step = np.array(shapearr) tmp = numthreads for dimnum in range(ndim): newstep = tmp % block_step[dimnum] @@ -160,7 +160,7 @@ def create_key(self, grid, block, *args): elemstrides = np.array(arg.strides[::-1]) // arg.itemsize else: elemstrides = np.array(arg.strides) // arg.itemsize - dimelemstrides = elemstrides * shape + dimelemstrides = elemstrides * shapearr blockelemstrides = elemstrides * block_step arrayarginfos.append( (arg_descr.name, tuple(elemstrides), tuple(dimelemstrides), tuple(blockelemstrides)) @@ -172,7 +172,7 @@ def create_key(self, grid, block, *args): self._shape = shape self._block_step = block_step - key = (self._init_args, grid, block, tuple(self._arrayarginfos)) + key = (self._init_args, grid, block, shape, tuple(self._arrayarginfos)) return key From c5de070fae8f1fa04ea82b81a78bde406aa72686 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:46:31 -0500 Subject: [PATCH 23/25] Remove unnecessary format key. --- pycuda/elementwise.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index aca0d903..7487f25c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -463,7 +463,6 @@ def create_source(self, grid, block, *args): } """, format_dict={ "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, "name": funcname, "preamble": preamble, "loop_prep": loop_prep, From 61bd9084e660b4f5894916f638a95d6db9a1e6d2 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:46:44 -0500 Subject: [PATCH 24/25] Fix kernel name. --- pycuda/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 7487f25c..ae158d36 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -806,7 +806,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_z": dtype_to_ctype(dtype_z), }, "z[z_i] = x[x_i] %s y[y_i]" % operator, - "multiply") + "binary_op") @context_dependent_memoize From 33a0dd8fa3943e218384dae0a715c781cdef1f84 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:52:38 -0500 Subject: [PATCH 25/25] Fix _array_like_helper to work with non-contiguous arrays. 
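Taking `np.abs()` of each stride does not give a valid layout for a sliced source array: the absolute strides of a non-contiguous view still describe a gappy layout, which overruns a fresh allocation of size*itemsize bytes. Likewise, `nbytes // size` only equals the itemsize when the array exactly covers size*itemsize bytes, which a non-contiguous view need not; the dtype's own itemsize is always correct. `_compact_positive_strides` instead rebuilds a dense, positive-stride layout. A sketch of the stride computation only, under the assumption that the helper preserves the source array's relative axis ordering:

    import numpy as np

    def compact_positive_strides(shape, strides, itemsize):
        # Assign dense strides in order of increasing |stride| so the
        # result is contiguous but iterates its axes like the source.
        order = np.argsort([abs(s) for s in strides])
        new = [0] * len(shape)
        step = itemsize
        for axis in order:
            new[axis] = step
            step *= shape[axis]
        return tuple(new)

    a = np.arange(24).reshape(4, 6)[::2, ::-3]   # shape (2, 2)
    print(a.strides)                             # (96, -24) on 64-bit ints
    print(compact_positive_strides(a.shape, a.strides, a.itemsize))
    # (16, 8): dense, positive, same axis ordering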
--- pycuda/gpuarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a714f572..6fe18ae3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1034,10 +1034,10 @@ def _array_like_helper(other_ary, dtype, order): order = "F" else: # array_like routines only return positive strides - strides = [np.abs(s) for s in other_ary.strides] + _, strides, _ = _compact_positive_strides(other_ary) if dtype is not None and dtype != other_ary.dtype: # scale strides by itemsize when dtype is not the same - itemsize = other_ary.nbytes // other_ary.size + itemsize = other_ary.dtype.itemsize itemsize_ratio = np.dtype(dtype).itemsize / itemsize strides = [int(s*itemsize_ratio) for s in strides] elif order not in ["C", "F"]: