From 7d09e44ba71de85da8037ca343aeae3165fd90b0 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 15 Feb 2018 16:01:32 -0500 Subject: [PATCH 01/25] Fix slicing with negative stride. --- pycuda/gpuarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 16970a2c..bf9af239 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -857,7 +857,7 @@ def __getitem__(self, index): array_stride = self.strides[array_axis] - new_shape.append((stop-start-1)//idx_stride+1) + new_shape.append((abs(stop-start)-1)//abs(idx_stride)+1) new_strides.append(idx_stride*array_stride) new_offset += array_stride*start From 27bcf764fe86c8d6f48313380dd54e2b7ac684ad Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 19 Feb 2018 11:56:27 -0500 Subject: [PATCH 02/25] Commit only DeferredSourceModule support without changing calling behavior of users of get_elwise_module or get_elwise_range_module, to test backwards compatibility and performance. --- pycuda/deferred.py | 484 ++++++++++++++++++++++++++++++++++++++++++ pycuda/elementwise.py | 469 +++++++++++++++++++++++++++++++++------- 2 files changed, 879 insertions(+), 74 deletions(-) create mode 100644 pycuda/deferred.py diff --git a/pycuda/deferred.py b/pycuda/deferred.py new file mode 100644 index 00000000..9699daa2 --- /dev/null +++ b/pycuda/deferred.py @@ -0,0 +1,484 @@ +""" +This exports a "deferred" implementation of SourceModule, where compilation +is delayed until call-time. Several methods, like get_function(), return +"deferred" values that are also only evaluated at call-time. +""" + +from pycuda.tools import context_dependent_memoize +from pycuda.compiler import compile, SourceModule +import pycuda.driver + +import re + +class DeferredSource(object): + ''' + Source generator that supports user-directed indentation, nesting + ``DeferredSource`` objects, indentation-aware string interpolation, + and deferred generation. + Use ``+=`` or ``add()`` to add source fragments as strings or + other ``DeferredSource`` objects, ``indent()`` or ``dedent()`` to + change base indentation, and ``__call__`` or ``generate()`` to + generate source. 
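+
+    A minimal sketch of the intended usage (the kernel fragment shown
+    is illustrative only)::
+
+        src = DeferredSource()
+        src += "__global__ void f(float *x)"
+        src += "{"
+        src.indent()
+        src += "x[threadIdx.x] = 0.0f;"
+        src.dedent()
+        src += "}"
+        code = src.generate()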
+ + ''' + def __init__(self, subsources=None, base_indent=0, indent_step=2): + self.base_indent = base_indent + self.indent_step = indent_step + if subsources is None: + subsources = [] + self.subsources = subsources + + def __str__(self): + return self.generate() + + def __repr__(self): + return repr(self.__str__()) + + def __call__(self, indent=0, indent_first=True): + return self.generate(indent, indent_first) + + def generate(self, indent=0, indent_first=True, get_list=False): + if get_list: + retval = [] + else: + retval = '' + do_indent = not indent_first + for subindent, strip_space, subsource, format_dict in self.subsources: + if do_indent: + newindent = self.base_indent + indent + subindent + else: + newindent = 0 + do_indent = True + if isinstance(subsource, DeferredSource): + retval = retval + subsource.generate(indent=(indent + subindent), get_list=get_list) + continue + lines = subsource.split("\n") + regex_space = re.compile(r"^(\s*)(.*?)(\s*)$") + regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])") + minstrip = None + newlines = [] + for line in lines: + linelen = len(line) + space_match = regex_space.match(line) + end_leading_space = space_match.end(1) + begin_trailing_space = space_match.start(3) + if strip_space: + if linelen == end_leading_space: + # all space, ignore + continue + if minstrip is None or end_leading_space < minstrip: + minstrip = end_leading_space + if not format_dict: + newlines.append(line) + continue + newlinelist = None + newline = '' + curpos = 0 + matches = list(regex_format.finditer(line, end_leading_space)) + nummatches = len(matches) + for match in matches: + formatchar = match.group(2) + name = match.group(1) + matchstart = match.start() + matchend = match.end() + repl = format_dict.get(name, None) + if repl is None: + continue + if (isinstance(repl, DeferredSource) and + nummatches == 1 and + matchstart == end_leading_space): + # only one replacement, and only spaces preceding + space = space_match.group(1) + newlinelist = [ space + x + for x in repl.generate(get_list=True) ] + else: + newline = newline + line[curpos:matchstart] + newline = newline + (('%' + formatchar) % (repl,)) + curpos = matchend + if newlinelist is None: + newline = newline + line[curpos:] + newlines.append(newline) + else: + newlines.extend(newlinelist) + newlines.append(line[curpos:]) + indentstr = ' ' * (indent + subindent) + for i, line in enumerate(newlines): + line = indentstr + line[minstrip:] + newlines[i] = line + if get_list: + retval += newlines + else: + retval = retval + "\n".join(newlines) + "\n" + return retval + + def indent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent += indent_step + return self + + def dedent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent -= indent_step + return self + + def format_dict(self, format_dict): + for subsource in self.subsources: + subsource[3] = format_dict + return self + + def add(self, other, strip_space=True, format_dict=None): + self.subsources.append([self.base_indent, strip_space, other, format_dict]) + return self + + def __iadd__(self, other): + self.add(other) + return self + + def __add__(self, other): + newgen = DeferredSource(subsources=self.subsources, + base_indent=self.base_indent, + indent_step=self.indent_step) + newgen.add(other) + return newgen + +class DeferredVal(object): + ''' + This is an object that serves as a proxy to an as-yet undetermined + object, which is only known at the time 
when either ``_set_val()`` + or ``_eval()`` is called. Any calls to methods listed in the class + attribute ``_deferred_method_dict`` are queued until then, at which + point the queued method calls are executed in order immediately on + the new object. + This class must be subclassed, and the class attribute + ``_deferred_method_dict`` must contain a mapping from defer-able method + names to either ``DeferredVal``, None (same as ``DeferredVal``), or a + subclass, which when instantiated, will be assigned (with ``_set_val()``) + the return value of the method. + There are two ways to set the proxied object. One is to set it + explicitly with ``_set_val(val)``. The other is to override the method + ``_evalbase()`` which should return the new object, and will be called + by ``_eval()``. + ''' + __unimpl = object() + _deferred_method_dict = None # must be set by subclass + + def __init__(self): + self._val_available = False + self._val = None + self._deferred_method_calls = [] + + def __repr__(self): + return self._repr(0) + + def _repr(self, indent): + indentstr = " " * indent + retstrs = [] + retstrs.append("%s" % (self.__class__,)) + for dmc in self._deferred_method_calls: + (name, args, kwargs, retval) = dmc + retstrs.append(" method %s" % (repr(name),)) + for arg in args: + if isinstance(arg, DeferredVal): + retstrs.append(" deferred arg (id=%s)" % (id(arg),)) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" arg %s" % (repr(arg),)) + for kwname, arg in kwargs.items(): + if isinstance(arg, DeferredVal): + retstrs.append(" deferred kwarg %s (id=%s)" % (repr(kwname), id(arg))) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" kwarg %s=%s" % (kwname, repr(arg),)) + retstrs.append(" deferred retval (id=%s)" % (id(retval),)) + return "\n".join([(indentstr + retstr) for retstr in retstrs]) + + def _set_val(self, val): + self._val = val + self._val_available = True + self._eval_methods() + return val + + def _evalbase(self): + raise NotImplementedError() + + def _eval_list(self, vals): + newvals = [] + for val in vals: + if isinstance(val, DeferredVal): + val = val._eval() + newvals.append(val) + return newvals + + def _eval_dict(self, valsdict): + newvalsdict = {} + for name, val in valsdict.items(): + if isinstance(val, DeferredVal): + val = val._eval() + newvalsdict[name] = val + return newvalsdict + + def _eval_methods(self): + assert(self._val_available) + val = self._val + for op in self._deferred_method_calls: + (methodname, methodargs, methodkwargs, deferredretval) = op + methodargs = self._eval_list(methodargs) + methodkwargs = self._eval_dict(methodkwargs) + retval = getattr(val, methodname)(*methodargs, **methodkwargs) + deferredretval._set_val(retval) + self._deferred_method_calls = [] + + def _eval(self): + if not self._val_available: + self._val = self._evalbase() + self._val_available = True + self._eval_methods() + return self._val + + def _get_deferred_func(self, _name, _retval): + def _deferred_func(*args, **kwargs): + if not self._val_available: + self._deferred_method_calls.append((_name, args, kwargs, _retval)) + return _retval + args = self._eval_list(args) + kwargs = self._eval_dict(kwargs) + return getattr(self._val, _name)(*newargs, **newkwargs) + _deferred_func.__name__ = _name + ".deferred" + return _deferred_func + + def __getattr__(self, name): + if self.__class__._deferred_method_dict is None: + raise Exception("DeferredVal must be subclassed and the class attribute _deferred_method_dict must be set to a valid dictionary!") + if 
self._val_available:
+            return getattr(self._val, name)
+        deferredclass = self.__class__._deferred_method_dict.get(name, self.__unimpl)
+        if deferredclass is not self.__unimpl:
+            if deferredclass is None:
+                deferredclass = DeferredVal
+            retval = deferredclass()
+            return self._get_deferred_func(name, retval)
+        raise AttributeError("no such attribute (yet): '%s'" % (name,))
+
+# we allow all math operators to be deferred
+_mathops = (
+        '__add__', '__sub__', '__mul__', '__floordiv__', '__mod__',
+        '__divmod__', '__pow__', '__lshift__', '__rshift__', '__and__',
+        '__xor__', '__or__', '__div__', '__truediv__', '__radd__', '__rsub__',
+        '__rmul__', '__rdiv__', '__rtruediv__', '__rfloordiv__', '__rmod__',
+        '__rdivmod__', '__rpow__', '__rlshift__', '__rrshift__', '__rand__',
+        '__rxor__', '__ror__', '__iadd__', '__isub__', '__imul__', '__idiv__',
+        '__itruediv__', '__ifloordiv__', '__imod__', '__ipow__', '__ilshift__',
+        '__irshift__', '__iand__', '__ixor__', '__ior__', '__pos__', '__abs__',
+        '__invert__', '__complex__', '__int__', '__long__', '__float__',
+        '__oct__', '__hex__', '__index__', '__coerce__')
+class DeferredNumeric(DeferredVal):
+    pass
+DeferredNumeric._deferred_method_dict = dict((x, DeferredNumeric)
+                                             for x in _mathops)
+def _get_deferred_attr_func(_name):
+    def _deferred_func(self, *args, **kwargs):
+        return self.__getattr__(_name)(*args, **kwargs)
+    _deferred_func.__name__ = _name
+    return _deferred_func
+for name in _mathops:
+    setattr(DeferredNumeric, name, _get_deferred_attr_func(name))
+
+class DeferredModuleVal(DeferredVal):
+    _deferred_method_dict = {}
+    def __init__(self, sourcemodule, methodstr, name):
+        super(DeferredModuleVal, self).__init__()
+        self._sourcemodule = sourcemodule
+        self._methodstr = methodstr
+        self._name = name
+
+    def _evalbase(self):
+        return getattr(self._sourcemodule.module, self._methodstr)(self._name)
+
+class DeferredTexRef(DeferredModuleVal):
+    _deferred_method_dict = {
+        "set_array": None,
+        "set_address": DeferredNumeric,
+        "set_address_2d": None,
+        "set_format": None,
+        "set_address_mode": None,
+        "set_flags": None,
+        "get_address": DeferredNumeric,
+        "get_flags": DeferredNumeric,
+    }
+
+class DeferredFunction(object):
+    '''
+    This class is a pseudo-replacement for ``pycuda.driver.Function``,
+    but takes a ``DeferredSourceModule`` and a function name as arguments,
+    and queues any call to ``prepare()`` until call-time, at which point
+    it calls out to the ``DeferredSourceModule`` object to create the
+    actual Function before preparing (if necessary) and calling the
+    underlying kernel.  NOTE: you may now send the actual ``GPUArrays``
+    as arguments, rather than their ``.gpudata`` members; this can be
+    helpful when creating kernels dynamically.
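+
+    A minimal sketch of the intended call pattern (the module, kernel,
+    and argument names are illustrative only)::
+
+        func = deferred_module.get_function("kernel")
+        func.prepare("PP" + numpy.dtype(numpy.uintp).char)
+        func.prepared_call(grid, block, x_gpu, y_gpu, n)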
+ ''' + def __init__(self, modulelazy, funcname): + self._modulelazy = modulelazy + self._funcname = funcname + self._prepare_args = None + + def get_unimplemented(_methodname): + def _unimplemented(self, _methodname=_methodname, *args, **kwargs): + raise NotImplementedError("%s does not implement method '%s'" % (type(self), _methodname,)) + return _unimplemented + + for meth_name in ["set_block_shape", "set_shared_size", + "param_set_size", "param_set", "param_seti", "param_setf", + "param_setv", "param_set_texref", + "launch", "launch_grid", "launch_grid_async"]: + setattr(self, meth_name, get_unimplemented(meth_name)) + + def _fix_texrefs(self, kwargs): + texrefs = kwargs.get('texrefs', None) + if texrefs is not None: + newtexrefs = [] + for texref in texrefs: + if isinstance(texref, DeferredVal): + texref = texref._eval() + newtexrefs.append(texref) + kwargs['texrefs'] = newtexrefs + + def __call__(self, *args, **kwargs): + func = self._modulelazy.create_function(self._funcname, args) + self._fix_texrefs(kwargs) + return func.__call__(*args, **kwargs) + + def param_set_texref(self, *args, **kwargs): + raise NotImplementedError() + + def prepare(self, *args, **kwargs): + self._prepare_args = (args, kwargs) + return self + + def _do_delayed_prepare(self, func): + if self._prepare_args is None: + raise Exception("prepared_*_call() requires that prepare() be called first") + (prepare_args, prepare_kwargs) = self._prepare_args + self._fix_texrefs(prepare_kwargs) + func.prepare(*prepare_args, **prepare_kwargs) + + def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): + grid = funcmethodargs[0] + block = funcmethodargs[1] + func = self._modulelazy._delayed_get_function(self._funcname, funcargs, grid, block) + self._do_delayed_prepare(func) + newfuncargs = [ getattr(arg, 'gpudata', arg) for arg in funcargs ] + fullargs = list(funcmethodargs) + fullargs.extend(newfuncargs) + return getattr(func, funcmethodstr)(*fullargs, **funckwargs) + + def prepared_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_call', (grid, block), args, kwargs) + + def prepared_timed_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_timed_call', (grid, block), args, kwargs) + + def prepared_async_call(self, grid, block, stream, *args, **kwargs): + return self._generic_prepared_call('prepared_async_call', (grid, block, stream), args, kwargs) + +@context_dependent_memoize +def _delayed_compile_aux(source, compileargs): + # re-convert any tuples to lists + newcompileargs = [] + for i, arg in enumerate(compileargs): + if isinstance(arg, tuple): + arg = list(arg) + newcompileargs.append(arg) + cubin = compile(source, *newcompileargs) + + from pycuda.driver import module_from_buffer + return module_from_buffer(cubin) + +class DeferredSourceModule(SourceModule): + ''' + This is an abstract specialization of SourceModule which allows the + delay of creating the actual kernel source until call-time, at which + point the actual arguments are available and their characteristics can + be used to create specific kernels. + To support this, ``get_function()`` returns a ``DeferredFunction`` + object which queues any calls to ``DeferredFunction.prepare()`` and + saves them until future calls to ``DeferredFunction.__call__()`` or + ``DeferredFunction.prepared_*_call()``. NOTE: you may now send actual + ``GPUArrays`` to these functions rather their ``.gpudata`` members; + this can be helpful when creating dynamic kernels. 
+ Likewise, ``get_global()``, ``get_texref()`` and ``get_surfref()`` + return proxy objects that can be stored by ``DeferredFunction.prepare()`` + and will only be evaluated at call-time. + This class must be subclassed and the function ``create_source(self, + grid, block, *args)`` must be overridden, returning the kernel source + (or ``DeferredSource`` object) that should be compiled. ``grid``, + ``block``, and ``*args`` are the same arguments that were sent to the + ``DeferredFunction`` call functions above. + The function ``create_key(self, grid, block, *args)`` is always + called before ``create_source`` and the key returned (if not None) is + used to cache any compiled functions. + ''' + _cache = {} + + def __init__(self, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._arch = arch + # tuples below are so _compileargs can be used as a hash key + if options is not None: + options = tuple(options) + include_dirs = tuple(include_dirs) + self._compileargs = (nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + def _delayed_compile(self, source): + self._check_arch(self._arch) + + self.module = _delayed_compile_aux(source, self._compileargs) + return self.module + + def create_key(self, grid, block, *funcargs): + return None + + def create_source(self, grid, block, *funcargs): + raise NotImplementedError("create_source must be overridden!") + + def _delayed_get_function(self, funcname, funcargs, grid, block): + ''' + If ``create_key()`` returns non-None, then it is used as the key + to cache compiled functions. Otherwise the return value of + ``create_source()`` is used as the key. + ''' + context = pycuda.driver.Context.get_current() + funccache = DeferredSourceModule._cache.get(context, None) + if funccache is None: + funccache = self._cache[context] = {} + key = self.create_key(grid, block, *funcargs) + funckey = (funcname, key) + if key is None or funckey not in funccache: + source = self.create_source(grid, block, *funcargs) + if isinstance(source, DeferredSource): + source = source.generate() + if key is None: + funckey = (funcname, source) + func = funccache.get(funckey, None) + if func is None: + module = self._delayed_compile(source) + func = module.get_function(funcname) + funccache[funckey] = func + return func + + def get_function(self, name): + return DeferredFunction(self, name) + + def get_global(self, name): + raise NotImplementedError("Deferred globals in element-wise kernels not supported yet") + + def get_texref(self, name): + return DeferredTexRef(self, 'get_texref', name) + + def get_surfref(self, name): + raise NotImplementedError("Deferred surfaces in element-wise kernels not supported yet") + diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index feab0a6b..f0ce0803 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -36,92 +36,413 @@ import numpy as np from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method +from pycuda.deferred import DeferredSourceModule, DeferredSource + +class ElementwiseSourceModule(DeferredSourceModule): + ''' + This is a ``DeferredSourceModule`` which is backwards-compatible with the + original ``get_elwise_module`` and ``get_elwise_range_module`` (using + ``do_range=True``). However, this class delays the compilation of + kernels until call-time. 
If you send actual ``GPUArray`` arguments + (instead of their ``.gpudata`` members) when calling the methods + supported by the return value of ``get_function()``, then you get: + * support for array-specific flat indices (i.e. for input array ``z``, + you can index it as ``z[z_i]`` in addition to the old-style ``z[i]``) + * support for non-contiguous (and arbitrarily-strided) arrays, but + only if you use the array-specific indices above. + Array-specific flat indices only really work if all the arrays using them + are the same shape. This shape is also used to optimize index + calculations. By default, the shape is taken from the first argument + that is specified as a pointer/array, but you can override this by + sending ``shape_arg_index=N`` where ``N`` is the zero-based index of the + kernel argument whose shape should be used. + ''' + def __init__(self, arguments, operation, + name="kernel", preamble="", loop_prep="", after_loop="", + do_range=False, shape_arg_index=None, + **compilekwargs): + super(ElementwiseSourceModule, self).__init__(**compilekwargs) + self._do_range = do_range + self._shape_arg_index = shape_arg_index + self._init_args = (arguments, operation, + name, preamble, loop_prep, after_loop) + + def create_key(self, grid, block, *args): + (arguments, operation, + funcname, preamble, loop_prep, after_loop) = self._init_args + shape_arg_index = self._shape_arg_index + + # 'args' is the list of actual parameters being sent to the kernel + # 'arguments' is the list of argument descriptors (VectorArg, ScalarArg) + + arraypairs = [] + contigmatch = True + arrayspecificinds = True + shape = None + size = None + order = None + for i, argpair in enumerate(zip(args, arguments)): + arg, arg_descr = argpair + if isinstance(arg_descr, VectorArg): + # is a GPUArray/DeviceAllocation + arraypairs.append(argpair) + if not arrayspecificinds: + continue + if not hasattr(arg, 'shape'): + # At least one array argument is probably sent as a + # GPUArray.gpudata rather than the GPUArray itself, + # so disable array-specific indices -- caller is on + # their own. + arrayspecificinds = False + continue + curshape = arg.shape + cursize = arg.size + curorder = 'N' + if arg.flags.f_contiguous: + curorder = 'F' + elif arg.flags.c_contiguous: + curorder = 'C' + if shape is None: + shape = curshape + size = cursize + order = curorder + elif curorder == 'N' or order != curorder: + contigmatch = False + elif shape_arg_index is None and shape != curshape: + raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the argument that has the canonical shape with shape_arg_index; found shapes %s and %s" % (shape, curshape)) + if shape_arg_index == i: + shape = curshape + + self._contigmatch = contigmatch + self._arraypairs = arraypairs + self._arrayspecificinds = arrayspecificinds + + key = repr(self._init_args) + + if contigmatch: + return key + + # Arrays are not contiguous or different order + + if grid[1] != 1 or block[1] != 1 or block[2] != 1: + raise Exception("Grid (%s) and block (%s) specifications should have all '1' except in the first element" % (grid, block)) + + ndim = len(shape) + numthreads = block[0] + shape = np.array(shape) + + # Use index of minimum stride in first array as a hint on how to + # order the traversal of dimensions. We could probably do something + # smarter, like tranposing/reshaping arrays if possible to maximize + # performance, but that is probably best done in a pre-processing step. 
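+        # (For example, a C-contiguous 3-D array has its smallest
+        # stride in the last dimension, so argmin picks index 2 and the
+        # dimensions are traversed in reverse; an F-contiguous array
+        # has its smallest stride first and keeps the forward order.)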
+ # Note that this could mess up custom indexing that assumes a + # particular traversal order, but in that case one should probably + # ensure that inputs have the same order, and explicitly send + # shape_arg_index to turn this off. + do_reverse = False + if (shape_arg_index is None and + np.argmin(np.abs(arraypairs[0][0].strides)) > ndim // 2): + print "traversing dimensions in reverse order" + # traverse dimensions in reverse order + do_reverse = True + if do_reverse: + shape = shape[::-1] + block_step = np.array(shape) + tmp = numthreads + for dimnum in range(ndim): + newstep = tmp % block_step[dimnum] + tmp = tmp // block_step[dimnum] + block_step[dimnum] = newstep + arrayarginfos = [] + for arg, arg_descr in arraypairs: + if do_reverse: + elemstrides = np.array(arg.strides[::-1]) // arg.itemsize + else: + elemstrides = np.array(arg.strides) // arg.itemsize + dimelemstrides = elemstrides * shape + blockelemstrides = elemstrides * block_step + arrayarginfos.append( + (arg_descr.name, tuple(elemstrides), tuple(dimelemstrides), tuple(blockelemstrides)) + ) + + self._arrayarginfos = arrayarginfos + self._ndim = ndim + self._numthreads = numthreads + self._shape = shape + self._block_step = block_step + + key = (self._init_args, grid, block, tuple(self._arrayarginfos)) + + return key + + def create_source(self, grid, block, *args): + # Precondition: create_key() must have been run with the same arguments + + (arguments, operation, + funcname, preamble, loop_prep, after_loop) = self._init_args + + contigmatch = self._contigmatch + + if contigmatch: + arraypairs = self._arraypairs + arrayspecificinds = self._arrayspecificinds + + indtype = 'unsigned' + if self._do_range: + indtype = 'long' + + # All arrays are contiguous and same order (or we don't know and + # it's up to the caller to make sure it works) + if arrayspecificinds: + for arg, arg_descr in arraypairs: + preamble = preamble + """ + #define %s_i i + """ % (arg_descr.name,) + if self._do_range: + loop_body = """ + if (step < 0) + { + for (i = start + (cta_start + tid)*step; + i > stop; i += total_threads*step) + { + %(operation)s; + } + } + else + { + for (i = start + (cta_start + tid)*step; + i < stop; i += total_threads*step) + { + %(operation)s; + } + } + """ % { + "operation": operation, + } + else: + loop_body = """ + for (i = cta_start + tid; i < n; i += total_threads) + { + %(operation)s; + } + """ % { + "operation": operation, + } + + return """ + #include + + %(preamble)s + + __global__ void %(name)s(%(arguments)s) + { + unsigned tid = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned cta_start = blockDim.x*blockIdx.x; + + %(indtype)s i; + + %(loop_prep)s; + + %(loop_body)s; + + %(after_loop)s; + } + """ % { + "arguments": ", ".join(arg.declarator() for arg in arguments), + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "loop_body": loop_body, + "indtype": indtype, + } + + # Arrays are not contiguous or different order + + arrayarginfos = self._arrayarginfos + ndim = self._ndim + numthreads = self._numthreads + shape = self._shape + block_step = self._block_step + + arraynames = [ x[0] for x in arrayarginfos ] + + defines = DeferredSource() + for dimnum in range(ndim): + defines += """ + #define SHAPE_%d %d + #define BLOCK_STEP_%d %d + """ % (dimnum, shape[dimnum], + dimnum, block_step[dimnum]) + for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + basename = "%s_%d" % (name, dimnum) + defines += """ + #define ELEMSTRIDE_%s_%d 
%d + #define DIMELEMSTRIDE_%s_%d %d + #define BLOCKELEMSTRIDE_%s_%d %d + """ % (name, dimnum, elemstrides[dimnum], + name, dimnum, dimelemstrides[dimnum], + name, dimnum, blockelemstrides[dimnum]) + + decls = DeferredSource() + decls += """ + unsigned GLOBAL_i = cta_start + tid; + """ + for name in arraynames: + decls += """ + long %s_i = 0; + """ % (name,) + for dimnum in range(ndim): + decls += """ + long INDEX_%d; + """ % (dimnum,) + + loop_inds_calc = DeferredSource() + loop_inds_calc += """ + unsigned int TMP_GLOBAL_i = GLOBAL_i; + """ + for dimnum in range(ndim): + loop_inds_calc += """ + INDEX_%d = TMP_GLOBAL_i %% SHAPE_%d; + TMP_GLOBAL_i = TMP_GLOBAL_i / SHAPE_%d; + """ % (dimnum, dimnum, + dimnum) + + for name in arraynames: + loop_inds_calc += """ + %s_i += INDEX_%d * ELEMSTRIDE_%s_%d; + """ % (name, dimnum, name, dimnum) + + loop_inds_inc = DeferredSource() + for dimnum in range(ndim): + loop_inds_inc += """ + INDEX_%d += BLOCK_STEP_%d; + """ % (dimnum, dimnum) + for name in arraynames: + loop_inds_inc += """ + %s_i += BLOCKELEMSTRIDE_%s_%d; + """ % (name, name, dimnum) + if dimnum < ndim - 1: + loop_inds_inc += """ + if (INDEX_%d > SHAPE_%d) { + """ % (dimnum, dimnum) + loop_inds_inc.indent() + loop_inds_inc += """ + INDEX_%d -= SHAPE_%d; + INDEX_%d ++; + """ % (dimnum, dimnum, + dimnum + 1) + for name in arraynames: + loop_inds_inc += """ + %s_i -= DIMELEMSTRIDE_%s_%d; + """ % (name, name, dimnum) + loop_inds_inc.dedent() + loop_inds_inc += """ + } + """ + + loop_body = DeferredSource() + if self._do_range: + loop_body.add(""" + if (step < 0) + { + for (/*void*/; GLOBAL_i > stop; GLOBAL_i += total_threads*step) + { + %(operation)s; + + %(loop_inds_inc)s; + } + } + else + { + for (/*void*/; GLOBAL_i < stop; GLOBAL_i += total_threads*step) + { + %(operation)s; + + %(loop_inds_inc)s; + } + } + """, format_dict={ + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) + else: + loop_body.add(""" + for (/*void*/; GLOBAL_i < n; GLOBAL_i += total_threads) + { + %(operation)s; + %(loop_inds_inc)s; + } + """, format_dict={ + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) -def get_elwise_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include + source = DeferredSource() - %(preamble)s + source.add(""" + #include + #include - __global__ void %(name)s(%(arguments)s) - { + %(defines)s - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - unsigned i; + %(preamble)s - %(loop_prep)s; + __global__ void %(name)s(%(arguments)s) + { - for (i = cta_start + tid; i < n; i += total_threads) - { - %(operation)s; - } + unsigned tid = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned cta_start = blockDim.x*blockIdx.x; - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) + %(decls)s + %(loop_prep)s; -def get_elwise_range_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include - - %(preamble)s - - __global__ void %(name)s(%(arguments)s) - { - unsigned tid = threadIdx.x; - unsigned total_threads = 
gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - long i; - - %(loop_prep)s; - - if (step < 0) - { - for (i = start + (cta_start + tid)*step; - i > stop; i += total_threads*step) - { - %(operation)s; - } - } - else - { - for (i = start + (cta_start + tid)*step; - i < stop; i += total_threads*step) - { - %(operation)s; + %(loop_inds_calc)s; + + %(loop_body)s; + + %(after_loop)s; } - } - - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) + """, format_dict={ + "arguments": ", ".join(arg.declarator() for arg in arguments), + "operation": operation, + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "defines": defines, + "decls": decls, + "loop_inds_calc": loop_inds_calc, + "loop_body": loop_body, + }) + + return source + +def get_elwise_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + shape_arg_index=None): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + shape_arg_index=shape_arg_index) + +def get_elwise_range_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + shape_arg_index=None): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + do_range=True, + shape_arg_index=shape_arg_index) def get_elwise_kernel_and_types(arguments, operation, name="kernel", keep=False, options=None, use_range=False, **kwargs): From be4014c5eb894897a31fea880a30a6e3c3cbe9b0 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Mon, 19 Feb 2018 12:24:20 -0500 Subject: [PATCH 03/25] Smarter _new_like_me that handles discontiguous input. Have copy() use it too. 
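
With order="K" (now the default), _new_like_me first flips any
negatively-strided axes, then tries to preserve the source array's
stride pattern in the new allocation, falling back to contiguous C or
F order when the original strides cannot be reproduced safely; copy()
now goes through the same path. A rough sketch of the intent (array
names and shapes are illustrative only):

    import numpy as np
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    a = gpuarray.to_gpu(np.zeros((4, 4), np.float32))
    b = a[:, ::2]   # non-contiguous view
    c = b.copy()    # new array laid out like b, filled via _memcpy_discontig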
--- pycuda/gpuarray.py | 119 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 103 insertions(+), 16 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index bf9af239..cb78f5c6 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -246,13 +246,14 @@ def set_async(self, ary, stream=None): return self.set(ary, async=True, stream=stream) def get(self, ary=None, pagelocked=False, async=False, stream=None): + slicer = None if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) - strides = _compact_strides(self) + self, strides, slicer = _compact_positive_strides(self) ary = _as_strided(ary, strides=strides) else: if self.size != ary.size: @@ -269,13 +270,15 @@ def get(self, ary=None, pagelocked=False, async=False, stream=None): if self.size: _memcpy_discontig(ary, self, async=async, stream=stream) + if slicer: + ary = ary[slicer] return ary def get_async(self, stream=None, ary=None): return self.get(ary=ary, async=True, stream=stream) - def copy(self): - new = GPUArray(self.shape, self.dtype, self.allocator) + def copy(self, order="K"): + new = self._new_like_me(order=order) _memcpy_discontig(new, self) return new @@ -375,15 +378,52 @@ def _div(self, other, out, stream=None): return out - def _new_like_me(self, dtype=None, order="C"): - strides = None + def _new_like_me(self, dtype=None, order="K"): + slicer, selflist = _flip_negative_strides((self,)) + self = selflist[0] + ndim = self.ndim + shape = self.shape + mystrides = self.strides + mydtype = self.dtype + myitemsize = mydtype.itemsize if dtype is None: - dtype = self.dtype - if dtype == self.dtype: - strides = self.strides - - return self.__class__(self.shape, dtype, - allocator=self.allocator, strides=strides, order=order) + dtype = mydtype + else: + dtype = np.dtype(dtype) + itemsize = dtype.itemsize + if order == "K": + if self.flags.c_contiguous: + order = "C" + elif self.flags.f_contiguous: + order = "F" + if order == "C": + newstrides = _c_contiguous_strides(itemsize, shape) + elif order == "F": + newstrides = _f_contiguous_strides(itemsize, shape) + else: + maxstride = mystrides[0] + maxstrideind = 0 + for i in range(1, ndim): + curstride = mystrides[i] + if curstride > maxstride: + maxstrideind = i + maxstride = curstride + mymaxoffset = (maxstride / myitemsize) * shape[maxstrideind] + if mymaxoffset <= self.size: + # it's probably safe to just allocate and pass strides + # XXX (do we need to calculate full offset for [-1,-1,-1...]?) 
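+                # scale the existing strides from the old itemsize
+                # to the new dtype's itemsize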
+ newstrides = tuple((x // myitemsize) * itemsize for x in mystrides) + else: + # just punt and choose something close + if ndim > 1 and maxstrideind == 0: + newstrides = _c_contiguous_strides(itemsize, shape) + else: + newstrides = _f_contiguous_strides(itemsize, shape) + retval = self.__class__(shape, dtype, + allocator=self.allocator, strides=newstrides) + if slicer: + retval = retval[slicer] + return retval # operators --------------------------------------------------------------- def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): @@ -1012,15 +1052,21 @@ def conj(self): def to_gpu(ary, allocator=drv.mem_alloc): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, slicer = _compact_positive_strides(ary) + result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set(ary) + if slicer: + result = result[slicer] return result def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, slicer = _compact_positive_strides(ary) + result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set_async(ary, stream) + if slicer: + result = result[slicer] return result @@ -1180,8 +1226,41 @@ class Info(Record): # }}} -def _compact_strides(a): - # Compute strides to have same order as self, but packed +def _flip_negative_strides(arrays): + # If arrays have negative strides, flip them. Return a list + # ``(slicer, arrays)`` where ``slicer`` is a tuple of slice objects + # used to flip the arrays (or ``None`` if there was no flipping), + # and ``arrays`` is the list of flipped arrays. + # NOTE: Every input array A must have the same value for the following + # expression: np.sign(A.strides) + # NOTE: ``slicer`` is its own inverse, so ``A[slicer][slicer] == A`` + if isinstance(arrays, GPUArray): + raise TypeError("_flip_negative_strides expects a list of GPUArrays") + slicer = None + ndim = arrays[0].ndim + shape = arrays[0].shape + for t in zip(range(ndim), *[np.sign(x.strides) for x in arrays]): + axis = t[0] + stride_sign = t[1] + if len(arrays) > 1: + if not np.all(t[2:] == stride_sign): + raise ValueError("found differing signs in dimension %d: %s" % (axis, t[1:])) + if stride_sign == -1: + if slicer is None: + slicer = [slice(None)] * ndim + slicer[axis] = slice(None, None, -1) + if slicer is not None: + slicer = tuple(slicer) + arrays = [x[slicer] for x in arrays] + return slicer, arrays + + +def _compact_positive_strides(a): + # Flip ``a``'s axes if there are any negative strides, then compute + # strides to have same order as a, but packed. Return flipped ``a`` + # and packed strides. + slicer, alist = _flip_negative_strides((a,)) + a = alist[0] info = sorted( (a.strides[axis], a.shape[axis], axis) for axis in range(len(a.shape))) @@ -1191,7 +1270,7 @@ def _compact_strides(a): for _, dim, axis in info: strides[axis] = stride stride *= dim - return strides + return a, strides, slicer def _memcpy_discontig(dst, src, async=False, stream=None): @@ -1216,6 +1295,14 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] 
= src return + dst_gpudata = 0 + src_gpudata = 0 + if isinstance(src, GPUArray): + src_gpudata = src.gpudata + if isinstance(dst, GPUArray): + dst_gpudata = dst.gpudata + src, dst = _flip_negative_strides((src, dst))[1] + if src.flags.forc and dst.flags.forc: shape = [src.size] src_strides = dst_strides = [src.dtype.itemsize] From afc32518a26e142ccb69aa8062b93f1f12946c9c Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 10:36:31 -0500 Subject: [PATCH 04/25] Make sure key is hashable. --- pycuda/deferred.py | 7 +++++-- pycuda/elementwise.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 9699daa2..f8436702 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -417,8 +417,11 @@ class DeferredSourceModule(SourceModule): ``block``, and ``*args`` are the same arguments that were sent to the ``DeferredFunction`` call functions above. The function ``create_key(self, grid, block, *args)`` is always - called before ``create_source`` and the key returned (if not None) is - used to cache any compiled functions. + called before ``create_source`` and the key returned is used to cache + any compiled functions. Default return value of ``create_key()`` is + None, which means to use the function name and generated source as the + key. The return value of ``create_key()`` must be usable as a hash + key. ''' _cache = {} diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index f0ce0803..d0c559f7 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -64,7 +64,7 @@ def __init__(self, arguments, operation, super(ElementwiseSourceModule, self).__init__(**compilekwargs) self._do_range = do_range self._shape_arg_index = shape_arg_index - self._init_args = (arguments, operation, + self._init_args = (tuple(arguments), operation, name, preamble, loop_prep, after_loop) def create_key(self, grid, block, *args): From 9cb80f75537ae699c1b77445d96efc53a873c583 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 11:24:01 -0500 Subject: [PATCH 05/25] Allow existing kernel calls to use non-contiguous arrays by sending actual GPUArrays and using ARRAY_i indices (rather than just i). 
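
Kernel bodies switch from the shared flat index to per-array indices;
for example, the axpbyz operation changes from

    z[i] = a*x[i] + b*y[i]

to

    z[z_i] = a*x[x_i] + b*y[y_i]

and call sites now pass the GPUArrays themselves (e.g. self rather
than self.gpudata) so that ElementwiseSourceModule can generate
stride-aware index arithmetic for non-contiguous inputs.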
--- pycuda/cumath.py | 10 +-- pycuda/curandom.py | 2 +- pycuda/elementwise.py | 73 ++++++++++----------- pycuda/gpuarray.py | 146 +++++++++++++++++++++--------------------- 4 files changed, 116 insertions(+), 115 deletions(-) diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd6..1a8dcb5b 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -42,7 +42,7 @@ def f(array, stream_or_out=None, **kwargs): func = elementwise.get_unary_func_kernel(func_name, array.dtype) func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + array, out, array.mem_size) return out return f @@ -77,7 +77,7 @@ def fmod(arg, mod, stream=None): func = elementwise.get_fmod_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + arg, mod, result, arg.mem_size) return result @@ -94,7 +94,7 @@ def frexp(arg, stream=None): func = elementwise.get_frexp_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + arg, sig, expt, arg.mem_size) return sig, expt @@ -111,7 +111,7 @@ def ldexp(significand, exponent, stream=None): func = elementwise.get_ldexp_kernel() func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, + significand, exponent, result, significand.mem_size) return result @@ -129,7 +129,7 @@ def modf(arg, stream=None): func = elementwise.get_modf_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg, intpart, fracpart, arg.mem_size) return fracpart, intpart diff --git a/pycuda/curandom.py b/pycuda/curandom.py index e1c68428..d31ee01a 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -240,7 +240,7 @@ def rand(shape, dtype=np.float32, stream=None): raise NotImplementedError; func.prepared_async_call(result._grid, result._block, stream, - result.gpudata, np.random.randint(2**31-1), result.size) + result, np.random.randint(2**31-1), result.size) return result diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index d0c559f7..6471af65 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -117,9 +117,8 @@ def create_key(self, grid, block, *args): self._arraypairs = arraypairs self._arrayspecificinds = arrayspecificinds - key = repr(self._init_args) - if contigmatch: + key = repr(self._init_args) return key # Arrays are not contiguous or different order @@ -526,7 +525,7 @@ def __call__(self, *args, **kwargs): "deal with non-contiguous arrays") vectors.append(arg) - invocation_args.append(arg.gpudata) + invocation_args.append(arg) else: invocation_args.append(arg) @@ -573,9 +572,9 @@ def get_take_kernel(dtype, idx_dtype, vec_count=1): "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) for i in range(vec_count)) body = ( - ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + ("%(idx_tp)s src_idx = idx[idx_i];\n" % ctx) + "\n".join( - "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) + "dest%d[dest%d_i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i, i) for i in range(vec_count))) mod = get_elwise_module(args, body, "take", preamble=preamble) @@ -618,11 +617,12 @@ def get_copy_insn(i): return ("dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)) - body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + body = (("%(idx_tp)s src_idx = gmem_src_idx[gmem_src_idx_i];\n" + "%(idx_tp)s 
dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx) + "\n".join(get_copy_insn(i) for i in range(vec_count))) - mod = get_elwise_module(args, body, "take_put", preamble=preamble) + mod = get_elwise_module(args, body, "take_put", + preamble=preamble, shape_arg_index=0) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] @@ -652,11 +652,12 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1): ] + [ScalarArg(np.intp, "n")] body = ( - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) + "%(idx_tp)s dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx + + "\n".join("dest%d[dest_idx] = src%d[src%d_i];" % (i, i, i) for i in range(vec_count))) - func = get_elwise_module(args, body, "put").get_function("put") + func = get_elwise_module(args, body, "put", + shape_arg_index=0).get_function("put") func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) return func @@ -668,7 +669,7 @@ def get_copy_kernel(dtype_dest, dtype_src): "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, - "dest[i] = src[i]", + "dest[dest_i] = src[src_i]", "copy") @@ -700,13 +701,13 @@ def get_linear_combination_kernel(summand_descriptors, args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) - summands.append("a%d*x%d[i]" % (i, i)) + summands.append("a%d*x%d[x%d_i]" % (i, i, i)) args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, - "z[i] = " + " + ".join(summands), + "z[z_i] = " + " + ".join(summands), "linear_combination", preamble="\n".join(preamble), loop_prep=";\n".join(loop_prep)) @@ -727,7 +728,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = a*x[i] + b*y[i]", + "z[z_i] = a*x[x_i] + b*y[y_i]", "axpbyz") @@ -738,7 +739,7 @@ def get_axpbz_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z) }, - "z[i] = a * x[i] + b", + "z[z_i] = a * x[x_i] + b", "axpb") @@ -750,7 +751,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = x[i] %s y[i]" % operator, + "z[z_i] = x[x_i] %s y[y_i]" % operator, "multiply") @@ -761,7 +762,7 @@ def get_rdivide_elwise_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = y / x[i]", + "z[z_i] = y / x[x_i]", "divide_r") @@ -773,7 +774,7 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, func+"_kernel") @@ -785,7 +786,7 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y)" % func, + "z[z_i] = %s(x[x_i], y)" % func, func+"_kernel") @@ -809,7 +810,7 @@ def get_fill_kernel(dtype): "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = a", + "z[z_i] = a", "fill") @@ -819,7 +820,7 @@ def get_reverse_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = y[n-1-i]", + "z[z_i] = y[n-1-y_i]", "reverse") @@ -830,7 +831,7 @@ def get_real_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = real(y[i])", + "z[z_i] = real(y[y_i])", "real") @@ -841,7 +842,7 @@ def 
get_imag_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = imag(y[i])", + "z[z_i] = imag(y[y_i])", "imag") @@ -851,7 +852,7 @@ def get_conj_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = pycuda::conj(y[i])", + "z[z_i] = pycuda::conj(y[y_i])", "conj") @@ -861,7 +862,7 @@ def get_arange_kernel(dtype): "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = start + i*step", + "z[z_i] = start + z_i*step", "arange") @@ -876,7 +877,7 @@ def get_pow_kernel(dtype): "%(tp)s value, %(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = %s(y[i], value)" % func, + "z[z_i] = %s(y[y_i], value)" % func, "pow_method") @@ -893,7 +894,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, "pow_method") @@ -901,7 +902,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): def get_fmod_kernel(): return get_elwise_kernel( "float *arg, float *mod, float *z", - "z[i] = fmod(arg[i], mod[i])", + "z[z_i] = fmod(arg[arg_i], mod[mod_i])", "fmod_kernel") @@ -909,7 +910,7 @@ def get_fmod_kernel(): def get_modf_kernel(): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", - "fracpart[i] = modf(x[i], &intpart[i])", + "fracpart[fracpart_i] = modf(x[x_i], &intpart[intpart_i])", "modf_kernel") @@ -919,8 +920,8 @@ def get_frexp_kernel(): "float *x, float *significand, float *exponent", """ int expt = 0; - significand[i] = frexp(x[i], &expt); - exponent[i] = expt; + significand[significand_i] = frexp(x[x_i], &expt); + exponent[exponent_i] = expt; """, "frexp_kernel") @@ -929,7 +930,7 @@ def get_frexp_kernel(): def get_ldexp_kernel(): return get_elwise_kernel( "float *sig, float *expt, float *z", - "z[i] = ldexp(sig[i], int(expt[i]))", + "z[z_i] = ldexp(sig[sig_i], int(expt[expt_i]))", "ldexp_kernel") @@ -943,7 +944,7 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, - "z[i] = %s(y[i])" % func_name, + "z[z_i] = %s(y[y_i])" % func_name, "%s_kernel" % func_name) @@ -955,7 +956,7 @@ def get_if_positive_kernel(crit_dtype, dtype): VectorArg(dtype, "else_"), VectorArg(dtype, "result"), ], - "result[i] = crit[i] > 0 ? then_[i] : else_[i]", + "result[result_i] = crit[crit_i] > 0 ? 
then_[then__i] : else_[else__i]", "if_positive") @@ -967,5 +968,5 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_a": dtype_to_ctype(dtype_x), }, - "y[i] = x[i] %s a" % operator, + "y[y_i] = x[x_i] %s a" % operator, "scalarop_kernel") diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index cb78f5c6..b4d72e2b 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -134,7 +134,7 @@ def func(self, other): self.dtype, other.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result @@ -143,7 +143,7 @@ def func(self, other): func = elementwise.get_scalar_op_kernel( self.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, + self, other, result, self.mem_size) return result @@ -300,47 +300,47 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc or not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: add_timer(3*self.size, func.prepared_timed_call(self._grid, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size)) + selffac, self, otherfac, other, + out, self.mem_size)) else: func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size) + selffac, self, otherfac, other, + out, self.mem_size) return out def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, - other, out.gpudata, self.mem_size) + selffac, self, + other, out, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out @@ -350,31 +350,31 @@ def _rdiv_scalar(self, other, out, stream=None): y = n / self """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, 
self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc or not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out @@ -554,7 +554,7 @@ def fill(self, value, stream=None): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + value, self, self.mem_size) return self @@ -640,7 +640,7 @@ def __abs__(self): out_dtype=out_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + self, result, self.mem_size) return result @@ -651,9 +651,9 @@ def _pow(self, other, new): """ if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc or not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") assert self.shape == other.shape @@ -666,14 +666,14 @@ def _pow(self, other, new): self.dtype, other.dtype, result.dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result else: - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if new: result = self._new_like_me() @@ -681,7 +681,7 @@ def _pow(self, other, new): result = self func = elementwise.get_pow_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, + other, self, result, self.mem_size) return result @@ -713,23 +713,23 @@ def reverse(self, stream=None): as one-dimensional. 
""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result def astype(self, dtype, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if dtype == self.dtype: return self.copy() @@ -738,7 +738,7 @@ def astype(self, dtype, stream=None): func = elementwise.get_copy_kernel(dtype, self.dtype) func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, + result, self, self.mem_size) return result @@ -750,9 +750,9 @@ def reshape(self, *shape, **kwargs): order = kwargs.pop("order", "C") # TODO: add more error-checking, perhaps - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) @@ -978,7 +978,7 @@ def real(self): func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -989,9 +989,9 @@ def real(self): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) @@ -1003,7 +1003,7 @@ def imag(self): func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -1013,9 +1013,9 @@ def imag(self): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + #if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if self.flags.f_contiguous: order = "F" @@ -1025,7 +1025,7 @@ def conj(self): func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -1219,7 +1219,7 @@ class Info(Record): func = elementwise.get_arange_kernel(dtype) func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size) + result, start, step, size) return result @@ -1244,7 +1244,7 @@ def _flip_negative_strides(arrays): stride_sign = t[1] if len(arrays) > 1: if not np.all(t[2:] == stride_sign): - raise ValueError("found differing signs in dimension %d: %s" % (axis, t[1:])) + raise ValueError("found differing signs in strides for 
dimension %d (strides for all arrays: %s)" % (axis, [x.strides for x in arrays])) if stride_sign == -1: if slicer is None: slicer = [slice(None)] * ndim @@ -1431,7 +1431,7 @@ def take(a, indices, out=None, stream=None): a.bind_to_texref_ext(tex_src[0], allow_double_hack=True, allow_complex_hack=True) func.prepared_async_call(out._grid, out._block, stream, - indices.gpudata, out.gpudata, indices.size) + indices, out, indices.size) return out @@ -1473,8 +1473,8 @@ def make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(tex_src[i], allow_double_hack=True) func.prepared_async_call(indices._grid, indices._block, stream, - indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + indices, + *([o for o in out[chunk_slice]] + [indices.size])) return out @@ -1538,8 +1538,8 @@ def make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(src_tr, allow_double_hack=True) func.prepared_async_call(src_indices._grid, src_indices._block, stream, - dest_indices.gpudata, src_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + dest_indices, src_indices, + *([o for o in out[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size])) @@ -1583,9 +1583,9 @@ def make_func_for_chunk_size(chunk_size): func = make_func_for_chunk_size(vec_count-start_i) func.prepared_async_call(dest_indices._grid, dest_indices._block, stream, - dest_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [i.gpudata for i in arrays[chunk_slice]] + dest_indices, + *([o for o in out[chunk_slice]] + + [i for i in arrays[chunk_slice]] + [dest_indices.size])) return out @@ -1637,7 +1637,7 @@ def if_positive(criterion, then_, else_, out=None, stream=None): out = empty_like(then_) func.prepared_async_call(criterion._grid, criterion._block, stream, - criterion.gpudata, then_.gpudata, else_.gpudata, out.gpudata, + criterion, then_, else_, out, criterion.size) return out @@ -1652,14 +1652,14 @@ def f(a, b, out=None, stream=None): a.dtype, b.dtype, out.dtype, use_scalar=False) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b.gpudata, out.gpudata, a.size) + a, b, out, a.size) elif isinstance(a, GPUArray): if out is None: out = empty_like(a) func = elementwise.get_binary_minmax_kernel(which, a.dtype, a.dtype, out.dtype, use_scalar=True) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b, out.gpudata, a.size) + a, b, out, a.size) else: # assuming b is a GPUArray if out is None: out = empty_like(b) @@ -1667,7 +1667,7 @@ def f(a, b, out=None, stream=None): b.dtype, b.dtype, out.dtype, use_scalar=True) # NOTE: we switch the order of a and b here! func.prepared_async_call(b._grid, b._block, stream, - b.gpudata, a, out.gpudata, b.size) + b, a, out, b.size) return out return f From b38c75fec1c9ad01307e1ea1e91dac2920a499d5 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 13:44:34 -0500 Subject: [PATCH 06/25] Fix variable names. 
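Before this fix, `_deferred_func` evaluated any deferred arguments into `args` and `kwargs`, but then dispatched the real call with the stale names `newargs`/`newkwargs`, which are never defined in that scope, so every replayed method call died with a NameError. A simplified sketch of the intended dispatch pattern (illustrative names, not the full DeferredVal machinery):

    class _Deferred(object):
        def __init__(self):
            self._val = None              # the real object, once known

        def _eval(self):
            return self._val

        def _make_method(self, name):
            def _deferred_func(*args, **kwargs):
                # Arguments may themselves be deferred and must be
                # evaluated first...
                args = [a._eval() if isinstance(a, _Deferred) else a
                        for a in args]
                kwargs = dict((k, v._eval() if isinstance(v, _Deferred) else v)
                              for k, v in kwargs.items())
                # ...then the dispatch has to use the *evaluated* names
                # (the old code referenced undefined 'newargs'/'newkwargs').
                return getattr(self._val, name)(*args, **kwargs)
            return _deferred_func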
--- pycuda/deferred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index f8436702..104ca60b 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -244,7 +244,7 @@ def _deferred_func(*args, **kwargs): return _retval args = self._eval_list(args) kwargs = self._eval_dict(kwargs) - return getattr(self._val, _name)(*newargs, **newkwargs) + return getattr(self._val, _name)(*args, **kwargs) _deferred_func.__name__ = _name + ".deferred" return _deferred_func From 1f6486ba7afbdef52d2752bf85f509646c98349c Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 13:45:16 -0500 Subject: [PATCH 07/25] Non-contiguous is OK now. --- pycuda/gpuarray.py | 72 ++-------------------------------------------- 1 file changed, 3 insertions(+), 69 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index b4d72e2b..e2bb3e54 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -118,17 +118,9 @@ def splay(n, dev=None): def _make_binary_op(operator): def func(self, other): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if isinstance(other, GPUArray): assert self.shape == other.shape - if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, @@ -300,9 +292,6 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - #if not self.flags.forc or not other.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) @@ -320,10 +309,6 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, selffac, self, @@ -332,10 +317,6 @@ def _axpbz(self, selffac, other, out, stream=None): return out def _elwise_multiply(self, other, out, stream=None): - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, @@ -350,10 +331,6 @@ def _rdiv_scalar(self, other, out, stream=None): y = n / self """ - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, self, other, @@ -364,10 +341,6 @@ def _rdiv_scalar(self, other, out, stream=None): def _div(self, other, out, stream=None): """Divides an array by another array.""" - #if not self.flags.forc or not other.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, 
other.dtype, @@ -651,10 +624,6 @@ def _pow(self, other, new): """ if isinstance(other, GPUArray): - #if not self.flags.forc or not other.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - assert self.shape == other.shape if new: @@ -671,10 +640,6 @@ def _pow(self, other, new): return result else: - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - if new: result = self._new_like_me() else: @@ -713,10 +678,6 @@ def reverse(self, stream=None): as one-dimensional. """ - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) @@ -727,10 +688,6 @@ def reverse(self, stream=None): return result def astype(self, dtype, stream=None): - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - if dtype == self.dtype: return self.copy() @@ -750,9 +707,6 @@ def reshape(self, *shape, **kwargs): order = kwargs.pop("order", "C") # TODO: add more error-checking, perhaps - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) @@ -970,11 +924,7 @@ def real(self): if issubclass(dtype.type, np.complexfloating): from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, @@ -989,17 +939,9 @@ def real(self): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, @@ -1013,15 +955,7 @@ def imag(self): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - #if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(order=order) + result = self._new_like_me() func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, From edcc44af0dd5d8bc65ffe5672ee9c0e63438162c Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 14:36:40 -0500 Subject: [PATCH 08/25] Forgot to remove non-contiguity check. 
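The contiguity check in ElementwiseKernel.__call__ predates the stride-aware code generation in ElementwiseSourceModule and is now dead weight: the generated kernels index every vector argument through per-dimension element strides instead of assuming one flat contiguous run. The mapping the kernel performs with its INDEX_d / ELEMSTRIDE_name_d variables is roughly the following (NumPy sketch for illustration, not the generated code itself):

    import numpy as np

    def element_offset(flat_i, shape, strides, itemsize):
        # Decompose a flat C-order element index into per-dimension
        # indices, then weight each index by that array's element
        # stride (byte stride divided by itemsize).
        elemstrides = [s // itemsize for s in strides]
        offset = 0
        for dim in reversed(range(len(shape))):
            flat_i, idx = divmod(flat_i, shape[dim])
            offset += idx * elemstrides[dim]
        return offset

    base = np.arange(12)                  # underlying buffer
    a = base.reshape(3, 4)[:, ::2]        # non-contiguous view, shape (3, 2)
    off = element_offset(5, a.shape, a.strides, a.itemsize)
    assert base[off] == a.flat[5]         # both are 10

Because all arguments are walked with the same logical index, arrays of different layouts (C order, Fortran order, sliced views) can be mixed in a single kernel call.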
--- pycuda/elementwise.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 6471af65..572e0f0b 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -520,10 +520,6 @@ def __call__(self, *args, **kwargs): for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("elementwise kernel cannot " - "deal with non-contiguous arrays") - vectors.append(arg) invocation_args.append(arg) else: From e23c943366db0e1487be3d3cd0e77a183acae42a Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 21 Feb 2018 15:26:47 -0500 Subject: [PATCH 09/25] Allow setting scalars. --- pycuda/gpuarray.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index e2bb3e54..bf094dde 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -912,7 +912,11 @@ def __getitem__(self, index): strides=tuple(new_strides)) def __setitem__(self, index, value): - _memcpy_discontig(self[index], value) + if isinstance(value, GPUArray) or isinstance(value, np.ndarray): + return _memcpy_discontig(self[index], value) + + # Let's assume it's a scalar + self[index].fill(value) # }}} @@ -1229,12 +1233,6 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] = src return - dst_gpudata = 0 - src_gpudata = 0 - if isinstance(src, GPUArray): - src_gpudata = src.gpudata - if isinstance(dst, GPUArray): - dst_gpudata = dst.gpudata src, dst = _flip_negative_strides((src, dst))[1] if src.flags.forc and dst.flags.forc: From 9cfaf9736e05a2637b3795820f55ebbc7bba9fdd Mon Sep 17 00:00:00 2001 From: "Gregory R. Lee" Date: Wed, 6 Dec 2017 17:38:14 -0500 Subject: [PATCH 10/25] fix: update signature of gpuarray.reshape to match the GPUArray method --- pycuda/gpuarray.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index bf094dde..a714f572 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -671,7 +671,7 @@ def __ipow__(self, other): """ return self._pow(other,new=False) - + def reverse(self, stream=None): """Return this array in reversed order. The array is treated @@ -1542,13 +1542,13 @@ def transpose(a, axes=None): return a.transpose(axes) -def reshape(a, shape): +def reshape(a, *shape, **kwargs): """Gives a new shape to an array without changing its data. .. versionadded:: 2015.2 """ - return a.reshape(shape) + return a.reshape(*shape, **kwargs) # }}} From 080ec5973765ca3156a1c7d83877d1f3758e6739 Mon Sep 17 00:00:00 2001 From: Emanuel Rietveld Date: Tue, 20 Feb 2018 08:04:30 +0900 Subject: [PATCH 11/25] Add get_texref() to ElementwiseKernel --- pycuda/elementwise.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 572e0f0b..563612b5 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -469,7 +469,7 @@ def get_elwise_kernel_and_types(arguments, operation, func = mod.get_function(name) func.prepare("".join(arg.struct_char for arg in arguments)) - return func, arguments + return mod, func, arguments def get_elwise_kernel(arguments, operation, @@ -477,7 +477,7 @@ def get_elwise_kernel(arguments, operation, """Return a L{pycuda.driver.Function} that performs the same scalar operation on one or several vectors. 
""" - func, arguments = get_elwise_kernel_and_types( + mod, func, arguments = get_elwise_kernel_and_types( arguments, operation, name, keep, options, **kwargs) return func @@ -491,9 +491,13 @@ def __init__(self, arguments, operation, self.gen_kwargs.update(dict(keep=keep, options=options, name=name, operation=operation, arguments=arguments)) + def get_texref(self, name, use_range=False): + mod, knl, arguments = self.generate_stride_kernel_and_types(use_range=use_range) + return mod.get_texref(name) + @memoize_method def generate_stride_kernel_and_types(self, use_range): - knl, arguments = get_elwise_kernel_and_types(use_range=use_range, + mod, knl, arguments = get_elwise_kernel_and_types(use_range=use_range, **self.gen_kwargs) assert [i for i, arg in enumerate(arguments) @@ -501,7 +505,7 @@ def generate_stride_kernel_and_types(self, use_range): "ElementwiseKernel can only be used with functions that " \ "have at least one vector argument" - return knl, arguments + return mod, knl, arguments def __call__(self, *args, **kwargs): vectors = [] @@ -515,7 +519,7 @@ def __call__(self, *args, **kwargs): + ", ".join(six.iterkeys(kwargs))) invocation_args = [] - func, arguments = self.generate_stride_kernel_and_types( + mod, func, arguments = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None) for arg, arg_descr in zip(args, arguments): From a7cb9829ae5bae730f789dbef3aaffd17623504e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 26 Feb 2018 23:46:52 -0600 Subject: [PATCH 12/25] Update bpl-subset, possibly including pypy support --- bpl-subset | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bpl-subset b/bpl-subset index e7c5f513..7ea98ed3 160000 --- a/bpl-subset +++ b/bpl-subset @@ -1 +1 @@ -Subproject commit e7c5f5131daca6298b5e8aa48d06e7ecffec2ffa +Subproject commit 7ea98ed300b63876f78746afda0d7240b478f355 From fb10ffd5b821ba553a83c153b6ebf51ddeab8db2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 26 Feb 2018 23:48:17 -0600 Subject: [PATCH 13/25] Make characterize.platform_bits work with Pypy (patch by Emanuel Rietveld) --- pycuda/characterize.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pycuda/characterize.py b/pycuda/characterize.py index 1c54af7d..2206e588 100644 --- a/pycuda/characterize.py +++ b/pycuda/characterize.py @@ -6,7 +6,11 @@ def platform_bits(): - return tuple.__itemsize__ * 8 + import sys + if sys.maxsize > 2**32: + return 64 + else: + return 32 def has_stack(): From 71ec966f8c97d09469ee7b735cbc8b4c0437d449 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 27 Feb 2018 00:31:12 -0600 Subject: [PATCH 14/25] Fix pytest script-based test invocation --- test/test_cumath.py | 2 +- test/test_driver.py | 2 +- test/test_gpuarray.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_cumath.py b/test/test_cumath.py index 874ccb64..272ab5ac 100644 --- a/test/test_cumath.py +++ b/test/test_cumath.py @@ -242,5 +242,5 @@ def test_unary_func_kwargs(self): if len(sys.argv) > 1: exec (sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) diff --git a/test/test_driver.py b/test/test_driver.py index f88a1d67..038cf64f 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -961,5 +961,5 @@ def test_import_pyopencl_before_pycuda(): if len(sys.argv) > 1: exec (sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 
93cc954f..e38e2fda 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1153,5 +1153,5 @@ def test_zeros_like_etc(self): if len(sys.argv) > 1: exec (sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) From 542cffa2bcfaa915770c2a1fd94071d1288656d7 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Thu, 22 Feb 2018 15:14:24 -0500 Subject: [PATCH 15/25] Fix DeferredFunction.__call__, and change modulelazy to deferredmod. --- pycuda/deferred.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 104ca60b..1c79ace0 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -319,8 +319,8 @@ class DeferredFunction(object): rather than their ``.gpudata`` members; this can be helpful to dynamically create kernels. ''' - def __init__(self, modulelazy, funcname): - self._modulelazy = modulelazy + def __init__(self, deferredmod, funcname): + self._deferredmod = deferredmod self._funcname = funcname self._prepare_args = None @@ -346,7 +346,7 @@ def _fix_texrefs(self, kwargs): kwargs['texrefs'] = newtexrefs def __call__(self, *args, **kwargs): - func = self._modulelazy.create_function(self._funcname, args) + func = self._deferredmod._delayed_get_function(self._funcname, args) self._fix_texrefs(kwargs) return func.__call__(*args, **kwargs) @@ -367,7 +367,7 @@ def _do_delayed_prepare(self, func): def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): grid = funcmethodargs[0] block = funcmethodargs[1] - func = self._modulelazy._delayed_get_function(self._funcname, funcargs, grid, block) + func = self._deferredmod._delayed_get_function(self._funcname, funcargs, grid, block) self._do_delayed_prepare(func) newfuncargs = [ getattr(arg, 'gpudata', arg) for arg in funcargs ] fullargs = list(funcmethodargs) From 8d278fbf85f46b825323823b825f3f30d3d21a40 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:34:39 -0500 Subject: [PATCH 16/25] Make sure 'texrefs' keyword arg is re-evaluated every time. 
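With deferred compilation, the compiled module (and therefore its texture references) can differ from one launch to the next, so a DeferredVal texref resolved once and memoized would keep pointing into whichever module happened to be compiled first. `_fix_texrefs` now resolves through `_evalbase()`, which bypasses the cache, and it copies the keyword dict before rewriting it so the caller's original 'texrefs' entries stay deferred for the next call; callers must therefore use the dict it returns rather than the one they passed in. The caching pitfall in miniature (hypothetical names, not the actual DeferredVal API):

    class DeferredTexRef(object):
        def __init__(self, resolve):
            self._resolve = resolve   # looks the texref up in the
            self._cached = None       # currently compiled module

        def _eval(self):
            # memoized: hands back a texref from a stale module if the
            # kernel was recompiled for new argument shapes
            if self._cached is None:
                self._cached = self._resolve()
            return self._cached

        def _evalbase(self):
            # uncached: safe to call once per launch
            return self._resolve()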
(Function may change based on kernel call arguments) --- pycuda/deferred.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 1c79ace0..2673e9db 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -338,16 +338,20 @@ def _unimplemented(self, _methodname=_methodname, *args, **kwargs): def _fix_texrefs(self, kwargs): texrefs = kwargs.get('texrefs', None) if texrefs is not None: + kwargs = kwargs.copy() newtexrefs = [] for texref in texrefs: if isinstance(texref, DeferredVal): - texref = texref._eval() + # don't use _eval() as the cached value may cause + # problems when this function is called again + texref = texref._evalbase() newtexrefs.append(texref) kwargs['texrefs'] = newtexrefs + return kwargs def __call__(self, *args, **kwargs): func = self._deferredmod._delayed_get_function(self._funcname, args) - self._fix_texrefs(kwargs) + kwargs = self._fix_texrefs(kwargs) return func.__call__(*args, **kwargs) def param_set_texref(self, *args, **kwargs): @@ -361,7 +365,7 @@ def _do_delayed_prepare(self, func): if self._prepare_args is None: raise Exception("prepared_*_call() requires that prepare() be called first") (prepare_args, prepare_kwargs) = self._prepare_args - self._fix_texrefs(prepare_kwargs) + kwargs = self._fix_texrefs(prepare_kwargs) func.prepare(*prepare_args, **prepare_kwargs) def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): From 5a4a2fae1fb23041a610fa8ebf7feb326636648b Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:37:17 -0500 Subject: [PATCH 17/25] Store module in cache too. --- pycuda/deferred.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index 2673e9db..bdf3ffeb 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -470,12 +470,12 @@ def _delayed_get_function(self, funcname, funcargs, grid, block): source = source.generate() if key is None: funckey = (funcname, source) - func = funccache.get(funckey, None) - if func is None: + modfunc = funccache.get(funckey, None) + if modfunc is None: module = self._delayed_compile(source) func = module.get_function(funcname) - funccache[funckey] = func - return func + modfunc = funccache[funckey] = (module, func) + return modfunc[1] def get_function(self, name): return DeferredFunction(self, name) From 05a5400136adeb41c88cad8c3814639a49b89bc1 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:38:14 -0500 Subject: [PATCH 18/25] Send grid and block to _delayed_get_function --- pycuda/deferred.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index bdf3ffeb..c7ff9893 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -350,7 +350,11 @@ def _fix_texrefs(self, kwargs): return kwargs def __call__(self, *args, **kwargs): - func = self._deferredmod._delayed_get_function(self._funcname, args) + block = kwargs.get('block', None) + if block is None or not isinstance(block, tuple) or len(block) != 3: + raise ValueError("keyword argument 'block' is required, and must be a 3-tuple of integers") + grid = kwargs.get('grid', (1,1)) + func = self._deferredmod._delayed_get_function(self._funcname, args, grid, block) kwargs = self._fix_texrefs(kwargs) return func.__call__(*args, **kwargs) From 11108fee0d9a6b3c519861a08a9084132dccb653 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:38:43 -0500 Subject: [PATCH 19/25] Fix comment. 
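The DeferredVal docstring still named `_eval()` as the point where queued method calls are replayed; the replay actually happens when `_set_val()` or `_evalbase()` resolves the proxied object, `_eval()` being only the memoizing wrapper around `_evalbase()`. For illustration, assuming a method such as `append` were registered in `_deferred_method_dict`:

    dv = DeferredVal()
    dv.append(3)           # no real object yet: the call is queued
    dv._set_val([1, 2])    # queued calls replay; the list is now [1, 2, 3]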
--- pycuda/deferred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/deferred.py b/pycuda/deferred.py index c7ff9893..e0ca5258 100644 --- a/pycuda/deferred.py +++ b/pycuda/deferred.py @@ -147,7 +147,7 @@ class DeferredVal(object): ''' This is an object that serves as a proxy to an as-yet undetermined object, which is only known at the time when either ``_set_val()`` - or ``_eval()`` is called. Any calls to methods listed in the class + or ``_evalbase()`` is called. Any calls to methods listed in the class attribute ``_deferred_method_dict`` are queued until then, at which point the queued method calls are executed in order immediately on the new object. From 1a6228c6fa031e922b8ede9730662840522ef568 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:41:40 -0500 Subject: [PATCH 20/25] Add debug option to ElementwiseSourceModule. --- pycuda/elementwise.py | 71 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 563612b5..7d610034 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -60,12 +60,14 @@ class ElementwiseSourceModule(DeferredSourceModule): def __init__(self, arguments, operation, name="kernel", preamble="", loop_prep="", after_loop="", do_range=False, shape_arg_index=None, + debug=False, **compilekwargs): super(ElementwiseSourceModule, self).__init__(**compilekwargs) self._do_range = do_range self._shape_arg_index = shape_arg_index self._init_args = (tuple(arguments), operation, name, preamble, loop_prep, after_loop) + self._debug = debug def create_key(self, grid, block, *args): (arguments, operation, @@ -268,6 +270,12 @@ def create_source(self, grid, block, *args): arraynames = [ x[0] for x in arrayarginfos ] defines = DeferredSource() + decls = DeferredSource() + loop_preop = DeferredSource() + loop_inds_calc = DeferredSource() + loop_inds_inc = DeferredSource() + loop_body = DeferredSource() + for dimnum in range(ndim): defines += """ #define SHAPE_%d %d @@ -284,7 +292,6 @@ def create_source(self, grid, block, *args): name, dimnum, dimelemstrides[dimnum], name, dimnum, blockelemstrides[dimnum]) - decls = DeferredSource() decls += """ unsigned GLOBAL_i = cta_start + tid; """ @@ -297,7 +304,6 @@ def create_source(self, grid, block, *args): long INDEX_%d; """ % (dimnum,) - loop_inds_calc = DeferredSource() loop_inds_calc += """ unsigned int TMP_GLOBAL_i = GLOBAL_i; """ @@ -313,7 +319,6 @@ def create_source(self, grid, block, *args): %s_i += INDEX_%d * ELEMSTRIDE_%s_%d; """ % (name, dimnum, name, dimnum) - loop_inds_inc = DeferredSource() for dimnum in range(ndim): loop_inds_inc += """ INDEX_%d += BLOCK_STEP_%d; @@ -341,13 +346,57 @@ def create_source(self, grid, block, *args): } """ - loop_body = DeferredSource() + if self._debug: + preamble += """ + #include + """ + loop_inds_calc += """ + if (cta_start == 0 && tid == 0) { + """ + loop_inds_calc.indent() + loop_inds_calc += r""" + printf("=======================\n"); + printf("CALLING FUNC %s\n"); + printf("N=%%u\n", (unsigned int)n); + """ % (funcname,) + for name, elemstrides, dimelemstrides, blockelemstrides in arrayarginfos: + loop_inds_calc += r""" + printf("(%s) %s: ptr=0x%%lx maxoffset(elems)=%s\n", (unsigned long)%s); + """ % (funcname, name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + loop_inds_calc.dedent() + loop_inds_calc += """ + } + """ + indtest = DeferredSource() + for name in arraynames: + indtest += r""" + if (%s_i > %s || %s_i < 0) 
{ + """ % (name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + indtest.indent() + indtest += r""" + printf("cta_start=%%d tid=%%d GLOBAL_i=%%d %s_i=%%d\n", cta_start, tid, GLOBAL_i, %s_i); + break; + """ % (name, name) + indtest.dedent() + indtest += """ + } + """ + loop_preop = indtest + loop_preop + after_loop += r""" + if (cta_start == 0 && tid == 0) { + printf("DONE CALLING FUNC %s\n"); + printf("-----------------------\n"); + } + """ % (funcname,) + if self._do_range: loop_body.add(""" if (step < 0) { for (/*void*/; GLOBAL_i > stop; GLOBAL_i += total_threads*step) { + %(loop_preop)s; + %(operation)s; %(loop_inds_inc)s; @@ -357,12 +406,15 @@ def create_source(self, grid, block, *args): { for (/*void*/; GLOBAL_i < stop; GLOBAL_i += total_threads*step) { + %(loop_preop)s; + %(operation)s; %(loop_inds_inc)s; } } """, format_dict={ + "loop_preop": loop_preop, "operation": operation, "loop_inds_inc": loop_inds_inc, }) @@ -370,11 +422,14 @@ def create_source(self, grid, block, *args): loop_body.add(""" for (/*void*/; GLOBAL_i < n; GLOBAL_i += total_threads) { + %(loop_preop)s; + %(operation)s; %(loop_inds_inc)s; } """, format_dict={ + "loop_preop": loop_preop, "operation": operation, "loop_inds_inc": loop_inds_inc, }) @@ -425,23 +480,23 @@ def create_source(self, grid, block, *args): def get_elwise_module(arguments, operation, name="kernel", keep=False, options=None, preamble="", loop_prep="", after_loop="", - shape_arg_index=None): + **kwargs): return ElementwiseSourceModule(arguments, operation, name=name, preamble=preamble, loop_prep=loop_prep, after_loop=after_loop, keep=keep, options=options, - shape_arg_index=shape_arg_index) + **kwargs) def get_elwise_range_module(arguments, operation, name="kernel", keep=False, options=None, preamble="", loop_prep="", after_loop="", - shape_arg_index=None): + **kwargs): return ElementwiseSourceModule(arguments, operation, name=name, preamble=preamble, loop_prep=loop_prep, after_loop=after_loop, keep=keep, options=options, do_range=True, - shape_arg_index=shape_arg_index) + **kwargs) def get_elwise_kernel_and_types(arguments, operation, name="kernel", keep=False, options=None, use_range=False, **kwargs): From b7436983b05bbf30343637890dc2f4065d963f5e Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:44:50 -0500 Subject: [PATCH 21/25] Fix index calculation (found using _debug!) --- pycuda/elementwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 7d610034..5732b2e0 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -339,8 +339,8 @@ def create_source(self, grid, block, *args): dimnum + 1) for name in arraynames: loop_inds_inc += """ - %s_i -= DIMELEMSTRIDE_%s_%d; - """ % (name, name, dimnum) + %s_i += ELEMSTRIDE_%s_%d - DIMELEMSTRIDE_%s_%d; + """ % (name, name, dimnum + 1, name, dimnum) loop_inds_inc.dedent() loop_inds_inc += """ } From 7e57a7286f9c9e28094087bddd416ac24a47dd0a Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:46:10 -0500 Subject: [PATCH 22/25] Add shape to the key (so it needs to remain a tuple). 
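The generated source bakes the logical shape into its SHAPE_d macros, but `create_key` both destroyed the hashable form (by overwriting `shape` with an ndarray) and left shape out of the key, so two calls agreeing on grid, block, and the per-array stride info could in principle be handed a kernel compiled for a different shape. The shape therefore stays a tuple (ndarrays are unhashable) and a separate `shapearr` carries the stride arithmetic. The hashability constraint is easy to demonstrate:

    import numpy as np

    cache = {}
    cache[((3, 4), (256, 1, 1))] = "compiled kernel"   # tuple key: fine
    try:
        cache[(np.array((3, 4)), (256, 1, 1))] = "oops"
    except TypeError as exc:
        print(exc)   # unhashable type: 'numpy.ndarray'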
--- pycuda/elementwise.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 5732b2e0..aca0d903 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -130,7 +130,6 @@ def create_key(self, grid, block, *args): ndim = len(shape) numthreads = block[0] - shape = np.array(shape) # Use index of minimum stride in first array as a hint on how to # order the traversal of dimensions. We could probably do something @@ -148,7 +147,8 @@ def create_key(self, grid, block, *args): do_reverse = True if do_reverse: shape = shape[::-1] - block_step = np.array(shape) + shapearr = np.array(shape) + block_step = np.array(shapearr) tmp = numthreads for dimnum in range(ndim): newstep = tmp % block_step[dimnum] @@ -160,7 +160,7 @@ def create_key(self, grid, block, *args): elemstrides = np.array(arg.strides[::-1]) // arg.itemsize else: elemstrides = np.array(arg.strides) // arg.itemsize - dimelemstrides = elemstrides * shape + dimelemstrides = elemstrides * shapearr blockelemstrides = elemstrides * block_step arrayarginfos.append( (arg_descr.name, tuple(elemstrides), tuple(dimelemstrides), tuple(blockelemstrides)) @@ -172,7 +172,7 @@ def create_key(self, grid, block, *args): self._shape = shape self._block_step = block_step - key = (self._init_args, grid, block, tuple(self._arrayarginfos)) + key = (self._init_args, grid, block, shape, tuple(self._arrayarginfos)) return key From c5de070fae8f1fa04ea82b81a78bde406aa72686 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:46:31 -0500 Subject: [PATCH 23/25] Remove unnecessary format key. --- pycuda/elementwise.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index aca0d903..7487f25c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -463,7 +463,6 @@ def create_source(self, grid, block, *args): } """, format_dict={ "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, "name": funcname, "preamble": preamble, "loop_prep": loop_prep, From 61bd9084e660b4f5894916f638a95d6db9a1e6d2 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:46:44 -0500 Subject: [PATCH 24/25] Fix kernel name. --- pycuda/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 7487f25c..ae158d36 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -806,7 +806,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_z": dtype_to_ctype(dtype_z), }, "z[z_i] = x[x_i] %s y[y_i]" % operator, - "multiply") + "binary_op") @context_dependent_memoize From 33a0dd8fa3943e218384dae0a715c781cdef1f84 Mon Sep 17 00:00:00 2001 From: Syam Gadde Date: Wed, 28 Feb 2018 13:52:38 -0500 Subject: [PATCH 25/25] Fix _array_like_helper to work with non-contiguous arrays. 
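Taking `np.abs()` of each stride does not give a valid layout for a sliced source array: the absolute strides of a non-contiguous view still describe a gappy layout, which overruns a fresh allocation of size*itemsize bytes. Likewise, `nbytes // size` only equals the itemsize when the array exactly covers size*itemsize bytes, which a non-contiguous view need not; the dtype's own itemsize is always correct. `_compact_positive_strides` instead rebuilds a dense, positive-stride layout. A sketch of the stride computation only, under the assumption that the helper preserves the source array's relative axis ordering:

    import numpy as np

    def compact_positive_strides(shape, strides, itemsize):
        # Assign dense strides in order of increasing |stride| so the
        # result is contiguous but iterates its axes like the source.
        order = np.argsort([abs(s) for s in strides])
        new = [0] * len(shape)
        step = itemsize
        for axis in order:
            new[axis] = step
            step *= shape[axis]
        return tuple(new)

    a = np.arange(24).reshape(4, 6)[::2, ::-3]   # shape (2, 2)
    print(a.strides)                             # (96, -24) on 64-bit ints
    print(compact_positive_strides(a.shape, a.strides, a.itemsize))
    # (16, 8): dense, positive, same axis ordering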
--- pycuda/gpuarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a714f572..6fe18ae3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1034,10 +1034,10 @@ def _array_like_helper(other_ary, dtype, order): order = "F" else: # array_like routines only return positive strides - strides = [np.abs(s) for s in other_ary.strides] + _, strides, _ = _compact_positive_strides(other_ary) if dtype is not None and dtype != other_ary.dtype: # scale strides by itemsize when dtype is not the same - itemsize = other_ary.nbytes // other_ary.size + itemsize = other_ary.dtype.itemsize itemsize_ratio = np.dtype(dtype).itemsize / itemsize strides = [int(s*itemsize_ratio) for s in strides] elif order not in ["C", "F"]: