+
    :i                     <    ^ RI Ht ^ RIHt ^ RIt^ RIHt RR ltR# )    )cuda)driverN)numpy_supportc           	     T  aa \        V R^ 4      pV'       gz   V P                  w  r4V P                  P                  V,          V P                  P                  3p\        P
                  P                  P                  WC3VV P                  VR7      p\        P                  ! V P                  4      o\        P                  ! 4       P                  p\        \        P                  ! ^\        P                   ! V^4      ^,          4      4      p\        Wg,          4      pW^,           3o\        P"                  VV3R l4       p	\        VP                  ^ ,          V,          ^,           4      \        VP                  ^,          V,          ^,           4      3p
W3pWW3,          ! W4       V# )a  Compute the transpose of 'a' and store it into 'b', if given,
and return it. If 'b' is not given, allocate a new array
and return that.

This implements the algorithm documented in
http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

:param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
    the device its stream will be used to perform the transpose (and to copy
    `b` to the device if necessary).
stream)dtyper   c                   < \         P                  P                  S
S	R 7      p\         P                  P                  p\         P                  P
                  p\         P                  P                  \         P                  P                  ,          p\         P                  P
                  \         P                  P
                  ,          pWc,           pWT,           pWd,           V P                  ^ ,          8  d:   WS,           V P                  ^,          8  d   WV,           WS,           3,          W$V3&   \         P                  ! 4        WP                  ^ ,          8  d*   WqP                  ^,          8  d   W#V3,          WV3&   R# R# R# ))shaper   N)
r   sharedarray	threadIdxxyblockIdxblockDimr
   syncthreads)inputoutputtiletxtybxbyr   r   dt
tile_shapes   &&       Z/var/www/html/photoedit/myenv/lib/python3.14/site-packages/numba/cuda/kernels/transpose.pykerneltranspose.<locals>.kernel)   s     {{  z <^^^^]]__t}}.]]__t}}.GG7U[[^#%++a.(@ b"'!12DRL||A1||A#6B<Fa4L $7    )getattrr
   r   itemsizer   cudadrvdevicearrayDeviceNDArraynps
from_dtyper   
get_deviceMAX_THREADS_PER_BLOCKintmathpowlogjit)abr   colsrowsstridestpb
tile_widthtile_heightr   blocksthreadsr   r   s   &&          @@r   	transposer8      sH    Q!$FWW
''""T)177+;+;;LL$$22L''	 3  
	 B




3
3CTXXa#q!1A!567Jc&'KA~.J	XX( ($ k)A-.AGGAJ4Ka4O0PPF%G
7"#A)Hr   )N)	numbar   numba.cuda.cudadrv.driverr   r*   numba.npr   r%   r8    r   r   <module>r=      s     ,  ):r   