
    ,id                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ ej                   j#                  ej                   j%                  e            Zej                   j)                  ed      gZej                   j)                  ed	      Zd
gZda e j2                         d        Z e j2                         d        Z G d de      Zd ZddddddZddddddZ dZ! e"e!      Z#d Z$ e%d  e&d      D              Z'de'd<   de'd<   de'd<   d Z(d  Z) G d! d"e      Z* G d# d$e      Z+y)%    N)Path)knobs)compile_module_from_src)_allocation)	GPUTarget)	GPUDriverincludeliblibcuda.so.1c            	         t         j                  j                  x} r| gS t        j                  ddg      j                  d      }|j                         D cg c]  }d|v s|j                         d    }}|D cg c]!  }t        j                  j                  |      # }}t        j                  d      }|r^|s\|j                  d      D cg c]B  }t        j                  j                  t        j                  j                  |d            sA|D }}d	}|r|d
t        |      z  z  }|dz  }n
|dz  }|dz  }t        d |D              sJ |       |S c c}w c c}w c c}w )Nz/sbin/ldconfigz-pignore)errorsr   LD_LIBRARY_PATH:zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      K   | ]A  }t         j                  j                  t         j                  j                  |d              C yw)r   N)ospathexistsjoin).0r   s     g/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/triton/backends/nvidia/driver.py	<genexpr>zlibcuda_dirs.<locals>.<genexpr>(   s,     Sdrww~~bggll4@ASs   AA	)r   nvidialibcuda_path
subprocesscheck_outputdecode
splitlinessplitr   r   dirnamegetenvr   r   strany)	env_libcuda_pathlibslinelocslocdirsenv_ld_library_pathdirmsgs	            r   libcuda_dirsr.      sW    <<4444 !!""$4d#;<CC8CTD *.):UnPT>TDJJLUDU,01SBGGOOC 1D1))$564288=sPRPWPWP\P\]`bpPqArss
&C2SY>>KKMM>>SdSSXUXXSK V1 ts   	E#E>&E AE%E%c                  $    t         gt               S N)libdevice_dirr.        r   library_dirsr4   ,   s    +LN++r3   c                   $     e Zd Z fdZd Z xZS )	CudaUtilsc                 d    t        | d      st        t        |   |       | _        | j                  S )Ninstance)hasattrsuperr6   __new__r8   )cls	__class__s    r   r;   zCudaUtils.__new__8   s*    sJ' C8=CL||r3   c                 x   t        t        t        j                  j	                  t
        d            j                         dt               t        t              }|j                  a
|j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        y )Nzdriver.c
cuda_utilssrcnamer4   include_dirs	libraries)r   r   r   r   r   r!   	read_textr4   rC   rD   PyCUtensorMapload_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_tma_descriptor)selfmods     r   __init__zCudaUtils.__init__=   s    %RWW\\':67AAC%%
 ))??%(%>%>",/,L,L)$'$<$<!#&#:#: r3   )__name__
__module____qualname__r;   rN   __classcell__r=   s   @r   r6   r6   6   s    
;r3   r6   c                     | d   dk(  ry| j                  d      ryi ddddd	d
dddddddddddddddddddddddddd|    S )Nr   *CUdeviceptr
tensordescCUtensorMapi1int8_ti8i16int16_ti32int32_ti64int64_tu1uint8_tu8u16uint16_tu32uint32_tu64uint64_tfp16doublebf16fp32f32fp64	nvTmaDesc)
startswith)tys    r   	ty_to_cpprt   S   s    	!u|	}}\"hh 	y 	y	
 	y 	i 	i 	z 	z 	z 	 	 	 	x 	  	]!" 	#
 
r3   rf   rh   rj   )rk   rm   rn   ro   rp   	pack_fp16	pack_bf16	pack_fp32	pack_fp64iiiKKppOOOOOOc                    fd}fdfdfd ||j                               }t        |      D ci c]  \  }}||
 }}}dj                  |j                         D cg c]
  } |       c}      }t        |z   }	g }
|j                         D ]  } ||
        t        |
      D ci c]  \  }}||
 }}}t	        |      dkD  r)ddj                  d |j                         D              z   nd}g }|j                         D ]P  \  }}|d	k(  r|t        v r|j                  t        |    d
|        2|j                  t        |       d
|        R dj                  |      }g }|j                         D ]u  \  }}|d   dk(  r|j                  d| d       $|t        v r|j                  d| d       B|dk(  r|j                  d|        \|d	k7  sb|j                  d|        w t        t	        |            }d}|j                         D cg c]  \  }}|d   dk(  rd| d| d| d| d	 }}}|j                         D cg c]  \  }}|dk(  rd| d| d| d }}}|j                         D cg c])  \  }}|t        v rt        |    d| dt        |    d| d+ }}}|j                         D cg c]  \  }}|d	k7  sd|  }}}|j                  d        |j                  d!       d"t	        |      dkD  rd|z   nd d#dj                  |       d$|j                  |j                         D cg c]  \  }} |       d| d% c}}       d&|	 d'| d(|j                  |       d|j                  |       d|j                  |       d)t	        |      dkD  rddj                  |      z   nd d*}|S c c}}w c c}w c c}}w c c}}w c c}}w c c}}w c c}}w c c}}w )+Nc                    g }d}| D ]1  }t        |t              r|j                  d      r
r
|   nd }|dz  }t        j                  d|      }|j                  d      }|j                  d      }|j                  d      dz   }|J|j                  d|z          t        d|z        D ]  }	|j                  d        |j                  d	       n|j                  d
       t        |      D ]  }	|j                  d        t        |      D ]  }	|j                  d        !|j                  |       4 
r|t        
      k(  sJ |S )Nr   rW      ztensordesc<([^[>]*)\[([^]]*)\]   ,rU   r`   rY   rq   r^   )

isinstancer#   rr   rematchgroupcountappendrangelen)	signatureoutputtensordesc_idxsigmetar   dtypeshapendim_tensordesc_metas             r   _expand_signaturez(make_launcher.<locals>._expand_signature   sC     	#C#s#|(D:I~6t!#!CSIAA{{3'!+<MM#+. #1t8_ -e,-MM$'MM+.t )AMM%()t )AMM%() c"9	#< #nO8L&LLLr3   c                 j    t        | t              r| D ]  } ||        y |j                  |        y r0   )r   tupler   )r   r   x_flatten_signatures      r   r   z)make_launcher.<locals>._flatten_signature   s4    c5! ."1f-. MM#r3   c                     t        | t              r!dj                  t        |             }d| dS | d   dk(  ry| dv ryt	        |       S )Nr~   []r   rU   z	PyObject*	constexprrq   )r   r   r   maprt   )rs   val_extracted_types     r   r   z&make_launcher.<locals>._extracted_type   sT    b% ((334Cse1:a5C<++}r3   c                     t        | t              r!dj                  t        |             }d| dS | d   dk(  ry| dv ry| j	                  d      ryd	d
ddddddddd
t        |          S )N ()r   rU   Or   rW   dlbhiLBHIK)
rl   longrZ   r]   r_   ra   rc   rf   rh   rj   )r   r   r   r   rr   rt   )rs   r   	format_ofs     r   r   z make_launcher.<locals>.format_of   s    b% ''#i,-Cse1:a5C<++==&
 B- 	r3   r   r   z, c              3   ,   K   | ]  \  }}d |   yw)z&_argNr2   )r   r   rs   s      r   r   z make_launcher.<locals>.<genexpr>   s      LB5 Ls   r   z argrU   ptr_infoz.dev_ptr_arg_storagerq   z*tma_ptrz
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;zCUtensorMap* tma_ptrz = getTmaDesc(_argz); if (!tma_ptrz) return NULL;z _argz_storage = z(_argz);z&argz&global_scratchz&profile_scratcha  
#include "cuda.h"
#include <dlfcn.h>
#include <stdbool.h>
#include <stdlib.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>

typedef struct {
  PyObject_HEAD;
  _Alignas(128) CUtensorMap tensorMap;
} PyCUtensorMapObject;

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch, CUdeviceptr profile_scratchz) {
  void *params[] = { au   };
  if (gridX*gridY*gridZ > 0) {
    // 4 attributes that we can currently pass maximum
    CUlaunchAttribute launchAttr[4];
    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
    if (cuLaunchKernelExHandle == NULL) {
      cuLaunchKernelExHandle = getLaunchKernelExHandle();
    }
    CUlaunchConfig config;
    config.gridDimX = gridX * num_ctas;
    config.gridDimY = gridY;
    config.gridDimZ = gridZ;

    config.blockDimX = 32 * num_warps;
    config.blockDimY = 1;
    config.blockDimZ = 1;
    config.sharedMemBytes = shared_memory;
    config.hStream = stream;
    config.attrs = launchAttr;
    int num_attrs = 0;

    if (launch_pdl != 0) {
      CUlaunchAttribute pdlAttr = { .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1};
      launchAttr[num_attrs] = pdlAttr;
      ++num_attrs;
    }

    if (launch_cooperative_grid != 0) {
      CUlaunchAttribute coopAttr = { .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1};
      launchAttr[num_attrs] = coopAttr;
      ++num_attrs;
    }

    if (num_ctas != 1) {
      CUlaunchAttribute clusterAttr = {};
      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      clusterAttr.value.clusterDim.x = num_ctas;
      clusterAttr.value.clusterDim.y = 1;
      clusterAttr.value.clusterDim.z = 1;
      launchAttr[num_attrs] = clusterAttr;
      ++num_attrs;

      CUlaunchAttribute clusterSchedulingAttr = {};
      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      launchAttr[num_attrs] = clusterSchedulingAttr;
      ++num_attrs;
    }

    // num_ctas == 16 is non-portable. Does work for H100 and B200 tho
    config.numAttrs = num_attrs;
    if (num_ctas == 16) {
      CUDA_CHECK(cuFuncSetAttribute(
          function,
          CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED,
          1
      ));
    }

    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static PyObject* data_ptr_str = NULL;
static PyObject* py_tensor_map_type = NULL;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
  if (!ret) {
    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
    ptr_info.valid = false;
    goto cleanup;
  }
  if (!PyLong_Check(ret)) {
    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
    ptr_info.valid = false;
    goto cleanup;
  }
  ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
  if(!ptr_info.dev_ptr)
    return ptr_info;
  uint64_t dev_ptr;
  int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
  if (status == CUDA_ERROR_INVALID_VALUE) {
      PyErr_Format(PyExc_ValueError,
                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
      ptr_info.valid = false;
  } else if (status != CUDA_SUCCESS) {
      CUDA_CHECK(status);  // Catch any other cuda API errors
      ptr_info.valid = false;
  }
  ptr_info.dev_ptr = dev_ptr;
cleanup:
  Py_XDECREF(ret);
  return ptr_info;

}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {
  if (sizeof(CUtensorMap*) != 8) {
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }

if (Py_TYPE(obj) != (PyTypeObject*)py_tensor_map_type) {
    PyErr_Format(PyExc_TypeError, "object must be of type PyCUtensorMap, got %s", Py_TYPE(obj)->tp_name);
    return NULL;
}

  CUtensorMap* map = &((PyCUtensorMapObject*)obj)->tensorMap;
  uintptr_t align_128 = (uintptr_t)map & (128 - 1);
  if (align_128 != 0) {
    PyErr_Format(PyExc_ValueError, "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %ld", align_128);
    return NULL;
  }
  return map;
}

static void ensureCudaContext() {
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }
}

static uint16_t pack_fp16(double f) {
    uint16_t result;
    // from https://github.com/python/pythoncapi-compat
#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
#else
    PyFloat_Pack2(f, (unsigned char*)&result, 1);
#endif
    return result;
}

static uint16_t pack_bf16(double f) {
    float f32 = (float)f;
    uint32_t u32 = *(uint32_t*)&f32;
    return (uint16_t)(u32 >> 16);
}

static uint32_t pack_fp32(double f) {
    float f32 = (float)f;
    return *(uint32_t*)&f32;
}

static uint64_t pack_fp64(double f) {
    return *(uint64_t*)&f;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  int launch_pdl;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  PyObject *profile_scratch_obj = NULL;
  ;z
  if(!PyArg_ParseTuple(args, "aM  ", &gridX, &gridY, &gridZ,
                                           &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj, &profile_scratch_obj,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hooka   )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory;
  if (!PyArg_ParseTuple(kernel_metadata, "iii", &num_warps, &num_ctas, &shared_memory)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  CUdeviceptr global_scratch = 0;
  if (global_scratch_obj != Py_None) {
    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
    if (!global_scratch_info.valid) {
      return NULL;
    }
    global_scratch = global_scratch_info.dev_ptr;
  }

  CUdeviceptr profile_scratch = 0;
  if (profile_scratch_obj != Py_None) {
    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
    if (!profile_scratch_info.valid) {
      return NULL;
    }
    profile_scratch = profile_scratch_info.dev_ptr;
  }

  // raise exception asap
  z
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch, profile_scratchap  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  Py_RETURN_NONE;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  data_ptr_str = PyUnicode_InternFromString("data_ptr");
  if(data_ptr_str == NULL) {
    return NULL;
  }
  PyObject* driver_mod = PyImport_ImportModule("triton.backends.nvidia.driver");
  if (driver_mod == NULL) {
    return NULL;
  }
  py_tensor_map_type = PyObject_GetAttrString(driver_mod, "PyCUtensorMap");
  if (py_tensor_map_type == NULL) {
    return NULL;
  }

  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)values	enumerater   _BASE_ARGS_FORMATr   itemsFLOAT_STORAGE_TYPEr   rt   r   FLOAT_PACK_FUNCTION)	constantsr   r   r   expand_signaturer   srs   args_formatformatflat_signaturer   	args_listarg_decl_list	arg_declsinternal_args_listparamsnewline	ptr_decls	tma_declsfloat_storage_declsrA   r   r   r   s     `                   @@@r   make_launcherr      s   %N. ))9)9);<"+,<"=>$!QA>I>''93C3C3EFR9R=FGK,FN! 03/0"+N";<$!QA<I<PST]P^abPbtyy L)//:K LLLhjI M" <2##  $6r$:#;4s!CD  IbM?$qc!:;< 		-(I" 	22a5C<%%8&<=%%%%QCx&89;%%n5;%%QCj1	2 3y>"F G __&Ara5C< !#5aS1#=MaSPdeI  foetetev\a\]_a qc!3A3oaSWI  __&Ar## b!
"%s+6I"6M5NeTUSVVXY 
 '0oo&7MUQ2;LQCjMFM
MM#$
MM$%5pj EH  IR  ES  VW  EW  qu  xA  qA  ]_  p` `yy() {*v <<	@QRuq"OB'(aS2RST U  &x (Q R[P[ %\J <<	 
<<	 
<<#$% &r [^  _q  [r  uv  [v  sw  z~  zC  zC  DV  zW  sW  |~  r 2}PCb
 JM ?F =8

 Nh Ss5   O*O"7O')!O- O3.O9O?#O?Pc              #   $   K   | ]  }||f 
 y wr0   r2   )r   r   s     r   r   r   \  s     :1A:s      
      	   c           
      B   |L| j                   g| j                  | j                  | j                  dk(  | j                  | j                  S |d   }|d   }|d   }|d   }|d   }| j                  }| j                  }|d   dk(  sJ | j                  dk(  rdnd	}	|rt	        |      }|dxx   d
z  cc<   t
        j                  j                  j                  j                  j                  | j                   j                         ||t        |   ||||	      }
|
g||S )Nnanswizzle	elem_size	elem_type
block_size
fp4_paddedr   r|   r   r}   )baser   stridespaddinglisttritonruntimedriveractiveutilsrK   data_ptrTMA_DTYPE_DEVICE_TO_HOST)argmetadatar   r   r   r   r   r   r   r   cu_tensor_maps              r   make_tensordesc_argr   b  s2    c399cs{{cCKK54Hc399cWZWbWbccy!G%I%I,'J,'JIIEkkG2;!;;%'aQGUb	Q	NN))0066JJ +	M ,E,G,,r3   c           
      v    t        d |j                         D              }|s S t        t        |j                               D cg c]*  \  }}t	        |t
              s|j                  d      s)|, c}}      rt              t              k(  sJ sd gt              z   fd}|S c c}}w )Nc              3   b   K   | ]'  }t        |t              xr |j                  d        ) yw)rW   N)r   r#   rr   )r   r   s     r   r   z)wrap_handle_tensordesc.<locals>.<genexpr>  s)     rX[jc2Ss~~l7SSrs   -/rW   c                      t        | d t               }d}t        | t        d        D ]>  \  }}|v r$|j                  t	        ||                |dz  }.|j                  |       @  | S )Nr   r|   )r   _BASE_ARGS_FORMAT_LENr   extendr   r   )args
final_argsr   r   r   launchertensordesc_indicesr   s        r   innerz%wrap_handle_tensordesc.<locals>.inner  s    $5 567
%:%; <= 	'FAs&&!!"5c?>;Z"[\!#!!#&	' $$r3   )r$   r   setr   r   r#   rr   r   )r   r   r   has_tensor_desc_argr   r   r   r   s   ` `    @r   wrap_handle_tensordescr     s    r_h_o_o_qrr"9#3#3#56pvq#*S#:NSVSaSabnSopr#o"6#>P:Q"QQQ&3'9#::	% L! 	qs   	B5
"B5
4B5
c                       e Zd Zd Zd Zy)CudaLauncherc                    t        d      rj                  n	t               }fd}|j                         D ci c]  \  }} ||      | }}}j                  j                         D ci c]  \  }}||
 }}}t        |dd       }t        |||      t        dt               t        t              }	t        |dd      | _        t        |	j                  ||      | _        |j                  | _        |j                  | _        |j                   | _        |j"                  | _        |j$                  | _        |j&                  | _        y c c}}w c c}}w )Nr   c                 t    t        | t              r&j                  j                  j	                  |       fS | S r0   )r   r#   fn	arg_namesindex)r   rA   s    r   <lambda>z'CudaLauncher.__init__.<locals>.<lambda>  s-    Z3=OSVV--33A69 UV r3   r   __triton_launcherr@   num_ctasr|   )r9   r   dictr   r   getattrr   r   r4   rC   rD   r   r   launchglobal_scratch_sizeglobal_scratch_alignprofile_scratch_sizeprofile_scratch_alignlaunch_cooperative_grid
launch_pdl)
rL   rA   r   r   arg_idxidxvaluer   r   rM   s
    `        r   rN   zCudaLauncher.__init__  s'   %,S+%>CMMDF	V;D??;LMZS%WS\5(M	M25--2E2E2GHJCS%ZH	H!(,=tDIy/B%$%%
  *a8,SZZOT#+#?#? $,$A$A!$,$A$A!%-%C%C"'/'G'G$"--' NHs   E/Ec                 .     fd} | j                    j                  t        j                        } | j                   j
                  t        j                        }	  j                  | j                   j                  ||	g	|  y )Nc                 x    | dkD  r4z  z  }|	j                   z  | z  }|j                         } |||
      S y Nr   )r   get)sizealign	allocator	grid_size
alloc_sizealloc_fngridXgridYgridZrL   streams         r   allocate_scratchz/CudaLauncher.__call__.<locals>.allocate_scratch  sH    ax!EME1	&6=
$==?
E6::r3   )
r   r   r   
_allocatorr  r  _profile_allocatorr   r  r  )
rL   r  r  r  r  functionr   r  global_scratchprofile_scratchs
   `````     r   __call__zCudaLauncher.__call__  s    	 	 *$*B*BDD]D]_j_u_uv*4+D+DdF`F`+6+I+IKE5%4;W;WY]YhYh"O	<6:	<r3   N)rO   rP   rQ   rN   r  r2   r3   r   r   r     s    .0<r3   r   c                   b     e Zd Z fdZd Zd Zd Zed        Zde	de	fdZ
d	 Zd
 Zd Z xZS )
CudaDriverc                 V    t               | _        t        | _        t        |           y r0   )r6   r   r   launcher_clsr:   rN   )rL   r=   s    r   rN   zCudaDriver.__init__  s    [
(r3   c                 ~    | j                         }| j                  |      }|d   dz  |d   z   }d}t        d||      S )Nr   r   r|       cuda)get_current_deviceget_device_capabilityr   )rL   device
capability	warp_sizes       r   get_current_targetzCudaDriver.get_current_target  sK    ((*//7
]R'*Q-7
	Y77r3   c                 J    dd l }|j                  d| j                               S )Nr   r#  )torchr&  r$  rL   r+  s     r   get_active_torch_devicez"CudaDriver.get_active_torch_device  s    ||FD$;$;$=>>r3   c                 "    dd l }|j                  S r
  )r+  r#  r,  s     r   get_device_interfacezCudaDriver.get_device_interface  s    zzr3   c                      	 dd l } | j                  j                         xr | j                  j                  d u S # t
        $ r Y yw xY w)Nr   F)r+  r#  is_availableversionhipImportError)r+  s    r   	is_activezCudaDriver.is_active  sC    	::**,L%--2C2Ct2KL 		s   7: 	AArs   returnc                     t        |      S r0   )rt   )rL   rs   s     r   map_python_to_cpp_typez!CudaDriver.map_python_to_cpp_type  s    }r3   c                     ddl m} |S )Nr   )do_bench)triton.testingr:  )rL   r:  s     r   get_benchmarkerzCudaDriver.get_benchmarker  s
    +r3   c                 b    dd l }d}|j                  t        |dz        |j                  d      S )Nr   i      r#  )r   r&  )r+  emptyint)rL   r+  
cache_sizes      r   get_empty_cache_for_benchmarkz(CudaDriver.get_empty_cache_for_benchmark  s.    
 '
{{3zQ/uyy{PPr3   c                 $    |j                          y r0   )zero_)rL   caches     r   clear_cachezCudaDriver.clear_cache  s    r3   )rO   rP   rQ   rN   r)  r-  r/  staticmethodr5  r#   r8  r<  rB  rF  rR   rS   s   @r   r  r    sN    
8?    Qr3   r  ),	functoolsr   r   r   r   pathlibr   r   triton.runtime.buildr   triton.runtimer   triton.backends.compilerr   triton.backends.driverr   r   r!   realpath__file__r   rC   r1   rD   rF   	lru_cacher.   r4   objectr6   rt   r   r   r   r   r   r   r   r   r   r   r   r   r  r2   r3   r   <module>rR     sw    	   	   8 & . ,
''//"''**84
5Wi01We,	  . , ,; ;:
4     $ -. Yz  :b	::         $-N2(<6 (<V/ /r3   