o
    灛iF                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ ejejeZejedgZdd	 Ze  d
d Zdd ZG dd deZdd Zdd ZG dd deZG dd deZdS )    N)Path)_build)get_cache_manager)	GPUTarget)	GPUDriverincludec                    s   dd l }| dkrd S dd lddlm}m}m}mm m} G  fdddj	}
|||||||}zdj}W n   Y d S | g|_||_dd }	fd	d
}
|||
|	rqt|	S d S )Nr   Linux)c_charc_intc_size_tc_void_pc_char_pPOINTERc                       s   e Zd Zdfd fgZdS )z8_find_already_mmapped_dylib_on_linux.<locals>.DlPhdrInfo	dlpi_addr	dlpi_nameN)__name__
__module____qualname___fields_ )r   r   r   \/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/triton/backends/amd/driver.py
DlPhdrInfo   s    r   z	libc.so.6i      c                    s@   | j j}tt|}|jv r ||tt| dS dS )Nr   r   )	contentsr   r   osfsdecodenamememmoveminlen)infosizedatar   p)ctypeslib_namemax_path_lengthr   r   callback2   s   
z6_find_already_mmapped_dylib_on_linux.<locals>.callback)platformsystemr$   r	   r
   r   r   r   r   	Structure	CFUNCTYPECDLLdl_iterate_phdrargtypesrestypecreate_string_bufferr   r   	string_at)r%   r(   r	   r
   r   r   r   
callback_tr-   pathr'   r   )r   r   r$   r%   r&   r   $_find_already_mmapped_dylib_on_linux   s&    
	r4   c                     s  d t d} | r |  rt j| r| S td|  d  t }|r8t j|r.|S td| d  g }dd l}| }|	 }|j
rN|g| }|D ]}t j|dd	 }t j|re|  S || qPt d
}|r|dD ]}t j| }	t j|	r|	  S ||	 qwtddg }
 fdd|
 D }|D ]}t j|r|  S || qt jd }t j|r|S || td  d| )Nzlibamdhip64.soTRITON_LIBHIP_PATHzTRITON_LIBHIP_PATH 'z' does not point to a valid zmemory mapped 'z'' in process does not point to a valid r   torchlibLD_LIBRARY_PATH:z/sbin/ldconfigz-pc                    s&   g | ]}|   r| d  qS ))stripendswithsplit).0liner%   r   r   
<listcomp>p   s   & z2_get_path_to_hip_runtime_dylib.<locals>.<listcomp>z/opt/rocm/lib/zcannot locate z after attempted paths )r   getenvr<   r3   existsRuntimeErrorr4   sitegetsitepackagesgetusersitepackagesENABLE_USER_SITEjoinappendr=   
subprocesscheck_outputdecode
splitlines)env_libhip_pathmmapped_pathpathsrE   site_packages	user_siter3   env_ld_library_pathdflibslocsloccommon_install_pathr   r@   r   _get_path_to_hip_runtime_dylib@   sP   



r[   c              	   C   s&  t | d }t|}|| d}|d u rzt T}tj	
|d}t|d}||  W d    n1 s;w   Y  t|||g tg }t|d}|j| | ddd}W d    n1 sfw   Y  W d    n1 suw   Y  dd l}	|	j||}
|	j|
}|
j| |S )	Nzutf-8z.sozmain.cwrbT)binaryr   )hashlibsha256encode	hexdigestr   get_filetempfileTemporaryDirectoryr   r3   rI   openwriter   include_dirputreadimportlib.utilutilspec_from_file_locationmodule_from_specloaderexec_module)srcr   keycache
cache_pathtmpdirsrc_pathrV   so	importlibspecmodr   r   r   compile_module_from_src   s(   
r{   c                       s$   e Zd Z fddZdd Z  ZS )HIPUtilsc                    s"   t | dstt| | | _| jS )Ninstance)hasattrsuperr|   __new__r}   )cls	__class__r   r   r      s   
zHIPUtils.__new__c                 C   sH   t  }ttjtd }|d|d}t|d}|j	| _	|j
| _
d S )Nzdriver.cz/*py_libhip_search_path*/r   	hip_utils)r[   r   r   r3   rI   dirname	read_textreplacer{   load_binaryget_device_properties)selflibhip_pathrq   rz   r   r   r   __init__   s   
zHIPUtils.__init__)r   r   r   r   r   __classcell__r   r   r   r   r|      s    r|   c                 C   8   | d dkrdS ddddddd	d
dddddddd|  S )Nr   *hipDeviceptr_tint32_tint8_tint16_tint64_tuint32_tuint8_tuint16_tuint64_tfloatdoublei1i8i16i32i64u1u8u16u32u64fp16bf16fp32f32fp64r   tyr   r   r   	ty_to_cpp   (   r   c                    sV  t |}ddd | D }dd  dd d fd	d
| D }d| }t |dkr?dddd | D  nd}t }	fdd
| D }
d|	 dt |dkr]d| nd dddd |
D  d| dd fdd
| D  d| d| dddd
 | D  dt |dkrdddd | D  nd d}|S )N, c                 s   s&    | ]\}}t | d | V  qdS )z argN)r   r>   ir   r   r   r   	<genexpr>   s   $ z make_launcher.<locals>.<genexpr>c                 S   r   )Nr   r   	PyObject*r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _extracted_type   r   z&make_launcher.<locals>._extracted_typec                 S   s"   dddddddddd	d
dd|  S )NOrV   rU   lbhr   BHIK)r   r   r   longr   r   r   r   r   r   r   r   r   r   r   r   r   	format_of   s   z make_launcher.<locals>.format_of c                    s   g | ]} |qS r   r   )r>   r   )r   r   r   r   rA          z!make_launcher.<locals>.<listcomp>	iiiKKOOOOr   c                 s   s    | ]
\}}d | V  qdS )z&_argNr   r   r   r   r   r      s    c                    s   g | ]}| vr|qS r   r   r>   r   )	constantsr   r   rA      r   a;  
#define __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#include <Python.h>
#include <dlfcn.h>
#include <stdbool.h>
#include <dlfcn.h>

// The list of paths to search for the HIP runtime library. The caller Python
// code should substitute the search path placeholder.
static const char *hipLibSearchPaths[] = {"a  "};

// The list of HIP dynamic library symbols and their signature we are interested
// in this file.
#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                     \
  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                     \
  FOR_EACH_ERR_FN(hipModuleLaunchKernel, hipFunction_t f,                     \
                  unsigned int gridDimX, unsigned int gridDimY,               \
                  unsigned int gridDimZ, unsigned int blockDimX,              \
                  unsigned int blockDimY, unsigned int blockDimZ,             \
                  unsigned int sharedMemBytes, hipStream_t stream,            \
                  void **kernelParams, void **extra)                          \
  FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data,                         \
                  hipPointer_attribute attribute, hipDeviceptr_t ptr)

// The HIP symbol table for holding resolved dynamic library symbols.
struct HIPSymbolTable {
#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                             \
  hipError_t (*hipSymbolName)(__VA_ARGS__);
#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                             \
  const char *(*hipSymbolName)(__VA_ARGS__);

  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
};

static struct HIPSymbolTable hipSymbolTable;

bool initSymbolTable() {
  // Use the HIP runtime library loaded into the existing process if it exits.
  void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
  if (lib) {
    // printf("[triton] chosen loaded libamdhip64.so in the process\n");
  }

  // Otherwise, go through the list of search paths to dlopen the first HIP
  // driver library.
  if (!lib) {
    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
    for (int i = 0; i < n; ++i) {
      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
      if (handle) {
        lib = handle;
        // printf("[triton] chosen %s\n", hipLibSearchPaths[i]);
      }
    }
  }
  if (!lib) {
    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
    return false;
  }

  // Resolve all symbols we are interested in.
  dlerror(); // Clear existing errors
  const char *error = NULL;
#define QUERY_EACH_FN(hipSymbolName, ...)                                     \
  *(void **)&hipSymbolTable.hipSymbolName = dlsym(lib, #hipSymbolName);       \
  error = dlerror();                                                          \
  if (error) {                                                               \
    PyErr_SetString(PyExc_RuntimeError,                                       \
                    "cannot query " #hipSymbolName " from libamdhip64.so");   \
    dlclose(lib);                                                             \
    return false;                                                             \
  }

  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)

  return true;
}

static inline void gpuAssert(hipError_t code, const char *file, int line)
{
   if (code != HIP_SUCCESS)
   {
      const char* prefix = "Triton Error [HIP]: ";
       const char* str = hipSymbolTable.hipGetErrorString(code);
      char err[1024] = {0};
      snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str );
      PyErr_SetString(PyExc_RuntimeError, err);
   }
}

#define HIP_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t functionz>) {
  // printf("_launch hip kernel\n");
  void *params[] = { c                 s   s    | ]}d | V  qdS )z&argNr   r   r   r   r   r   M  s    zw };
  if (gridX*gridY*gridZ > 0) {
      HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, aw  *num_warps, 1, 1, shared_memory, stream, params, 0));
    }
  }

typedef struct _DevicePtrInfo {
    hipDeviceptr_t dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    hipError_t status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == hipErrorInvalidValue) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr;
    Py_DECREF(ret);
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  return ptr_info;
}

static PyObject* launch(PyObject* self, PyObject* args) {
   // printf("launch\n");
  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
   c                    s$   g | ]\}} | d | dqS )z _arg; r   r   )r   r   r   rA     s   $ z
  if(!PyArg_ParseTuple(args, "z", &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook a=  )) {
    return NULL;
  }

  // extract kernel metadata
  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    return NULL;
  }
  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }


  // raise exception asap
  r   c                 S   s>   g | ]\}}|d  dkrd| d| d| d| d	ndqS )r   r   zDevicePtrInfo ptr_infoz = getPointer(_argr   z); if (!ptr_infoz.valid) return NULL;r   r   r   r   r   r   rA     s   > z;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_functionc                 s   s6    | ]\}}|d  dkrd| dnd| V  qdS )r   r   ptr_infoz.dev_ptr_argNr   r   r   r   r   r     s   4 an  );

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }

  if(PyErr_Occurred()) {
    return NULL;
  }
  // return None
  Py_INCREF(Py_None);
  return Py_None;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  if (!initSymbolTable()) {
    return NULL;
  }
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)r   rI   itemsvaluesr[   keys)r   	signatureids	warp_size
start_desc	arg_declsargs_formatformat	args_listr   paramsrq   r   )r   r   r   r   make_launcher   sP   ,
]_a        4  *5  `r   c                   @   s   e Zd Zdd Zdd ZdS )HIPLauncherc                    s   dt dr
jjnt i}t drjnt }fdd  fdd| D } fddj D }t||||j	t
d	}|j| _d S )
Nids_of_const_exprsfnr   c                    s   t | tr jj| S | S N)
isinstancestrr   	arg_namesindex)r   )rq   r   r   <lambda>  s    z&HIPLauncher.__init__.<locals>.<lambda>c                       i | ]	\}} ||qS r   r   r>   rr   valuecst_keyr   r   
<dictcomp>      z(HIPLauncher.__init__.<locals>.<dictcomp>c                    r   r   r   r   r   r   r   r     r   __triton_launcher)r~   r   
constexprstupler   dictr   r   r   r   r{   launch)r   rq   metadatar   r   r   rz   r   )r   rq   r   r     s   
zHIPLauncher.__init__c                 O   s   | j |i | d S r   )r   )r   argskwargsr   r   r   __call__  s   zHIPLauncher.__call__N)r   r   r   r   r   r   r   r   r   r     s    
r   c                       s0   e Zd Z fddZedd Zdd Z  ZS )	HIPDriverc                    s   t    t | _t| _d S r   )r   r   r|   utilsr   launcher_cls)r   r   r   r   r     s   

zHIPDriver.__init__c                  C   s   dd l } | jjd uS )Nr   )r6   versionhip)r6   r   r   r   	is_active  s   zHIPDriver.is_activec                 C   s:   |   }| j|}|d }|d }td|dd |S )NarchwarpSizer   r9   r   )get_current_devicer   r   r   r=   )r   devicedevice_propertiesr   r   r   r   r   get_current_target  s
   zHIPDriver.get_current_target)r   r   r   r   staticmethodr   r   r   r   r   r   r   r     s
    
r   )	functoolsr   r_   rK   rd   pathlibr   triton.runtime.buildr   triton.runtime.cacher   triton.backends.compilerr   triton.backends.driverr   r3   r   realpath__file__rI   rh   r4   	lru_cacher[   r{   objectr|   r   r   r   r   r   r   r   r   <module>   s.    0
>  