o
    iJ                     @   sx  d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 de j
fddZde j
fddZde j
fd	d
ZdeddfddZde j
fddZde j
fddZde j
fddZdee fddZ				d,dededee dee dee dedeeef fddZG dd  d ZG d!d" d"Z	d-d#eeef d$eee  deeeed"f f fd%d&Z	d.d'ed(ed)edefd*d+ZdS )/    N)AnyOptionalUnion)_get_device_indexreturnc                  C   s\   t jdkrtdtjjd  d} ntd} | j| _| j	| _
| j| _| j| _| j| _| S )Nwin32	amdhip64_r   .dllzlibamdhip64.so)sysplatformctypesCDLLtorchversionhiphipGetErrorStringcuGetErrorStringhipModuleLoadDatacuModuleLoadDatahipModuleGetFunctioncuModuleGetFunctionhipModuleLaunchKernelcuLaunchKernelhipFuncSetAttributecuFuncSetAttribute)lib r   S/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/torch/cuda/_utils.py_get_hip_runtime_library   s   

r   c                   C   s   t jdkr
tdS tdS )Nr   z
nvcuda.dllzlibcuda.so.1)r
   r   r   r   r   r   r   r   _get_cuda_library   s   


r   c                   C      t jjrt S t S N)r   r   r   r   r   r   r   r   r   _get_gpu_runtime_library    s   r"   resultc                 C   sR   | dkrd S t  }t }|| t | |jd ur |j nd}td| )Nr   Unknown CUDA errorCUDA error: )r   c_char_pr"   r   byrefvaluedecodeRuntimeError)r#   err_strlibcudaerror_messager   r   r   _check_cuda(   s   r.   c                  C   s   t jdkr ddtjjd dtjjd g} td|  d}ntd}|j|_	|j
|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_|S )	Nr    0r      hiprtcr	   zlibhiprtc.so)r
   r   joinr   r   r   r   r   hiprtcGetErrorStringnvrtcGetErrorStringhiprtcCreateProgramnvrtcCreateProgramhiprtcDestroyProgramnvrtcDestroyProgramhiprtcCompileProgramnvrtcCompileProgramhiprtcGetCodeSizenvrtcGetPTXSizehiprtcGetCodenvrtcGetPTXhiprtcGetProgramLogSizenvrtcGetProgramLogSizehiprtcGetProgramLognvrtcGetProgramLoghiprtcAddNameExpressionnvrtcAddNameExpressionhiprtcGetLoweredNamenvrtcGetLoweredName)version_strr   r   r   r   _get_hiprtc_library4   s   
"
rI   c               	   C   sr   t tjjdd } tjdkrd|  dg}nd|  dg}|D ]}zt|W   S  t	y4   Y q!w t	d)	N.r   r   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
intr   r   cudasplitr
   r   r   r   OSError)major_version
nvrtc_libslib_namer   r   r   _get_nvrtc_libraryI   s   

rS   c                   C   r    r!   )r   r   r   rI   rS   r   r   r   r   _get_gpu_rtc_library\   s   rT   c                     s>   ddl m} m} dh  fdd|D }tjjr||  |S )z
    Get HIPCC/NVCC flags that are compatible with NVRTC compilation.

    Returns:
        List of HIPCC/NVCC flags that can be safely used with NVRTC.
    r   )COMMON_HIPCC_FLAGSCOMMON_NVCC_FLAGSz--expt-relaxed-constexprc                    s   g | ]}| vr|qS r   r   .0flagnvrtc_unsupported_flagsr   r   
<listcomp>s   s    z1_get_gpu_rtc_compatible_flags.<locals>.<listcomp>)torch.utils.cpp_extensionrU   rV   r   r   r   extend)rU   rV   compatible_flagsr   rZ   r   _get_gpu_rtc_compatible_flagse   s   

r`   Fkernel_sourcekernel_namecompute_capabilitycuda_include_dirsnvcc_optionsauto_pchc              	      s  ddl }t d dtddf fdd}| d}|du r8|j|j }	|jjr0|	j	 }n|	j
 |	j }g }
|jjrI|
d|   n
|
d	|   dd
lm} |d}|D ]}|
d|   q_|r}|D ]}|
d|   qp|rt|jjdksJ d|du rg }|d |r|D ]
}|
|d qt }|
dd |D  t|
}tj| |
 }t }|t||| d ddd |d}||| |||}| krt }|t| t|j}|| td|j   t }|!|t| t|j}|"|| t }|#||t| |jdurJ|j  }nd}$t| |jjr\|j%n|j}||fS )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC
        auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)

    Returns:
        Tuple[bytes, str]: The compiled PTX code and mangled kernel name
    r   Nr#   r   c                    sL   |  kr$t  }| t | |jd ur|j nd}td| d S )Nr$   r%   )r   r&   r5   r'   r(   r)   r*   )r#   r+   r-   NVRTC_SUCCESSlibnvrtcr   r   check_nvrtc   s   

z#_nvrtc_compile.<locals>.check_nvrtcutf-8z--offload-arch=z--gpu-architecture=sm_)include_pathsrM   z-Iz12.8zPCH requires CUDA 12.8+z--pchc                 S   s   g | ]}| d qS )rk   )encoderW   r   r   r   r\      s    z"_nvrtc_compile.<locals>.<listcomp>z.cuzKernel compilation failed:
r/   )&
torch.cudarT   rL   rm   rM   get_device_propertiescurrent_devicer   r   gcnArchNamemajorminorappendr]   rl   strr`   r^   lenr   r&   c_void_pr7   r'   rE   r;   c_size_trA   create_string_bufferr(   rC   r*   r)   r=   r?   rG   r9   raw)ra   rb   rc   rd   re   rf   r   rj   source_bytespropsoptionsrl   cuda_include_paths	cuda_path	directoryoptionnvrtc_compatible_flagsnum_optionsoptions_arrayprogc_kernel_namereslog_sizelogptx_sizeptxc_mangled_namemangled_name	ptx_bytesr   rg   r   _nvrtc_compile}   s   




r   c                   @   s2   e Zd ZdejddfddZdeddfdd	ZdS )
_CudaModulemoduler   Nc                 C   s   || _ i | _d S r!   )_module_kernels)selfr   r   r   r   __init__  s   
z_CudaModule.__init__name_CudaKernelc              
   C   s   || j v r
| j | S ddlm} | }t }zt|t|| j|	d t
|| j}|| j |< |W S  tyJ } z	td| d|d }~ww )Nr   )r"   rk   zNo kernel named 'z' in this module)r   torch.cuda._utilsr"   r   rw   r.   r   r'   r   rm   r   r*   AttributeError)r   r   r"   r,   funckernelerrr   r   r   __getattr__  s$   


z_CudaModule.__getattr__)__name__
__module____qualname__r   rw   r   ru   r   r   r   r   r   r     s    r   c                   @   s   e Zd ZdZdejdejddfddZ						dd
eeeef deeeef de	e
 dede	e ddfddZdeddfddZdS )r   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    r   r   r   Nc                 C   s   || _ || _d| _d S )Nr   )r   r   _max_shared_mem_bytes)r   r   r   r   r   r   r   3  s   
z_CudaKernel.__init__   r   r   r   gridblockargs
shared_memstreamc                 C   s  ddl }|jj }|sg }g }g }	|D ]Y}
t|
|jr?|
js*|
jr&|
 s*t	dt
|
 }|| |	t
| qt|
trRt
|
}|	t
| qt|
tret
|
}|	t
| qtdt|
 t
jt|	  }t|	D ]\}}
t
|
t
j||< qz|du rddl}|j }|dkr| jdks|| jkr| jdkrdnd| j d}td	| d
| dt|| j|d |d |d |d |d |d ||j|d dS )a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:    znot configuredzonly z bytes configuredzKernel requires z' bytes of shared memory (>= 48KB), but ze. Call kernel.set_shared_memory_config(shared_mem) after compilation and before launching the kernel.r   r1   ) r   rM   _utilsr"   
isinstanceTensoris_cudais_cpu	is_pinned
ValueErrorr   rw   data_ptrrt   r'   rL   c_intfloatc_double	TypeErrortyperv   	enumeratecastrn   current_streamr   r*   r.   r   r   _as_parameter_)r   r   r   r   r   r   r   r,   processed_argsc_argsargptrr   r   c_args_arrayiconfigured_msgr   r   r   __call__8  sl   






z_CudaKernel.__call__shared_mem_bytesc                 C   s   |dk r	|| _ d S t }tj }tjjr|jdkrdnd}nt|dd}||kr4t	d| d| dd	}t
|| j|| || _ d S )
Nr   gfx950i   i  shared_memory_per_block_optinzRequested shared memory (z bytes) exceeds device limit (z= bytes). Consider reducing block size or shared memory usage.   )r   r"   r   rM   ro   r   r   rq   getattrr*   r.   r   r   )r   r   r,   device_propsmax_shared_mem+cudaFuncAttributeMaxDynamicSharedMemorySizer   r   r   set_shared_memory_config  s4   

z$_CudaKernel.set_shared_memory_config)r   r   Nr   N)r   r   r   __doc__r   rw   r   tuplerL   r   listr   r   r   r   r   r   r   r   .  s,    
ar   r   kernel_namesc           	   	   C   s   ddl }t }t| tr| d} t }|j }| t	|
t||  W d   n1 s2w   Y  |s=t|S i }|D ]}t }t	|t|||d t||||< qA|S )a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nrk   )rn   r"   r   ru   rm   r   rw   rM   r   r.   r   r'   r   r   r   )	r   r   r   r,   r   r   kernelsr   r   r   r   r   _cuda_load_module  s*   


r   deviceoptional	allow_cpuc                 C   s   t | tr| S t | trt| } t | tjr2|r&| jdvr%td|  n| jdkr2td|  tj sAt | tj	jrA| j
S t| ||S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )rM   cpuz(Expected a cuda or cpu device, but got: rM   z!Expected a cuda device, but got: )r   rL   ru   r   r   r   r   jitis_scriptingrM   idx_torch_get_device_index)r   r   r   r   r   r   r     s   





r   )NNNFr!   )FF)r   r
   typingr   r   r   r   torch._utilsr   r   r   r   r   r"   rL   r.   rI   rS   rT   r   ru   r`   boolr   bytesr   r   r   dictr   r   r   r   r   <module>   sl    	

  


1