o
    灛i3                     @   s  d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ e
 defdd	Ze
 d
d Ze
 defddZe
ddd Ze	ddG dd dZG dd deZdS )    )BaseBackend	GPUTarget)irpassesllvmnvidia)	dataclassN)AnyTupleOptional)Pathbinaryc                 C   s   t jd|   ddt jt jtd| g}|D ]5}t j|rPt j	|rPt
j|dgt
jd}|d urPtjd|dtjd	}|d urP||d
f  S qtd|  )NTRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )osenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r   pathsr   resultversion r.   a/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/triton/backends/nvidia/compiler.py_path_to_binary   s   r0   c                  C   s    t tdd dgd} | S )Nptxasr   r   r   )r"   r#   r0   r'   )r-   r.   r.   r/   get_ptxas_version!   s   r2   returnc                 C   sZ   t | tsJ tt| d\}}|dkrd| S |dkr!d| S |dkr)d| S td)	zK
    Get the highest PTX version supported by the current CUDA driver.
    .   P      F   
   ?   z'Triton only support CUDA 10.0 or higher)
isinstancestrmapintsplitr*   )cuda_versionmajorminorr.   r.   r/   ptx_get_version'   s   rC   c                 C   s@   t | d}t|  W  d    S 1 sw   Y  d S )Nrb)openhashlibsha256read	hexdigest)r   fr.   r.   r/   	file_hash7   s   $rK   T)frozenc                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZe	e ed< d	Z
eed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dd Zdd ZdS )CUDAOptions   	num_warpsr   num_ctas   
num_stagesNmaxnreg)r   r   r   cluster_dimsptx_versionTenable_fp_fusionFallow_fp8e4nvallow_fp8e4b15tf32default_dot_input_precision)rY   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namec                 C   s   t tjd }| jd u ri nt| j}|dd s%tdt|d |d< t	
| dt|  | jdkr?| j| jd @ dksCJ dd S )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcr_   r   r   znum_warps must be a power of 2)r   r   parentr_   dictr   r   getenvr<   object__setattr__tupleitemsrO   )selfdefault_libdirr_   r.   r.   r/   __post_init__Q   s    zCUDAOptions.__post_init__c                 C   sX   t | j}tdd t|d D |d< ddd t| D }t|d	 S )Nc                 s   s     | ]\}}|t |fV  qd S N)rK   ).0kvr.   r.   r/   	<genexpr>\   s    z#CUDAOptions.hash.<locals>.<genexpr>r_   _c                 S   s   g | ]\}}| d | qS )-r.   )rq   namevalr.   r.   r/   
<listcomp>]       z$CUDAOptions.hash.<locals>.<listcomp>r   )
rg   __dict__rk   sortedr   rl   rF   rG   encoderI   )rm   	hash_dictkeyr.   r.   r/   hashZ   s   
zCUDAOptions.hash)__name__
__module____qualname__rO   r>   __annotations__rP   rR   rS   r   rT   rk   rU   rV   boolrW   rX   rZ   r<   r]   r
   r^   r_   rg   r`   rb   ro   r   r.   r.   r.   r/   rM   =   s$   
 	rM   c                       s   e Zd ZedefddZdeddf fddZdefdd	Zd
d Z	dd Z
dd Zedd Zedd Zedd Zedd Zedd Zdd Ze dd Z  ZS )CUDABackendtargetc                 C   s
   | j dkS )Nra   )backend)r   r.   r.   r/   supports_targetc   s   
zCUDABackend.supports_targetr3   Nc                    s.   t  | |j| _t| jtsJ d| _d S )Ncubin)super__init__arch
capabilityr;   r>   
binary_ext)rm   r   	__class__r.   r/   r   g   s   
zCUDABackend.__init__c                    sX    fddt j D }| jdk|d< | jdk |d< | jdkr!dnd|d	< t d
i |S )Nc                    s   i | ]}| v r| | qS r.   r.   )rq   rr   optsr.   r/   
<dictcomp>n   rz   z-CUDABackend.parse_options.<locals>.<dictcomp>Y   rW   Z   rX   i   @r   r^   r.   )rM   __dataclass_fields__keysr   )rm   r   argsr.   r   r/   parse_optionsm   s
   zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r      )rO   rP   sharedrT   )rm   metadatar.   r.   r/   pack_metadatat   s   zCUDABackend.pack_metadatac                 C   s<   dd l m  m  m} d| jdkr|ji}|S |ji}|S )Nr   convert_custom_typesr6   )triton.language.extra.cudalanguageextrara   r   convert_custom_float8_sm80convert_custom_float8_sm70)rm   ra   codegen_fnsr.   r.   r/   get_codegen_implementation~   s   z&CUDABackend.get_codegen_implementationc                 C   s   t | d S rp   )r   load_dialects)rm   ctxr.   r.   r/   r      s   zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| tj	| tj
| tj| tj| tj| tj| ||  | S rp   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_combineadd_canonicalizeradd_reorder_broadcastadd_cseadd_licmadd_symbol_dcerun)modr   optpmr.   r.   r/   	make_ttir   s   
zCUDABackend.make_ttirc                 C   s  t  }|jd ur|jd |_|jd |_|jd |_t| j}|	  t
j|d| |jd|j t
j| |d dkrFt
j| t j
j|| t
j| t
j| t
j| t
j| t
j||dk t
j| |d dkrt
j| t
j||j t
j| t
j||dk t
j| t
j| t
j| t
j| t
j | |d d	krt j
j!| t j
j"| t
j#| |$|  |j|j|jf|d
< | S )Nr   r   r   zcuda:    r9      r6   	   rT   )%r   ClusterInforT   clusterDimXclusterDimYclusterDimZr   r   r   r   r   r   add_convert_to_ttgpuirrO   rP   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r    add_combine_tensor_select_and_ifadd_pipelinerR   add_prefetchadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringr   r   )r   r   r   r   cluster_infor   r.   r.   r/   
make_ttgir   sF   

zCUDABackend.make_ttgirc                 C   s  |  d}|d ur|d  |9  < | }t|j}|  tjj| tj	| tj
| tj
| tj| tjj|| tjj| tj
| tj| tj| tj| tjdddkrrtj| || t  t }t||}t| |j d ur|! D ]}	|	" s|	# r|	$|j  q|j%rdd |j%D }
t&||
 t'|tj( |  d|d< t)|}~~|S )	Nz"triton_gpu.num-warp-groups-per-ctarO   TRITON_DISABLE_LINE_INFO0c                 S   s   g | ]\}}|qS r.   r.   )rq   rw   r   r.   r.   r/   ry      s    z)CUDABackend.make_llir.<locals>.<listcomp>ztriton_gpu.sharedr   )*get_int_attrr   r   r   r   r   r   r   %add_decompose_unsupported_conversionsr   convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryadd_to_llvmirr   add_nvgpu_to_llvmadd_arith_to_llvmirr   r   r   r   r   r   r   llvmiradd_di_scoper   r   init_targets	to_moduleset_nvvm_reflect_ftzrS   get_functionsis_declarationis_external_linkageset_nvvm_maxnregr_   link_extern_libsoptimize_moduleOPTIMIZE_O3r<   )srcr   optionsr   num_warp_groupsr   r   r   llvm_modrr   r+   retr.   r.   r/   	make_llir   sL   



zCUDABackend.make_llirc              	   C   s   |j }|d u rtd\}}t|}td|}d}|dkrdnd| }	d| }
t| ||	|
dg|jd	}td
|}t	|dksCJ |d |d< |d  d|d  }tj
dd| |tjd}t
dd|}tjdddkrytd t| |S )Nr1   S   znvptx64-nvidia-cudar   sm_90asm_z+ptxznvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r   r   rw   r9   r4   z\.version \d+\.\d+z	.version r   z,\s*debug|debug,\s*r   NVPTX_ENABLE_DUMPr   1z // -----// NVPTX Dump //----- //)rU   r0   rC   minr   translate_to_asmrV   r%   findalllensubr(   r   r   r   print)r   r   r   r   rU   ru   r@   llvm_ptx_versiontripleprocfeaturesr   namesr.   r.   r/   make_ptx   s&   

zCUDABackend.make_ptxc                 C   s  t d\}}tjddddE}tjdddd"}||  |  |jd }tjd	r0d
nd}	|j	r7d
nd}
|dkr?dnd}tjdddkrb| |	 |
 d| | |j d| d|j }n| |	 |
 d| | |j d| d|j }zwz
t
j|ddd W nP t
jy } zCt|j}| }W d    n1 sw   Y  |jdkrtd| |jdtj krtd|j d| td|j d| d }~ww W tj|jrt|j tj|jrt|j ntj|jrt|j tj|jrt|j w w t|d }| }W d    n	1 s#w   Y  tj|r4t| W d    n1 s?w   Y  W d    |S W d    |S 1 sXw   Y  |S )!Nr1   Fwz.ptx)deletemodesuffixrz.logz.or   r   z
 -lineinfoz --fmad=falser   za  DISABLE_PTXAS_OPTr   r   z  -v --opt-level 0 --gpu-name=sm_z -o z 2> z -v --gpu-name=sm_T)shellcheck   z$Internal Triton PTX codegen error: 
   zPlease run `ptxas z+` to confirm that this is a bug in `ptxas`
z`ptxas` failed with error code z: 
rD   )r0   tempfileNamedTemporaryFilewriteflushrw   r   r   r   rV   r"   r   CalledProcessErrorrE   rH   
returncoder*   signalSIGSEGVr   r    remove)r   r   r   r   r1   ru   fsrcflogfbin	line_infofmadr  cmdelog_filelogrJ   r   r.   r.   r/   
make_cubin  sh   

0.



*$$$zCUDABackend.make_cubinc                    s^    fdd|d<  fdd|d<  fdd|d<  fdd|d	<  fd
d|d< d S )Nc                    s    | | S rp   )r   r   r   r   rm   r.   r/   <lambda><  s    z(CUDABackend.add_stages.<locals>.<lambda>r   c                        | | jS rp   )r   r   r!  r"  r.   r/   r#  =      ttgirc                    r$  rp   )r   r   r!  r"  r.   r/   r#  >  r%  llirc                    r$  rp   )r  r   r!  r"  r.   r/   r#  ?  r%  ptxc                    r$  rp   )r   r   r!  r"  r.   r/   r#  @  r%  r   r.   )rm   stagesr   r.   r"  r/   
add_stages;  s
   zCUDABackend.add_stagesc                 C   s   t  }| d| j S )Nrv   )r2   r   )rm   r-   r.   r.   r/   r   B  s   zCUDABackend.hash)r   r   r   staticmethodr   r   r   r	   r   r   r   r   r   r   r   r  r   r*  	functools	lru_cacher   __classcell__r.   r.   r   r/   r   a   s*    


(
0

(r   ) triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   dataclassesr   r,  typingr	   r
   r   rF   r%   r  r  r   r"   pathlibr   r-  r<   r0   r2   r>   rC   rK   rM   r   r.   r.   r.   r/   <module>   s.    

#