o
    灛i                     @   s  d dl Z d dlmZ d dlZd dlZd dlZd dlmZ	 d dl
mZ ddlmZ d dlmZ ddlmZ dd	lmZ G d
d dZG dd dZeddG dd dZdd Zdd Zdd Zdd Zdd ZejeejgdZejeej gdZ!ejeej"gdZ#G dd dZ$G dd  d Z%d!d" Z&d#d$ Z'd%d& Z(G d'd( d(Z)G d)d* d*e)Z*G d+d, d,e)Z+d-d. Z,d/d0 Z-d1d2 Z.d3d4 Z/e% Z0g d5Z1G d6d7 d7Z2G d8d9 d9Z3dS ):    N)Tuple)	dataclass   )InterpreterError)partial   )interpreter)irc                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )TensorHandlec                 C   s   || _ || _i | _dS )a  
            data: numpy array
            dtype: triton type, either pointer_type or scalar_type.
            we don't store block_type here because the shape information is already availale in the data field
            attr: a dictionary of attributes
        N)datadtypeattr)selfr   r    r   \/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/triton/runtime/interpreter.py__init__   s   
zTensorHandle.__init__c                 C   s   t | j S N)boolr   allr   r   r   r   __bool__      zTensorHandle.__bool__c                 C   s$   | j }t|dr|j}t|ds|S )N
element_ty)r   hasattrr   )r   r   r   r   r   get_element_ty    s
   

zTensorHandle.get_element_tyc                 C   s   t | j | jS r   )r
   r   copyr   r   r   r   r   clone&      zTensorHandle.clonec                 C   s   || j |< d S r   )r   )r   keyvaluer   r   r   set_attr)   r   zTensorHandle.set_attrN)__name__
__module____qualname__r   r   r   r   r    r   r   r   r   r
      s    r
   c                   @   s   e Zd Zdd Zdd ZdS )BlockPointerHandlec                 C   s(   || _ || _|| _|| _|| _|| _d S r   )baseshapestridesoffsetstensor_shapeorder)r   r%   r&   r'   r(   r)   r*   r   r   r   r   /   s   
zBlockPointerHandle.__init__c           
      C   s   | j  }|jd }| j}t| j j| j}tj| jtd}t	t
|D ]?}dgt
| }|| ||< | j| jt||  |}	|||	 | j| j tj }||v rct||	| j| jk }q$t|| j jj}||fS )N   r   r   )r%   r   primitive_bitwidthr)   npbroadcast_tor   onesr   rangelenr(   arangereshaper'   astypeuint64logical_andr&   r
   r   scalar)
r   boundary_checkdtype_ttn_bytesr)   ptrsmasksdim
bcast_dimsoffr   r   r   materialize_pointers7   s   

  z'BlockPointerHandle.materialize_pointersN)r!   r"   r#   r   rA   r   r   r   r   r$   -   s    r$   T)frozenc                   @   sr   e Zd ZU dZeed< dZeed< dZe	ed< dZ
eed< dZeed< d	Ze	ed
< dZee	 ed< dZeed< dS )InterpreterOptionsNextern_libsFdebugarchTallow_fp8e4nvallow_fp8e4b15tf32default_dot_input_precision)rI   tf32x3ieeeallowed_dot_input_precisionsr   max_num_imprecise_acc_default)r!   r"   r#   rD   dict__annotations__rE   r   rF   strrG   rH   rJ   rM   r   rN   intr   r   r   r   rC   H   s   
 rC   c                 C   sD   | t jkrt jS | t jkrt jS | t jkrt jS | t jkr t jS | S r   )	r.   uint8int8uint16int16uint32int32r6   int64r,   r   r   r   _get_signed_np_dtypeT   s   



rZ   c                 C   st  t | tjrttjS i tjtttjttjtj	ttj	tj
ttj
tjttjtjttjtjttjtjttjtjttjtjttjtjttjtjttjtjttjtjttjtjttjtjttjtjttjtjttji}t | tjrt | jtjrttjS || j S ||  S r   )
isinstancetlpointer_typer.   r   r6   int1r   float16float32float64rT   rS   rV   rU   rX   rW   rY   bfloat16float8e5float8e5b16
float8e4nv
float8e4b8float8e4b15
block_typer   )tt_dtypenp_typesr   r   r   _get_np_dtype`   sX   	

rk   c                 C   s  t td|j }t td|j }tj|  |d}||jd ? d@ }|j|j d }|j|j d }	|d|j> d @ }
|j}|j}||j? d|> d @ tj}|dk}t	|rtj
|tjd}t|jD ]}|
|? d@ }|j| ||dk< qh|
dk}d||  ||< || |||@ < |
| || > d|j> d @ |
|< tdt|| | d|	> d }||}||}|j|jkr|
|j|j ? d|j> d @ }|tjjkr|
d|j|j d > @ }||dk }||}n|
||j|j > d|j> d @ }|dk}t	|rH||j? d|> d @ tj}|dk}||@ }tj
|tjd}d| || |  ||< || || ? d|j||  > B ||< ||jd > ||j> B |B }|| jS )Nuintr,   r   r   )getattrr.   r-   
frombuffertobytesfp_mantissa_widthexponent_biasr5   rX   any
zeros_liker1   maximumminimum_irROUNDING_MODERTNEr4   r&   )inputinput_dtypeoutput_dtyperounding_modeinput_uint_dtypeoutput_unint_dtype	input_binsigninput_exponent_widthoutput_exponent_widthsignificand
bias_inputbias_outputexponentsubnormal_indexbit_posi	bit_indexzero_significand_indexexponent_outputsign_outputsignificand_outputcut_offnon_zero_exponent_indexshiftoutputr   r   r   _convert_float   sl   
$


r   c                 C   s
   t | S r   )matherfxr   r   r   _erf   s   
r   c                 C   s   t | t | d? S )N@   )rR   )abr   r   r   
_umulhi_64   s   r   )otypesc                   @   s   e Zd Zedd ZdS )ExtraFunctionsc                 C   s   t || j|||S r   )r\   tensorcreate_fp_to_fphandle)ry   dst_tyfp_downcast_rounding_builderr   r   r   _convert_custom_types   s   z$ExtraFunctions._convert_custom_typesN)r!   r"   r#   staticmethodr   r   r   r   r   r      s    r   c                   @   s^  e Zd Zejjejjejjejjejjejjejj	ejj	iZ
ejjejjejjejjejjejjejjejjejjejjejjejjejjejjejjejjejjejjejjejji
ZdddZdd Zdd Zd	d
 Zdd Zdd Zdd Zdd Zdd Z dd Z!dd Z"dd Z#dd Z$dd Z%dd  Z&d!d" Z'd#d$ Z(d%d& Z)d'd( Z*d)d* Z+d+d, Z,d-d. Z-d/d0 Z.d1d2 Z/d3d4 Z0d5d6 Z1d7d8 Z2d9d: Z3d;d< Z4d=d> Z5d?d@ Z6dAdB Z7dCdD Z8dEdF Z9dGdH Z:dIdJ Z;dKdL Z<dMdN Z=dOdP Z>dQdR Z?dSdT Z@dUdV ZAdWdX ZBdYdX ZCdZdX ZDd[dX ZEd\dX ZFd]dX ZGd^dX ZHd_d` ZIdadb ZJdcdd ZKdedX ZLdfdX ZMdgdX ZNdhdX ZOdidX ZPdjdX ZQdkdX ZRdldX ZSdmdX ZTdndX ZUdodX ZVdpdX ZWdqdX ZXdrdX ZYdsdX ZZdtdX Z[dudX Z\dvdX Z]dwdX Z^dxdX Z_dydX Z`dzdX Zad{dX Zbd|dX Zcd}dX Zdd~dX ZeddX ZfddX ZgddX ZhddX ZiddX ZjddX ZkddX ZlddX ZmddX ZnddX ZoddX ZpddX ZqddX ZrddX ZsddX ZtddX ZuddX ZvddX ZwddX ZxddX ZyddX ZzddX Z{dd Z|dd Z}dd Z~dd ZddX ZddX Zdd Zdd Zdd ZddX ZddX ZddX ZddX ZddX ZddX ZddX ZddX ZddX ZddX ZddX Zdd Zdd ZddX Zdd Zdd Zdd Zdd Zdd Zdd Zdd ZddĄ ZddƄ ZddȄ Zddʄ Zdd̄ Zdd΄ ZddЄ Zdd҄ ZddԄ Zddք Zdd؄ Zddڄ Zdd܄ Zddބ Zdd Zdd Zdd Zdd ZdS )InterpreterBuilderreturnNc                 C   s$   d | _ t | _i | _tj| jd< d S )Nconvert_custom_types)rF   rC   optionscodegen_fnsr   r   r   r   r   r   r      s   zInterpreterBuilder.__init__c                 C   sR   || j d k std|| j d k std|| j d k s!td|||f| _d S )Nr   zx >= grid_dim[0]r   zy >= grid_dim[1]r   zz >= grid_dim[2])grid_dim
ValueErrorgrid_idxr   r   yzr   r   r   set_grid_idx   s   zInterpreterBuilder.set_grid_idxc                 C   s   |||f| _ d S r   )r   )r   nxnynzr   r   r   set_grid_dim      zInterpreterBuilder.set_grid_dimc                 C      t jS r   )r\   r_   r   r   r   r   get_half_ty      zInterpreterBuilder.get_half_tyc                 C   r   r   )r\   rb   r   r   r   r   get_bf16_ty  r   zInterpreterBuilder.get_bf16_tyc                 C   r   r   )r\   r`   r   r   r   r   get_float_ty  r   zInterpreterBuilder.get_float_tyc                 C   r   r   )r\   ra   r   r   r   r   get_double_ty	  r   z InterpreterBuilder.get_double_tyc                 C   r   r   )r\   rT   r   r   r   r   get_int8_ty  r   zInterpreterBuilder.get_int8_tyc                 C   r   r   )r\   rS   r   r   r   r   get_uint8_ty  r   zInterpreterBuilder.get_uint8_tyc                 C   r   r   )r\   rV   r   r   r   r   get_int16_ty  r   zInterpreterBuilder.get_int16_tyc                 C   r   r   )r\   rU   r   r   r   r   get_uint16_ty  r   z InterpreterBuilder.get_uint16_tyc                 C   r   r   )r\   rX   r   r   r   r   get_int32_ty  r   zInterpreterBuilder.get_int32_tyc                 C   r   r   )r\   rW   r   r   r   r   get_uint32_ty  r   z InterpreterBuilder.get_uint32_tyc                 C   r   r   )r\   rY   r   r   r   r   get_int64_ty  r   zInterpreterBuilder.get_int64_tyc                 C   r   r   )r\   r6   r   r   r   r   get_uint64_ty!  r   z InterpreterBuilder.get_uint64_tyc                 C   r   r   )r\   re   r   r   r   r   get_fp8e4nv_ty$  r   z!InterpreterBuilder.get_fp8e4nv_tyc                 C   r   r   )r\   rg   r   r   r   r   get_fp8e4b15_ty'  r   z"InterpreterBuilder.get_fp8e4b15_tyc                 C   r   r   )r\   rf   r   r   r   r   get_fp8e4b8_ty*  r   z!InterpreterBuilder.get_fp8e4b8_tyc                 C   r   r   )r\   rc   r   r   r   r   get_fp8e5_ty-  r   zInterpreterBuilder.get_fp8e5_tyc                 C   r   r   )r\   rd   r   r   r   r   get_fp8e5b16_ty0  r   z"InterpreterBuilder.get_fp8e5b16_tyc                 C      t ||S r   )r\   r]   )r   elt_ty
addr_spacer   r   r   
get_ptr_ty3     zInterpreterBuilder.get_ptr_tyc                 C   r   r   )r\   rh   )r   r   r&   r   r   r   get_block_ty6  r   zInterpreterBuilder.get_block_tyc                 C   s   t tj|gtjdtjS Nr,   )r
   r.   arraybool_r\   r^   r   r   r   r   r   get_int19     zInterpreterBuilder.get_int1c                 C      t tj|gtjdtjS r   )r
   r.   r   rS   r\   r   r   r   r   	get_uint8<  r   zInterpreterBuilder.get_uint8c                 C   r   r   )r
   r.   r   rT   r\   r   r   r   r   get_int8?  r   zInterpreterBuilder.get_int8c                 C   r   r   )r
   r.   r   rU   r\   r   r   r   r   
get_uint16B  r   zInterpreterBuilder.get_uint16c                 C   r   r   )r
   r.   r   rV   r\   r   r   r   r   	get_int16E  r   zInterpreterBuilder.get_int16c                 C   r   r   )r
   r.   r   rW   r\   r   r   r   r   
get_uint32H  r   zInterpreterBuilder.get_uint32c                 C   r   r   )r
   r.   r   rX   r\   r   r   r   r   	get_int32K  r   zInterpreterBuilder.get_int32c                 C   r   r   )r
   r.   r   r6   r\   r   r   r   r   
get_uint64N  r   zInterpreterBuilder.get_uint64c                 C   r   r   )r
   r.   r   rY   r\   r   r   r   r   	get_int64Q  r   zInterpreterBuilder.get_int64c                 C   r   r   )r
   r.   r   r_   r\   r   r   r   r   get_fp16T  r   zInterpreterBuilder.get_fp16c                 C   r   r   )r
   r.   r   r`   r\   r   r   r   r   get_fp32W  r   zInterpreterBuilder.get_fp32c                 C   r   r   )r
   r.   r   ra   r\   r   r   r   r   get_fp64Z  r   zInterpreterBuilder.get_fp64c                 C   s   t tjdgt|d|S Nr   r,   )r
   r.   r   rk   )r   typer   r   r   get_null_value]  r   z!InterpreterBuilder.get_null_valuec                 C   s2   | j d u r	tdttj| j | gtjdtjS )Nzgrid_idx is Noner,   )r   r   r
   r.   r   rX   r\   r   axisr   r   r   create_get_program_ida  s   
 z(InterpreterBuilder.create_get_program_idc                 C   s    t tj| j| gtjdtjS r   )r
   r.   r   r   rX   r\   r   r   r   r   create_get_num_programsf  s    z*InterpreterBuilder.create_get_num_programsc                 C   s0   t tj|jtdtj}d }| ||||||S r   )r
   r.   	ones_liker   r   r\   r^   create_masked_load)r   ptr_0_1is_volatilemaskotherr   r   r   create_loadj  s   zInterpreterBuilder.create_loadc                 C   s*   t tj|jtdtj}| |||d d S r   )r
   r.   r   r   r   r\   r^   create_masked_store)r   r   valr   r   r   r   r   r   create_storeo  s   zInterpreterBuilder.create_storec           
      C   sN   |  }t|}|d u rttj|j|d|}t|j|j|j|}	t|	|S r   )r   rk   r
   r.   rs   r   _interpreterload)
r   r<   r   r   cache_modifiereviction_policyr   r:   dtype_npretr   r   r   r   s  s   
z%InterpreterBuilder.create_masked_loadc                 C   s   t |j|j|jS r   )r   storer   )r   r<   r   r   r   r   r   r   r   r   {     z&InterpreterBuilder.create_masked_storec                 C   st   |j j}|j}|tjkr|tjks|tjkr.|tjkr.t|j||d t|}t	||jS t	|j
t||jS r   )r   r8   r\   rb   r`   r   r   viewrk   r
   r5   )r   srcdst_typesrc_element_typedst_element_typer   r   r   r   	cast_impl  s   zInterpreterBuilder.cast_implc                 C      |  ||S r   r  r   r   r   r   r   r   <lambda>      zInterpreterBuilder.<lambda>c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  )r   r   r   	is_signedr   r   r   r    r  c                 C   s4   |j j}|j}t|j|||t|}t||jS r   )r   r8   r   r   r   rk   r
   )r   r   r   r|   r  r  r   r   r   r   r     s   z"InterpreterBuilder.create_fp_to_fpc                 C   s   t |jt||jS r   )r
   r   r   rk   r8   r  r   r   r   create_bitcast     z!InterpreterBuilder.create_bitcastc                 C   s   t ||j|j|jjS r   r
   r   r   r8   )r   lhsrhsopr   r   r   	binary_op  r  zInterpreterBuilder.binary_opc                 C      |  ||tjS r   r  r.   addr   r  r  r   r   r   r        c                 C   r  r   r  r.   multiplyr  r   r   r   r    r  c                 C   r  r   r  r.   divider  r   r   r   r    r  c                 C   r  r   )r  r.   	remainderr  r   r   r   r    r  c                 C   r  r   r  r.   subtractr  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   create_idivr  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  r.   fmodr  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   r  r  r   r   r   r    r  c                 C   r  r   )r  r.   
left_shiftr  r   r   r   r    r  c                 C   r  r   )r  r.   right_shiftr  r   r   r   r    r  c                 C   r  r   r  r.   ru   r  r   r   r   r    r  c                 C   r  r   r#  r  r   r   r   r    r  c                 C   r  r   r#  r  r   r   r   r    r  c                 C   r  r   r#  r  r   r   r   r    r  c                 C   r  r   r  r.   rt   r  r   r   r   r    r  c                 C   r  r   r$  r  r   r   r   r    r  c                 C   r  r   r$  r  r   r   r   r    r  c                 C   r  r   r$  r  r   r   r   r    r  c                 C   r  r   r  r.   
less_equalr  r   r   r   r    r  c                 C   r  r   r  r.   lessr  r   r   r   r    r  c                 C   r  r   r  r.   greater_equalr  r   r   r   r    r  c                 C   r  r   r  r.   greaterr  r   r   r   r    r  c                 C   r  r   r%  r  r   r   r   r    r  c                 C   r  r   r'  r  r   r   r   r    r  c                 C   r  r   r)  r  r   r   r   r    r  c                 C   r  r   r+  r  r   r   r   r    r  c                 C   r  r   r  r.   equalr  r   r   r   r    r  c                 C   r  r   r  r.   	not_equalr  r   r   r   r    r  c                 C   r  r   r'  r  r   r   r   r    r  c                 C   r  r   r+  r  r   r   r   r    r  c                 C   r  r   r%  r  r   r   r   r    r  c                 C   r  r   r)  r  r   r   r   r    r  c                 C   r  r   r-  r  r   r   r   r    r  c                 C   r  r   r/  r  r   r   r   r    r  c                 C   r  r   r'  r  r   r   r   r    r  c                 C   r  r   r+  r  r   r   r   r    r  c                 C   r  r   r%  r  r   r   r   r    r  c                 C   r  r   r)  r  r   r   r   r    r  c                 C   r  r   r-  r  r   r   r   r    r  c                 C   r  r   r/  r  r   r   r   r    r  c                 C   r  r   )r  r.   bitwise_andr  r   r   r   r    r  c                 C   r  r   )r  r.   bitwise_xorr  r   r   r   r    r  c                 C   r  r   )r  r.   
bitwise_orr  r   r   r   r    r  c                 C   s&   t |jt|j|j |j |jjS r   )r
   r   r.   r   r   r8   r  r   r   r   r    s   &zInterpreterBuilder.create_idivc                 C   sD   t |jj}t |jj}|j||_|j||_| ||tjS r   )rZ   r   r   r5   r  r.   r"  )r   r  r  	lhs_dtype	rhs_dtyper   r   r   create_ashr  s
   zInterpreterBuilder.create_ashrc                 C   s   |j j}|tjks|tjkrtt|j |j |jjS ttd|j	d d  }|j 
|}|j 
|}t|||j	d ? }t|
||jjS )Nrl   r+   r   )r   r   r.   rY   r6   r
   np_umulhi_u64r8   rm   itemsizer5   r  )r   r  r  r   compute_dtypelhs_datarhs_dataret_datar   r   r   create_umulhi  s   z InterpreterBuilder.create_umulhic                 C   s   t ||j|j|j|jjS r   r  )r   r  r  r   r  r   r   r   
ternary_op     zInterpreterBuilder.ternary_opc                 C      |  |||tjS r   )r>  r.   clip)r   arglohipropagate_nansr   r   r   r        c                 C   r@  r   )r>  r.   where)r   condr  r  r   r   r   r    rF  c                 C   s   t |j|j |j |jjS r   r  r   r   r   r   
create_fma  r?  zInterpreterBuilder.create_fmac                 C   s   t ||j|jjS r   r  )r   rB  r  r   r   r   unary_op  r   zInterpreterBuilder.unary_opc                 C   sZ   |j }|jd }ttd|j }|j|}d|> d }||@ t|}t||j jS )Nr   rl   )	r   r-   rm   r.   r   r   rk   r
   r8   )r   rB  r:   mask_bitwidthnp_uint_dtyper   r   r   r   r   r   create_fabs  s   
zInterpreterBuilder.create_fabsc                 C      |  |tjS r   )rJ  r.   cosr   rB  r   r   r   r        c                 C   rN  r   )rJ  r.   exprP  r   r   r   r    rQ  c                 C   rN  r   )rJ  r.   exp2rP  r   r   r   r    rQ  c                 C   rN  r   )rJ  r.   absrP  r   r   r   r    rQ  c                 C   rN  r   )rJ  r.   floorrP  r   r   r   r    rQ  c                 C   rN  r   )rJ  r.   ceilrP  r   r   r   r    rQ  c                 C   rN  r   )rJ  r.   logrP  r   r   r   r    rQ  c                 C   rN  r   )rJ  r.   log2rP  r   r   r   r    rQ  c                 C   rN  r   rJ  r.   sqrtrP  r   r   r   r  	  rQ  c                 C   rN  r   rY  rP  r   r   r   r  
  rQ  c                 C   rN  r   )rJ  r.   sinrP  r   r   r   r    rQ  c                 C   s0   |j jtjkrt|j nt|j }t||jjS r   )r   r   r.   r`   np_erf_fp32np_erf_fp64r
   r8   )r   rB  r   r   r   r   
create_erf  s   "zInterpreterBuilder.create_erfc                 C   s   t dt|j |jjS )Nr   )r
   r.   rZ  r   r   r8   rP  r   r   r   create_rsqrt  r   zInterpreterBuilder.create_rsqrtc                 C   s   t |j||jjS r   )r
   r   r4   r   r8   )r   rB  r&   allow_reorderr   r   r   r    s    c                 C      t t|j||jjS r   )r
   r.   	transposer   r   r8   )r   rB  permr   r   r   create_trans  r  zInterpreterBuilder.create_transc                 C   s   |j }|j }|jjdkr|j s|jjdkr6|j r6t||jtjd tj}t||jtjd tj}t	tj
|||j jd|j  |jjS )Nr+   r,   )r   r   r-   is_floatingr   r\   r_   r   r.   r
   matmulr8   )r   r   r   dinput_precisionmax_num_imprecise_acca_datab_datar   r   r   
create_dot  s   $zInterpreterBuilder.create_dotc                 C   s   t tj||tjdtjS r   )r
   r.   r3   rX   r\   )r   startstopr   r   r   create_make_range#  r   z$InterpreterBuilder.create_make_rangec                 C   s"   t tj|j|d|fdd tjS )Nr   )binsr1   )r
   r.   	histogramr   r\   rX   )r   r   rp  r   r   r   create_histogram&     "z#InterpreterBuilder.create_histogramc                 C   s<   |  }|j}td|d }t|j||jtj  |jS )Nr   r+   )	r   r-   maxr
   r   r5   r.   r6   r   )r   r   offsetr:   element_bitwidthelement_bytewidthr   r   r   create_addptr+  s    z InterpreterBuilder.create_addptrc                 C   s   | |\}}| }	t|	}
|d u rd }n.|tjjkr(ttj|j	|
d|	}n|tjj
kr=ttj|j	td|
d|	}ntd| | ||||||S )Nr,   nanzunsupported padding option )rA   r   rk   rv   PADDING_OPTIONPAD_ZEROr
   r.   rs   r   PAD_NAN	full_likefloatr   r   )r   r   r9   padding_optionr   r   r   r<   r=   r:   r   r   r   r   r   create_tensor_pointer_load2  s   z-InterpreterBuilder.create_tensor_pointer_loadc                 C   s    | |\}}| |||||S r   )rA   r   )r   r   r   r9   r   r   r<   r=   r   r   r   create_tensor_pointer_storeA  s   z.InterpreterBuilder.create_tensor_pointer_storec                 C   ra  r   )r
   r.   expand_dimsr   r   r8   )r   rB  r   r   r   r   create_expand_dimsE  r  z%InterpreterBuilder.create_expand_dimsc                 C   ra  r   )r
   r.   r/   r   r   r8   r   rB  r&   r   r   r   create_broadcastH  r  z#InterpreterBuilder.create_broadcastc                 C      t |jtj|jS r   r
   r   r5   r.   r6   r8   r   r   r   r   r   r   create_int_to_ptrK     z$InterpreterBuilder.create_int_to_ptrc                 C   r  r   r  r  r   r   r   create_ptr_to_intN  r  z$InterpreterBuilder.create_ptr_to_intc                 C   s   t t|j|jg|jjS r   )r
   r.   concatenater   r   r8   r  r   r   r   
create_catQ  r?  zInterpreterBuilder.create_catc                 C   s    t tj|j|jgdd|jjS )Nr   )r
   r.   stackr   r   r8   r  r   r   r   create_joinT  s    zInterpreterBuilder.create_joinc                 C   s(   t |jd |jjt |jd |jjfS )N).r   ).r   r  )r   r   r   r   r   create_splitX  s   (zInterpreterBuilder.create_splitc                 C   sV   t |jtjrttj||jd t|jd|jj	S ttj||jt|jd|jj	S r   )
r[   r   r\   rh   r
   r.   fullr   rk   r8   r  r   r   r   create_splat\  s   &"zInterpreterBuilder.create_splatc                 C   sB   || j vrtd| | j | }tt|j|j|j||jjS )Nunsupported semantic )ir_sem_to_interpreter_semr   r
   r   
atomic_casr   r   r8   )r   r   cmpr   semscoper   r   r   create_atomic_casb  s   

 z$InterpreterBuilder.create_atomic_casc                 C   sf   || j vrtd| || jvrtd| | j | }| j| }tt||j|j|j||jjS )Nzunsupported rmwOp r  )	ir_rmw_op_to_interpreter_rmw_opr   r  r
   r   
atomic_rmwr   r   r8   )r   rmwOpr   r   r   r  r  r   r   r   create_atomic_rmwh  s   



"z$InterpreterBuilder.create_atomic_rmwc                 C      t d)Nz4extern_elementwise not supported in interpreter modeNotImplementedError)r   libNamelibPathsymbolargListretTypeisPurer   r   r   create_extern_elementwiseq     z,InterpreterBuilder.create_extern_elementwisec                 C   r  )Nz,inline_asm not supported in interpreter moder  )r   	inlineAsmconstraintsvaluesr   r  packr   r   r   create_inline_asmt  r  z$InterpreterBuilder.create_inline_asmc                 C   s   d| j d  d| j d  d| j d  d}|r|d| 7 }|r*tjdd	d
 id |D ]}t|d|j   q,|rCtjd d d S d S )N(r   z, r   r   ) r   c                 S   s   d| dS )N0x02xr   r   r   r   r   r  }  r  z1InterpreterBuilder.create_print.<locals>.<lambda>)	formatter)r   r.   set_printoptionsprintr   )r   prefixhexr  msgr   r   r   r   create_printw  s   *zInterpreterBuilder.create_printc                 C   s&   |sJ | d| d| d| d S )Nz in :r   )r   	conditionmessagefileNamefuncNamelineNor   r   r   create_assert  s   &z InterpreterBuilder.create_assertc                 C   s   d S r   r   r   r   r   r   create_barrier  s   z!InterpreterBuilder.create_barrierc                 C   s    dd |D }t ||||||S )Nc                 S      g | ]}|  qS r   r   .0ru  r   r   r   
<listcomp>      z<InterpreterBuilder.create_make_block_ptr.<locals>.<listcomp>)r$   )r   r%   r&   r'   r(   r)   r*   new_offsetsr   r   r   create_make_block_ptr  s   z(InterpreterBuilder.create_make_block_ptrc                 C   sv   t |jt |krtddd |jD }t|j|j|j||j|j}t	t |D ]}|j|  j
|| j
7  _
q)|S )Nz len(ptr.offsets) != len(offsets)c                 S   r  r   r  r  r   r   r   r    r  z5InterpreterBuilder.create_advance.<locals>.<listcomp>)r2   r(   r   r$   r%   r&   r'   r)   r*   r1   r   )r   r   r(   r  r   r   r   r   r   create_advance  s   z!InterpreterBuilder.create_advancec                 C   s8   t |}d|jv rttjdd|d|jS td| )NrR   r   r  r,   zunsupported type )rk   namer
   r.   r  r8   	TypeError)r   r   np_typer   r   r   get_all_ones_value  s   
z%InterpreterBuilder.get_all_ones_valuer   N)r!   r"   r#   rv   MEM_SEMANTICACQUIREr   RELEASERELAXEDACQUIRE_RELEASEr  	ATOMIC_OPADDRMW_OPFADDMINUMINMAXUMAXANDORXORXCHGr  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  create_si_to_fpcreate_ui_to_fpcreate_fp_to_sicreate_fp_to_uicreate_fp_extcreate_fp_trunccreate_int_castr   r
  r  create_faddcreate_fmulcreate_fdivcreate_fremcreate_fsub
create_mulcreate_precise_divfcreate_sdivcreate_udivcreate_sremcreate_urem
create_add
create_sub
create_shlcreate_lshrcreate_minsicreate_minuicreate_minimumfcreate_minnumfcreate_maxsicreate_maxuicreate_maximumfcreate_maxnumfcreate_icmpSLEcreate_icmpSLTcreate_icmpSGEcreate_icmpSGTcreate_icmpULEcreate_icmpULTcreate_icmpUGEcreate_icmpUGTcreate_icmpEQcreate_icmpNEcreate_fcmpOLTcreate_fcmpOGTcreate_fcmpOLEcreate_fcmpOGEcreate_fcmpOEQcreate_fcmpONEcreate_fcmpULTcreate_fcmpUGTcreate_fcmpULEcreate_fcmpUGEcreate_fcmpUEQcreate_fcmpUNE
create_and
create_xor	create_orr  r6  r=  r>  create_clampfcreate_selectrI  rJ  rM  
create_cos
create_expcreate_exp2create_iabscreate_floorcreate_ceil
create_logcreate_log2create_precise_sqrtcreate_sqrt
create_sinr^  r_  create_reshaperd  rl  ro  rr  rx  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   r      sJ   
	

		
r   c                    s"   |d fdd
}t | || d S )N)memberc                    s$   | |i dd |  D d iS )Nc                 S   s   i | ]\}}|d kr||qS )r   r   r  kvr   r   r   
<dictcomp>  s
    z1_patch_attr.<locals>.<lambda>.<locals>.<dictcomp>r   )items)r(  argskwargsbuilderr   r   r    s    z_patch_attr.<locals>.<lambda>)setattr)objr  r(  r1  
new_memberr   r0  r   _patch_attr  s   r5  c                 C   s2   t | D ]\}}tj|rt| ||| qd S r   )inspect
getmembersr\   core
is_builtinr5  )pkgr1  r  r(  r   r   r   _patch_builtin  s
   r;  c                    sJ   dd  dd }dd | _  fdd| _dd | _d	d | _t|| _d S )
Nc                 S   s   | j j}|jdkrt|S dS )Nr   T)r   r   sizer   )r   r   r   r   r   	_get_bool  s   z%_patch_lang_tensor.<locals>._get_boolc                 S   s&   t jtt| jj| jj| jj	S r   )
r\   r8  r   r
   r.   rb  r   r   r   r8   r   r   r   r   _get_transpose  s   &z*_patch_lang_tensor.<locals>._get_transposec                 S      t | jjS r   )rR   r   r   r   r   r   r   r    r  z$_patch_lang_tensor.<locals>.<lambda>c                    s    | S r   r   r   r=  r   r   r    s    c                 S   r?  r   )reprr   r   r   r   r   r   r    r  c                 S   r?  r   )rQ   r   r   r   r   r   r   r    r  )	__index__r   __repr____str__propertyT)r   r>  r   r@  r   _patch_lang_tensor  s   


rG  c                   @   s<   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd ZdS )ReduceScanOpIneterfacec                 C   s   || _ || _d S r   )r   
combine_fn)r   r   rI  r   r   r   r     s   
zReduceScanOpIneterface.__init__c                 C   s0   |d ur|t |krtd| d| d S d S )Nzaxis z out of bounds for shape )r2   r   )r   r&   r   r   r   r   
check_axis  s   z!ReduceScanOpIneterface.check_axisc                 C   s>   |D ]}t |tjjstdt| | |j| j qd S )Nzinput must be a tensor, got )	r[   r\   r8  r   r   r   rJ  r&   r   )r   ry   rB  r   r   r   check_tensor  s
   z#ReduceScanOpIneterface.check_tensorc                 C   sN   t |dr|jrt||j}ntj|gt|d}|}tjt	||j
|S )Nr&   r,   )r   r&   r\   rh   r.   r   rk   r8  r   r
   r8   )r   r   r   ret_typer   r   r   	to_tensor  s
   z ReduceScanOpIneterface.to_tensorc                 C   s$   t |ts|f}| | | |S r   )r[   tuplerK  
apply_implr   ry   r   r   r   apply  s   


zReduceScanOpIneterface.applyc                 C   r  )Nzapply_impl not implementedr  rP  r   r   r   rO    r  z!ReduceScanOpIneterface.apply_implN)	r!   r"   r#   r   rJ  rK  rM  rQ  rO  r   r   r   r   rH    s    rH  c                       sF   e Zd Z fddZdd Zdd Zddd	Zd
d Zdd Z  Z	S )	ReduceOpsc                       t  || || _d S r   )superr   	keep_dims)r   r   rI  rU  	__class__r   r   r        
zReduceOps.__init__c                 C   sN   g }|D ]}|d ur| | qd}| | |jj |j qt||fS )Nr   )appendrM  r   r   flattenr   rN  )r   ry   r   r   r   r   r   r   unravel  s   zReduceOps.unravelc                    s2  j } j \ }g }g } d jjj}|d| ||d d   } D ]}||jj |tj||jjjd q't	|d j
D ]}	t|	|d| |d d   t fddt|D }
| dkrt	t|D ]}|
| jj || < qvqEt fddt|D }jjg ||
R  }t|ts|fn|}t	t|D ]}t|| tjjr|| jj n|| || < qqEg }t|D ]6\}	}jr|d urt||}nt	t|D ]}t|d}qn|d u r| }|| |	 j qt|dkr|d S t|S )Nr   r   r,   c                 3   *    | ]\}} |  | jV  qd S r   rM  r   r  iirg  )ry   input_indexr   r   r   	<genexpr>     ( z+ReduceOps.generic_reduce.<locals>.<genexpr>c                 3   r\  r   r]  r  oio)ry   output_indexr   r   r   ra  	  rb  )r   r[  r   r   r&   rY  r.   zerosr   r1   r<  unravel_indexrN  	enumerater2   itemrI  fnr[   r\   r8  r   rU  r  rM  )r   ry   original_axisr   
input_dataoutput_datainput_shapeoutput_shaperB  r   input_tuplej	acc_tuplecombine_fn_retr   r   _r   )ry   r`  rf  r   r   generic_reduce  sN   zReduceOps.generic_reduceNc                 C   s   t |tr	|d n|}d }d }|r!| ||jj| j| jd|j}|r3| ||jj| j| jdtj	}|d ur?|d ur?||fS |d urE|S |d urK|S t
d)Nr   r   keepdimsz-val_reduce_op and idx_reduce_op are both None)r[   rN  rM  r   r   r   rU  r   r\   rX   r   )r   ry   val_reduce_opidx_reduce_opr   idxr   r   r   min_max  s     zReduceOps.min_maxc                 C   s"   |  tj|jj| j| jd|jS )Nrw  )rM  r.   sumr   r   r   rU  r   rP  r   r   r   r}  1  rs  zReduceOps.sumc                 C   s   | j tjjkr| j|d tjtjdS | j tjjkr&| j|d tj	tj
dS | j tjjkr8| j|d tj	d dS | j tjjkrJ| j|d tjd dS | j tjjkrX| |d S | |S )Nr   )ry  rz  )rI  r\   standard_argmin_combine_tie_break_leftr|  r.   minargmin_argmax_combine_tie_break_leftrt  argmax_elementwise_max_elementwise_min_sum_combiner}  rv  rP  r   r   r   rO  4  s   
zReduceOps.apply_implr   )
r!   r"   r#   r   r[  rv  r|  r}  rO  __classcell__r   r   rV  r   rR    s    

+rR  c                       s<   e Zd Z fddZdd Zdd Zdd Zd	d
 Z  ZS )ScanOpsc                    rS  r   )rT  r   reverse)r   r   rI  r  rV  r   r   r   F  rX  zScanOps.__init__c                 C   "   | j tj|jj| jd|jdgS Nr  r,   )rM  r.   cumsumr   r   r   r   rP  r   r   r   r  J  rs  zScanOps.cumsumc                 C   r  r  )rM  r.   cumprodr   r   r   r   rP  r   r   r   r  M  rs  zScanOps.cumprodc                    s  g }g }d j jj}D ]}||j j |tj||j jjd qt|d jD ]}t	|| t
 fddt|D } j dkr_tt|D ]}|| j j ||  < qOq+t
 fddtt D t
fddt|D }	jjg |	|R  }
t|
t
s|
fn|
}	tt|D ]}t|	| tjjr|	| j j n|	| ||  < qq+g }t|D ]\}}||| j q|S )Nr   r,   c                 3   s*    | ]\}} |  | jV  qd S r   r]  r^  )indexry   r   r   r   ra  [  rb  z'ScanOps.generic_scan.<locals>.<genexpr>c                 3   s.    | ]}|j kr | d  n | V  qdS )r   Nr  )r  r   )r  r   r   r   ra  a  s   , c                 3   r\  r   r]  rc  )ry   
prev_indexr   r   r   ra  b  rb  )r   r   r&   rY  r.   rg  r   r1   r<  rh  rN  ri  r   r2   rj  rI  rk  r[   r\   r8  r   rM  )r   ry   rm  rn  r&   rB  r   r   rr  rs  rt  r   r   )r  ry   r  r   r   generic_scanP  s8    zScanOps.generic_scanc              	   C   s   g }| j r|D ]}|| tj|jj| jd|j qn|}| j	t
jjkr.| |d }n| j	t
jjkr=| |d }n| |}| j rV|D ]}tj|jj| jd|j_qGt|dkr`|d pct|S )Nr  r   r   )r  rY  rM  r.   flipr   r   r   r   rI  r\   r~  r  r  _prod_combiner  r  r2   rN  )r   ry   	new_inputrB  r   r   r   r   rO  n  s   &
zScanOps.apply_impl)	r!   r"   r#   r   r  r  r  rO  r  r   r   rV  r   r  D  s    r  c                  C   s4   ddd} ddd}| t _|t _| t j_|t j_d S )NFc                 [      t |||| S r   )rR  rQ  )ry   r   rI  rU  r/  r   r   r   _new_reduce  r   z'_patch_reduce_scan.<locals>._new_reducec                 [   r  r   )r  rQ  )ry   r   rI  r  r/  r   r   r   	_new_scan  r   z%_patch_reduce_scan.<locals>._new_scan)F)r\   reduceassociative_scanr8  )r  r  r   r   r   _patch_reduce_scan  s   

r  c                 C   sx   dd }ddd}ddd}dd	 }|| _ || _|| _t| _|| j_t|d
d| _t|dd| _	t|dd| _
t  d S )Nc                 S   sB  | j dkr	| S | j dkr| S | j dkr| S | j dkr$| S | j dkr-| S | j dkr6| S | j dkr?| S | j dkrH| S | j d	krQ|	 S | j d
krZ|
 S | j dkrc| S | j dkrl| S | j dkru| S | j dkr~| S | j dkr| S | j dkr| S | j dkr| S td|  d)Nvoidr^   rT   rS   rV   rU   rX   rW   rY   r6   fp8e5fp8e4nvfp8e4b15fp16bf16fp32fp64zfail to convert z to ir type)r  get_void_tyget_int1_tyr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r1  r   r   r   
_new_to_ir  sF   
















z$_patch_lang_core.<locals>._new_to_irc                 [   s6   |d u rd}|d u rd| }}n| |}}t |||S )Nr   r   )r1   )arg1arg2stepr/  rm  endr   r   r   
_new_range  s   
z$_patch_lang_core.<locals>._new_range c                 S   s   | sJ |d S r   r   )rH  r  r   r   r   _new_static_assert  r   z,_patch_lang_core.<locals>._new_static_assertc                 S   sn   t | tjs| S t |ttfs|gn|}dd |D }t|tdt| jkr.td| | j	
|| | S )Nc                 S   s"   g | ]}t |tjr|jn|qS r   )r[   r\   	constexprr   r  r+  r   r   r   r       " z7_patch_lang_core.<locals>._set_attr.<locals>.<listcomp>r   z$len(values) != len(input.shape) for )r[   r\   r   listrN  r2   rt  r&   r   r   r    )ry   r  r  r   r   r   	_set_attr  s   z#_patch_lang_core.<locals>._set_attrztt.divisiblityr  ztt.contiguityztt.constancy)NN)r  )r1   static_rangestatic_assertr  static_printr   to_irr   multiple_ofmax_contiguousmax_constancyr  )langr  r  r  r  r   r   r   _patch_lang_core  s   
(
	
r  c                 C   s   dd | j  D }t|dksJ dt|d t t|d jt |d tkr1t|d jt t|d j t	|d  d S )Nc                 S   s"   g | ]\}}|t t jfv r|qS r   )r\   r8  )r  ru  r   r   r   r   r    r  z_patch_lang.<locals>.<listcomp>r   z:triton.language must be visible from within jit'd functionr   )
__globals__r-  r2   r;  interpreter_builderr   r\   r   rG  r  )rk  r  r   r   r   _patch_lang  s   r  c                 C   s:  t | trqttjjjtjjj	| }t
j}d|   kr#dk r)n nt
j}n7d|   kr3dk r9n nt
j}n'd|   krCdk rIn nt
j}nd|   krSdk rYn nt
j}ntd|  tt
j| g|d|}t||S t| d	rttjjjtjjj	| }tt
j|  gt
jd|}t||S | S )
Ni   l        l        l         l            l            zUnsupported integer value r,   data_ptr)r[   rR   r\   	str_to_tytritonruntimejitJITFunction_type_of_key_ofr.   rX   rW   rY   r6   r   r
   r   r   r   r  )rB  tyr   r   r   r   r   _implicit_cvt  s&   
"
"r  )	num_warps
num_stagesnum_ctasenable_fp_fusiongridmaxnregc                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
GridExecutorc                    sN   ddl m || _|| _|| _fdd|j D   fdd|D | _d S )Nr   _normalize_tyc                    s   i | ]	\}}| |qS r   r   )r  r  r  r  r   r   r,    s    z)GridExecutor.__init__.<locals>.<dictcomp>c                    s   g | ]}  |d kr|qS )r  )get)r  r  )rP   r   r   r        z)GridExecutor.__init__.<locals>.<listcomp>)r  r  rk  	arg_namesr  rP   r-  
constexprs)r   rk  r  r  r   )rP   r  r   r     s   zGridExecutor.__init__c                 C   sp   g }|D ]}t |dr||  q|| qi }| D ]\}}t |dr/| ||< q|||< q||fS Nr  )r   rY  cpur-  )r   args_devr/  args_hstrB  
kwargs_hstr   r   r   r   r   _init_args_hst  s   


zGridExecutor._init_args_hstc           
      C   sr   t ||D ]\}}t|dr|j||jj q| D ]\}}|| }	t|dr6|j|	|jj qd S r  )zipr   r   copy_todevicer-  )
r   r  r  r/  r  arg_devarg_hstr   	kwarg_dev	kwarg_hstr   r   r   _restore_args_dev$  s   

zGridExecutor._restore_args_devc              
      sL  dd |  D }|ddrd S  ||\}}t j tj jg|R i |} fdd|  D }t jr@ |n j}t	|dksMJ d|ddt	|   }t
j|  z,t|d	 D ]#}t|d
 D ]}t|d D ]}	t
|||	  jdi | qsqkqcW n ty }
 ztt|
|
d }
~
ww  |||| d S )Nc                 S   s   i | ]\}}|t vr||qS r   )RESERVED_KWSr)  r   r   r   r,  1  r  z)GridExecutor.__call__.<locals>.<dictcomp>warmupFc                    s(   i | ]\}}|| j v r|nt|qS r   )r  r  )r  r  rB  r   r   r   r,  ;  s   (    z#grid must have at most 3 dimensions)r   r   r   r   r   )r-  popr  r  rk  r6  getcallargscallabler  r2   r  r   r1   r   	Exceptionr   rA  r  )r   r  r/  r  r  r.  r  r   r   r   er   r   r   __call__/  s2   

zGridExecutor.__call__N)r!   r"   r#   r   r  r  r  r   r   r   r   r  	  s
    	r  c                   @   s2   e Zd ZdddZedd Z dd Zd	d
 ZdS )InterpretedFunctionr   Nc                    s<   | _  fdd}| _t|}dd |j D  _d S )Nc                     s"   |d }t  j j|| i |S )Nr  r  rk  r  )r.  r/  r  r   r   r   runR  s   z)InterpretedFunction.__init__.<locals>.runc                 S   s   g | ]}|j qS r   r  r  r   r   r   r  X  rF  z0InterpretedFunction.__init__.<locals>.<listcomp>)rk  r  r6  	signature
parametersr  r  )r   rk  r  r  r   r   r   r   O  s
   
zInterpretedFunction.__init__c                 C   s   | j jS r   )rk  r!   r   r   r   r   r!   Z  s   zInterpretedFunction.__name__c                 C   s   t | j| j|S r   r  )r   r  r   r   r   __getitem__^  r   zInterpretedFunction.__getitem__c              
   O   sD   t | j z	| j|i |W S  ty! } ztt||d }~ww r   )r  rk  r  r   rA  )r   r.  r/  r  r   r   r   r  a  s   
zInterpretedFunction.__call__r  )r!   r"   r#   r   rE  r  r  r   r   r   r   r  M  s    

r  )4r6  typingr   r   numpyr.   r  triton.languagelanguager\   dataclassesr   errorsr   	functoolsr   _C.libtritonr   r   r	   rv   r
   r$   rC   rZ   rk   r   r   r   	vectorizer`   r\  ra   r]  r6   r7  r   r   r5  r;  rG  rH  rR  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   sT     @   N"`>ND