o
    灛i                    @  s  d dl mZ d dlmZmZmZmZmZ ddlm	Z	 ddl
mZ ddl
mZ edZG d	d
 d
eZdddZdddZdddZdddZdd"d#Z	$	%	$ddd)d*Zdd-d.Zdd/d0Zdd1d2Zdd3d4Zdd5d6Zdd8d9Zdd:d;Zdd@dAZddBdCZddFdGZ d dHdIZ!ddJdKZ"ddLdMZ#ddNdOZ$ddPdQZ%ddRdSZ&d!dTdUZ'ddVdWZ(ddXdYZ)ddZd[Z*d"d\d]Z+d#d^d_Z,d$d`daZ-d%dddeZ.ddfdgZ/ddhdiZ0ddjdkZ1ddldmZ2ddndoZ3ddpdqZ4d&dtduZ5d'dydzZ6d(d|d}Z7d)ddZ8d*ddZ9d+ddZ:d,ddZ;d-ddZ<d.ddZ=d/ddZ>d0ddZ?d1ddZ@d2ddZA	d3d4ddZBdd ZCdd ZDdd ZEdd ZFdd ZGdd ZHdd ZIdd ZJdd ZKd5ddZLd6ddZMd7ddZNddÄ ZOddń ZPd8ddȄZQd9dd̈́ZRd:ddфZSd;ddӄZTd;ddՄZUd;ddׄZVd;ddلZWd;ddۄZXd;dd݄ZYd;dd߄ZZdd Z[d<ddZ\d=ddZ]dd Z^d>ddZ_d?ddZ`d@ddZadAddZbdAddZcdAddZddBdd ZedCddZfdDddZgdd ZhdEddZidFddZjdFddZkdS (G      )annotations)ListOptionalSequenceTupleTypeVar   )ir   )core)mathTc                      s   e Zd Z fddZ  ZS )IncompatibleTypeErrorImplc                   s@   || _ || _d| j   d | j  | _tt| | j d S )Nzinvalid operands of type  and )type_atype_b__repr__messagesuperr   __init__)selfr   r   	__class__ Z/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/triton/language/semantic.pyr      s   z"IncompatibleTypeErrorImpl.__init__)__name__
__module____qualname__r   __classcell__r   r   r   r   r      s    r   axisintbuilder
ir.builderreturn	tl.tensorc                 C  *   | dvrt d|  t|| tjS )Nr   r
   r   z+program_id axis must be 0, 1, or 2 but got )
ValueErrortltensorcreate_get_program_idint32r   r!   r   r   r   
program_id      r-   c                 C  r%   )Nr&   z-num_programs axis must be 0, 1, or 2 but got )r'   r(   r)   create_get_num_programsr+   r,   r   r   r   num_programs    r.   r0   a_tytl.dtypeb_tyc                 C  s   | j }|j }| j}|j}||kr||kr| S |S |tjjjkr'||kr%| S |S |tjjjkr6||kr4|S | S td| d| )Nzunexpected signedness r   )int_bitwidthint_signednessr(   dtype
SIGNEDNESSUNSIGNED	TypeError)r1   r3   a_rankb_ranka_snb_snr   r   r   integer_promote_impl+   s   r>   
div_or_modboolc                 C  s   |   s|  rtjS |  s| rtjS |  s| r&|r#tjS tjS |  s.| rA|r3tjS |  r>| r>tjS tjS | 	 rI|	 sSt
d|  d| |rk| j|jkrkt
d|   d |  d t| |S )Nunexpected type r   zCannot use /, #, or % with x because they have different signedness;this is unlikely to result in a useful answer. Cast them to the same signedness.)is_fp64r(   float64is_fp32float32is_fp16float16is_bf16bfloat16is_intr9   r5   r   r>   )r1   r3   r?   r   r   r   computation_type_impl;   s*   
rL   r   r   allow_ptr_aNonec                 C  sJ   |   r!|st| ||  r| |krt| || r#t| |d S d S N)is_ptrr   is_floating)r   r   rM   r   r   r   check_ptr_type_implc   s   


rR   FTlhsrhsTuple[tl.tensor, tl.tensor]c           
      C  sx   t | ||\} }| jj}|jj}t||| t||| |r8| s8| s8t|||}	t| |	|} t||	|}| |fS rO   )broadcast_impl_valuetypescalarrR   rP   rL   cast)
rS   rT   r!   allow_lhs_ptrallow_rhs_ptrarithmetic_checkr?   
lhs_sca_ty
rhs_sca_ty
ret_sca_tyr   r   r   binary_op_type_checking_implo   s   r`   inputotherc                 C  s   t | ||dd\} }| jj}|jj}| r| rtd| r3| s3|| } }| jj}|jj}| rDt|| j|j| jS |	 rUt|
| j|j| jS | rft|| j|j| jS td| )NTzcannot add pointers togetherrA   )r`   rW   rX   rP   r9   r(   r)   create_addptrhandlerQ   create_faddrK   
create_addra   rb   r!   input_scalar_tyother_scalar_tyr   r   r   add   s    
rj   c                 C  s   t | ||dd\} }| jj}| r"t|| jt||j| jS |	 r3t|
| j|j| jS | rDt|| j|j| jS td| )NTFrA   )r`   rW   rX   rP   r(   r)   rc   rd   minusrQ   create_fsubrK   
create_subr9   ra   rb   r!   	scalar_tyr   r   r   sub   s    rp   c                 C  sj   t | ||\} }| jj}| rt|| j|j| jS | r.t|	| j|j| jS t
d| NrA   )r`   rW   rX   rQ   r(   r)   create_fmulrd   rK   
create_mulr9   rn   r   r   r   mul   s   rt   c                 C  s   t | ||dddd\} }| jj}|jj}| r#| r#t|||}nI| r2| r2t| ||} n:| rI| rIt| tj|} t|tj|}n#| re| re|j|jkr^t|||}nt| ||} nt	d| t
|| j|j| jS NFTrA   )r`   rW   rX   rQ   rK   rY   r(   rF   fp_mantissa_widthr9   r)   create_fdivrd   rg   r   r   r   truediv   s    rx   c                 C  s   t | ||dddd\} }| jj}|jj}| rK| rKt||}t| ||} t|||}| r>t|	| j
|j
| jS t|| j
|j
| jS td| ru   )r`   rW   rX   rK   r>   rY   is_int_signedr(   r)   create_sdivrd   create_udivr9   )ra   rb   r!   rh   ri   ret_tyr   r   r   floordiv   s   
r}   ieee_roundingc                 C  s^   | j j}|j j}| r| stdt| ||dddd\} }|| j|j}t|| j S )Nz4both operands of fdiv must have floating scalar typeFT)	rW   rX   rQ   r9   r`   rw   rd   r(   r)   )ra   rb   r~   r!   rh   ri   retr   r   r   fdiv   s   r   c              	   C  s   t | ||dddd\} }| jj}|jj}| r.t| ttjt| |d||d|||}|S |	 rf|j
|j
krHtd|  d |  d | rYt|| j|j| jS t|| j|j| jS td| )NFT_builderzCannot mod z by rB   rA   )r`   rW   rX   rQ   rp   rt   r   floorr   rK   r5   r9   r   ry   r(   r)   create_sremrd   create_urem)ra   rb   r!   ro   ri   r   r   r   r   mod   s   ( r   xypropagate_nantl.PropagateNanc                 C     t | ||\} }| j}| r<|tjjkr"t|| j|j| j	S |tjj
kr5t|| j|j| j	S td| | rMt|| j|j| j	S | r^t|| j|j| j	S td| NzUnexpected propagate_nan Unexpected dtype )r`   r6   rQ   r(   PropagateNanALLr)   create_minimumfrd   rW   NONEcreate_minnumfr'   ry   create_minsiis_int_unsignedcreate_minuir9   r   r   r   r!   r6   r   r   r   minimum     r   c                 C  r   r   )r`   r6   rQ   r(   r   r   r)   create_maximumfrd   rW   r   create_maxnumfr'   ry   create_maxsir   create_maxuir9   r   r   r   r   maximum  r   r   minmaxc                 C  sn   t |||\}}t | ||\} }t | ||\} }| j}| r/t|| j|j|j|| jS td| d)Nr   z(. Only floating point clamp is supported)	r`   r6   rQ   r(   r)   create_clampfrd   rW   r9   )r   r   r   r   r!   r6   r   r   r   clamp%  s    r   c                 C  sz   t | ||ddd\} }| jj}|jj}| r| s t||t||}||kr/t| ||} ||kr9t|||}| |fS )NF)r`   rW   rX   rK   r   r>   rY   )ra   rb   r!   input_sca_tyother_sca_tyr_   r   r   r   bitwise_op_type_checking_impl6  s   

r   c                 C  *   t | ||\} }t|| j|j| jS rO   )r   r(   r)   
create_andrd   rW   ra   rb   r!   r   r   r   and_E     r   c                 C  r   rO   )r   r(   r)   	create_orrd   rW   r   r   r   r   or_J  r   r   c                 C  r   rO   )r   r(   r)   
create_xorrd   rW   r   r   r   r   xor_O  r   r   c                 C  D   | j  st| td|} |j  st|td|}t| ||S Nint1)rW   is_int1bitcastr(   r6   r   r   r   r   r   logical_andT  
   

r   c                 C  r   r   )rW   r   r   r(   r6   r   r   r   r   r   
logical_or\  r   r   c                 C  s&   | j  st| td|} t| |S r   )rW   r   r   r(   r6   invert)ra   r!   r   r   r   not_d  s   

r   c                 C  r   rO   )r   r(   r)   create_lshrrd   rW   r   r   r   r   lshrj  r   r   c                 C  r   rO   )r   r(   r)   create_ashrrd   rW   r   r   r   r   ashro  r   r   c                 C  r   rO   )r   r(   r)   
create_shlrd   rW   r   r   r   r   shlt  r   r   c                 C  s   | S rO   r   ra   r   r   r   plus~  s   r   c                 C  sH   | j j}| rtd|  d t||||}t	|| |S )Nz$wrong type argument to unary minus ())
rW   rX   rP   r'   r   r(   r)   get_null_valueto_irrp   )ra   r!   r   _0r   r   r   rk     s
   rk   c                 C  sP   | j j}| s| rtd|  d t||	||}t
| ||S )Nz%wrong type argument to unary invert (r   )rW   rX   rP   rQ   r'   r   r(   r)   get_all_ones_valuer   r   )ra   r!   r   _1r   r   r   r     s
   r   vtl.block_typec                 C  s&   | j  stjS | j j}ttj|S rO   )rW   is_blockr(   r   shape
block_type)r   r   r   r   r   
_bool_like  s   
r   c                 C     t | ||\} }| jj}| rt|| j|jt| S |	 rB|
 r4t|| j|jt| S t|| j|jt| S td| rq   )r`   rW   rX   rQ   r(   r)   create_fcmpOGTrd   r   rK   ry   create_icmpSGTcreate_icmpUGTr9   rn   r   r   r   greater_than     r   c                 C  r   rq   )r`   rW   rX   rQ   r(   r)   create_fcmpOGErd   r   rK   ry   create_icmpSGEcreate_icmpUGEr9   rn   r   r   r   greater_equal  r   r   c                 C  r   rq   )r`   rW   rX   rQ   r(   r)   create_fcmpOLTrd   r   rK   ry   create_icmpSLTcreate_icmpULTr9   rn   r   r   r   	less_than  r   r   c                 C  r   rq   )r`   rW   rX   rQ   r(   r)   create_fcmpOLErd   r   rK   ry   create_icmpSLEcreate_icmpULEr9   rn   r   r   r   
less_equal  r   r   c                 C  n   t | ||\} }| jj}| rt|| j|jt| S |	 r0t|
| j|jt| S td| rq   )r`   rW   rX   rQ   r(   r)   create_fcmpOEQrd   r   rK   create_icmpEQr9   rn   r   r   r   equal     r   c                 C  r   rq   )r`   rW   rX   rQ   r(   r)   create_fcmpUNErd   r   rK   create_icmpNEr9   rn   r   r   r   	not_equal  r   r   startendc                 C  s   t | tr
t |tstdt| d? }t|d? }|s|r"td|| kr*td||  }||d @ dkr:td|g}ttj|}t|| ||S )Nz/arange's arguments must be of type tl.constexpr    zarange must fit in int32z=arange's end argument must be greater than the start argumentr
   r   z#arange's range must be a power of 2)	
isinstancer    r'   r@   r(   r   r+   r)   create_make_range)r   r   r!   is_start_int64is_end_int64ranger   r|   r   r   r   arange  s   r   r   	List[int]r6   c                 C  s   t |tjr|jjdksJ dt|||}n(|d u rtd|dkr,|||}nt	|d|j
 }||}t||}t|| |S )Nr
   zonly accepts size-1 tensorz2dtype must be specified when value is not a tensorr   get_)r   r(   r)   numelvaluerY   r'   r   r   getattrnamesplat)r   r   r6   r!   get_value_fnr   r   r   full  s   r   r   c                 C  sF   | j  r	J dt|dkr| S t| j|}t|| j||S )NzCannot splat a block tensorr   )	rW   r   lenr(   r   r6   r)   create_splatrd   )r   r   r!   r|   r   r   r   r     s
   r   	dst_shapecan_reorderc                 C  sR   d}|D ]}||9 }q| j j|krtdt| j j|}t|| j|||S )Nr
   z:reshape() cannot change total number of elements in tensor)	rW   r   r'   r(   r   rX   r)   create_reshaperd   )ra   r   r   r!   r   sr|   r   r   r   reshape%  s   
r   c                 C  sZ   dd | j D }||d | j st| ||dS t| jj|}t|	| j
||S )Nc                 S  s   g | ]}t |qS r   r(   _constexpr_to_value).0r   r   r   r   
<listcomp>0      zexpand_dims.<locals>.<listcomp>r
   )r   r!   )r   insertrW   r   r   r(   r   rX   r)   create_expand_dimsrd   )ra   r   r!   r   r|   r   r   r   expand_dims/  s   
r  c                 C  sX   |sJ dt | jdksJ t| jj| jd |jd  g}t|| j|j|S )Nz;current implementation of `cat` always may reorder elementsr
   r   )	r   r   r(   r   rW   rX   r)   
create_catrd   )rS   rT   r   r!   ret_typer   r   r   cat:  s   "r	  abc                 C  s   t | ||\} }| jg k}|rt| d|} t|d|}t| jd tjr*td}nd}| j|g }t| jj|}t	|
| j|j|}|rQt|dgd|d}|S )Nr   r   Fr   r!   )rV   r   r  r   r(   	constexprr   rW   rX   r)   create_joinrd   r   )r
  r  r!   
was_rank_1two	new_shaper  r   r   r   r   joinA  s   
r  c                 C  sp   t | jdks	J t| jd dksJ | jd d }t| jj|}|| j\}}t	||t	||fS )Nr   r  r   )
r   r   r(   r   r   rW   rX   create_splitrd   r)   )r
  r!   r  r  outLHSoutRHSr   r   r   splitZ  s   

r  dims
Tuple[int]c                   s~   t  jt |krtdtdd |D ttt |kr%td| t jj	 fdd|D }t
| j||S )Nz5permute dims must have the same length as input shapec                 s  s    | ]}t |V  qd S rO   r   r  dr   r   r   	<genexpr>j  s    zpermute.<locals>.<genexpr>z?permute dims must be a permutation of 0, 1, ..., n-1, but were c                   s   g | ]} j | qS r   r   r  r   r   r   r  m  r  zpermute.<locals>.<listcomp>)r   r   r'   sortedlistr   r(   r   rW   rX   r)   create_transrd   )ra   r  r!   r  r   r   r   permuteg  s   "r!  c                 C  s   | j  st| j |}t|| j||S | j  }t|t|kr.t	d| d| ||kr4| S t
|D ]#\}}|| |kr[|dkr[t	d||  d| d| d| d| 
q8t| j j|}t|| j||S )Nz!Cannot broadcast, rank mismatch: , r
   z3Cannot broadcast, the expanded size of the tensor (z ) must match the existing size (z) at non-singleton dimension : )rW   r   r(   r   r)   r   rd   get_block_shapesr   r'   	enumeraterX   create_broadcast)ra   r   r!   r|   	src_shapeiitemr   r   r   broadcast_impl_shapeq  s,   

r*  c              	   C  sZ  | j }|j }| r'| s't|j|j}t||j|	 |}| |fS | sH| rHt|j|j}t|| j|	 |} | |fS | r)| r)|	 }|	 }t
|t
|k rtt
|t
|D ]}t|| jdt|jdg| } | j }|	 }qkn.t
|t
|k rtt
|t
|D ]}t||jdt|jdg| }|j }|	 }qt
|t
|ksJ g }t|D ]3\}	}
||	 }|
dkr|| q|dks||
kr||
 qtdt|	 d t|
 d t| ||krt|j|}t|| j||} ||kr)t|j|}t||j||}| |fS )Nr   r
   z?Cannot make_shape_compatible: incompatible dimensions at index r#  r   )rW   r   r(   r   rX   r   r)   r   rd   r$  r   r   r  r%  appendr'   strr&  )rS   rT   r!   lhs_tyrhs_ty	lhs_shape	rhs_shape_	ret_shaper(  leftrightr|   r   r   r   rV     sl   +'



rV   rounding_modeOptional[str]c                 C  s<   | d u rd S | dkrt jjS | dkrt jjS td|  d)NrtnertzzInvalid rounding mode: z0. Supported rounding modes are 'rtne' and 'rtz'.)r	   ROUNDING_MODERTNERTZr'   )r5  r   r   r   _str_to_rounding_mode  s   r<  dst_tyc                 C  s   | j }| rt|j| j  }||kr| S |j}|j}| s%| r+t| ||S |j}|j}||krCt	dt
| d t
| t|| j|||S )Nz!Cannot bitcast data-type of size z to data-type of size )rW   r   r(   r   rX   r$  rP   rY   primitive_bitwidthr'   r,  r)   create_bitcastrd   r   )ra   r=  r!   src_ty
src_sca_ty
dst_sca_tysrc_bitsdst_bitsr   r   r   r     s    r   Nfp_downcast_roundingc                 C  s  | j }t|tjr|j}t|tjr|j}| r#t|j| j  }||kr)| S |j}|j}t	|}d}|
 rU|
 rU|j|jk rU|d u rLtjj}n|tjjkrTd}n|d urgtdt| d t| | so| rw|jjswJ d| s| r|jdd usJ d|jd | |||dS | r|
 s|
 r| s|rt|| j||||S | r| r| r| stt| tj |||S |
 o|
 o|j|jk}|rt|!| j|||S |
 o|
 o|j|jk }	|	rt|"| j|||S |# r[|# r[|j$|j$ks'|j%|j%kr[|& o0|'  }
|' rL| j(|}t|)|| j(}t*| ||S t|+| j|||
|S |, r|# r|' r| j(|}t|)|| j(}t*| ||S |& rt|-| j|||S t|.| j|||S |# r|, r|' s|& st|/| j|||S t|0| j|||S |1 r	|# r	|j$}|d	krt|2| j|||S |d
kr	t*t| tj3|t|4dtj3|S |# r!|1 r!t|5| j|||S |1 r9|1 r9t|6| j|||S J d|  d| )NFTz]fp_downcast_rounding should be set only for truncating fp conversions. Source scalar type is z and destination type is z4fp8e4nv data type is not supported on CUDA arch < 89convert_custom_typesz0target doesn't provide conversion for this type.r   @   r
   r   zcannot cast z to )7rW   r   r(   r  r   r   r   rX   r$  r<  rQ   r>  r	   r9  r:  r'   r,  
is_fp8e4nvoptionsallow_fp8e4nvis_fp8e4b15codegen_fnsgetis_fp8r)   create_fp_to_fprd   r   rG   rE   rI   rY   rF   create_fp_trunccreate_fp_extrK   r4   r5   ry   is_boolr6   r   r   create_int_castis_standard_floatingcreate_fp_to_sicreate_fp_to_uicreate_ui_to_fpcreate_si_to_fprP   create_ptr_to_intint64	get_int64create_int_to_ptrr?  )ra   r=  r!   rE  r@  rA  rB  use_custom_roundingtruncate_fpext_fpsign_extendtyr   bitwidthr   r   r   rY     s   






&rY   c                 C  H   t jj}| r"| dkrt jj}|S | dkrt jj}|S td|  d|S )Nz.ca.cgCache modifier  not supported)r	   CACHE_MODIFIERr   CACGr'   cache_modifiercacher   r   r   _str_to_load_cache_modifierQ     rm  c                 C  sp   t jj}| r6| dkrt jj}|S | dkrt jj}|S | dkr$t jj}|S | dkr.t jj}|S td|  d|S )Nz.wbrd  z.csz.wtre  rf  )r	   rg  r   WBri  CSWTr'   rj  r   r   r   _str_to_store_cache_modifier]      	rr  c                 C  rc  )N
evict_lastevict_firstzEviction policy rf  )r	   EVICTION_POLICYNORMAL
EVICT_LASTEVICT_FIRSTr'   )eviction_policyevictionr   r   r   _str_to_eviction_policym  rn  r|  c                 C  sD   d }| r | dkrt jj}|S | dkrt jj}|S td|  d|S )NzeronanzPadding option rf  )r	   PADDING_OPTIONPAD_ZEROPAD_NANr'   )padding_optionpaddingr   r   r   _str_to_padding_optiony  s   r  c                 C  sp   t jj}| r6| dkrt jj}|S | dkrt jj}|S | dkr$t jj}|S | dkr.t jj}|S td|  d|S )Nacquirereleaseacq_relrelaxedMemory semantic rf  )r	   MEM_SEMANTICACQUIRE_RELEASEACQUIRERELEASERELAXEDr'   )
sem_optionsemr   r   r   _str_to_sem  rs  r  c                 C  s\   t jj}| r,| dkrt jj}|S | dkrt jj}|S | dkr$t jj}|S td|  d|S )Ngpuctasysr  rf  )r	   MEM_SYNC_SCOPEGPUCTASYSTEMr'   )scope_optionscoper   r   r   _str_to_scope  s   r  c                 C  s   | rEt | ds
| g} dd | D } | D ]}t|tr(d|  kr't|k s*J  J qt| dks3J t| tt| ksAJ dt| S dS )N__iter__c                 S  "   g | ]}t |tjr|jn|qS r   r   r(   r  r   r  elemr   r   r   r       " z0_canonicalize_boundary_check.<locals>.<listcomp>r   z'Duplicate dimension in `boundary_check`r   )hasattrr   r    r   setr  )boundary_checkblock_shapedimr   r   r   _canonicalize_boundary_check  s   
,r  c	              
   C  s   |d us|d urt d| jjj}	|	tjksJ d|	 r(|tjjkr(t d| jj}
t	||

 }t|| j||||||
S )NK`mask` and `other` arguments cannot be specified for loading block pointers3`tl.int1` should be rewrited in `tl.make_block_ptr`z@Padding option `nan` is not supported for integer block pointers)r'   rW   
element_tyr(   r   rK   r	   r  r  r  r$  r)   create_tensor_pointer_loadrd   )ptrmaskrb   r  r  rl  r{  is_volatiler!   elt_tyr=  r   r   r   _load_block_pointer  s   
r  c	              
   C  s|  | j j std| j   d|d u r|d urtd|s!|r%td| j  s@|r5|j  r5td|r@|j  r@td| j  r_|d urRt|| j  |}|d ur_t|| j  |}| j j}	|	j}
|
t	j
kr{t	j}
t	|
|	j}	t| |	|} |d urt||
|}| j  r| j  }t	|
|}n|
}|d u rt	|| j||||S t	|| j|j|r|jnd ||||S )NUnsupported ptr type z in `tl.load`z)`other` cannot be provided without `mask`z`padding_option` or `boundary_check` argument is not supported for loading a tensor ofpointers or loading a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadEMask argument cannot be block type if pointer argument is not a blockzFOther argument cannot be block type if pointer argument is not a block)rW   rX   rP   r'   r   r   r*  r$  r  r(   r   int8pointer_typeaddress_spacerY   r   r)   create_loadrd   create_masked_load)r  r  rb   r  r  rl  r{  r  r!   ptr_tyr  r   r=  r   r   r   _load_legacy  sH   




r  r  r  Optional[tl.tensor]r  r   r  r,  rk  rz  r  c	              
   C  s^   t |}	t|}
t|}| j r#| jj r#t| |||||	|
||	S t| |||||	|
||	S rO   )	rm  r|  r  rW   rP   r  r   r  r  )r  r  rb   r  r  rk  rz  r  r!   rl  r{  r  r   r   r   load   s   r  desc_ptrc                 C  s<   t ||dd}|| j|||t|t|}t||S NFrequire_i64)_convert_to_ir_valuescreate_descriptor_loadrd   r   rm  r|  r(   r)   )r  offsetsrk  rz  rW   r!   r   r   r   r   descriptor_load  s   r  c                 C  s*   t ||dd}t|| j|j|tjS r  )r  r(   r)   create_descriptor_storerd   void)r  r   r  r!   r   r   r   descriptor_store  s   r  c           	   	   C  s   |d urt d| jj }|j st|||}|j s"J d||j ks7J d| d|j  d| jjj|jjksPJ d| jjj d|jj d| jjj}|tjks^J dt||}t	|||}t
|| j|j|||tjS )	Nr  z-Value argument must be block type or a scalarzBlock shape(z) and value shape(z
) mismatchzBlock element type(z) and value element type(r  )r'   rW   r  r$  r   r*  r(   r   r  rY   r)   create_tensor_pointer_storerd   r  )	r  valr  r  rl  r{  r!   r  r  r   r   r   _store_block_pointer  s"   
2

r  c           	   	   C  s2  | j j std| j   d|rtd| j  s0|j  r%td|r0|j  r0td| j  rKt|| j  |}|d urKt|| j  |}| j j}|j}|t	j
krgt	j}t	||j}t| ||} t|||}|s~t	|| j|j||t	jS |j j stdt	|| j|j|j||t	jS )Nr  z in `tl.store`z`boundary_check` argument is not supported for storing a tensor of pointers or storing a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadzFValue argument cannot be block type if pointer argument is not a blockr  z"Mask must have boolean scalar type)rW   rX   rP   r'   r   r   r*  r$  r  r(   r   r  r  r  rY   r)   create_storerd   r  rR  create_masked_store)	r  r  r  r  rl  r{  r!   r  r  r   r   r   _store_legacy;  s2   



"r  r  c           	      C  sl   t |}t|}| j s| jj rtd| j r,| jj r,t	| ||||||S t
| ||||||S )N"Cannot store to a constant pointer)rr  r|  rW   is_constrX   r'   rP   r  r   r  r  )	r  r  r  r  rk  rz  r!   rl  r{  r   r   r   storeg  s   r  cmpr  r  c              	   C  sN   t |}t|}| jjj}|jdvrtdt|	| j
|j
|j
|||jS )N)   r   rG  z9atomic_cas only supports elements with width {16, 32, 64})r  r  rW   rX   r  r>  r'   r(   r)   create_atomic_casrd   )r  r  r  r  r  r!   r  r   r   r   
atomic_cas}  s   

"r  op&Tuple[tl.tensor, tl.tensor, tl.tensor]c                 C  sB  | j j std| j   | j  s| j j rtd| j jj}|tju r4|dkr4td| d |tj	tj
tjtjfv rLtd| d t| | j  rk|d ur^t|| j  |}|d urkt|| j  |}t|| j jj|}|s|d}tj	}| j  r||| j  }ttj	| j  }t||}| ||fS )Nz)Pointer argument of store instruction is r  rj   atomic_z does not support fp16z does not support T)rW   rX   rP   r'   r   r  r  r(   rH   r   r  int16rJ   r,  r   r*  r$  rY   get_int1r   r   r)   )r  r  r  r  r!   r  mask_irmask_tyr   r   r   atom_red_typechecking_impl  s.   




r  c                 C  s  t | ||d|\} }}t|}t|}|jj}| rG| r3t|	t
jj| j|j|j|||jS t|	t
jj| j|j|j|||jS |tjtjhvrVtd| tg d||}|tjkretjntj}t|||}	t| t|d|}
|tjkrtjntj}t|||}t| t|d|}t|||}t|||}t|	t
jj|
j|	jt|||j|||	j}t|	t
jj|j|jt|||j|||j}t||||}t|||S )Nr   z#atomic_max not supported for dtype         r
   )r  r  r  rW   rX   rK   ry   r(   r)   create_atomic_rmwr	   	ATOMIC_OPMAXrd   UMAXrF   rD   r9   r   r+   rZ  r   r  uint32uint64r   r   r   UMINwherer  r  r  r  r  r!   sca_tyr}  i_typei_vali_ptrui_typeui_valui_ptrposnegpos_retneg_retr   r   r   r   
atomic_max  J     r  c                 C  s  t | ||d|\} }}t|}t|}|jj}| rG| r3t|	t
jj| j|j|j|||jS t|	t
jj| j|j|j|||jS |tjtjhvrVtd| tg d||}|tjkretjntj}t|||}	t| t|d|}
|tjkrtjntj}t|||}t| t|d|}t|||}t|||}t|	t
jj|
j|	jt|||j|||	j}t|	t
jj|j|jt|||j|||j}t||||}t|||S )Nr   z#atomic_min not supported for dtype r  r
   )r  r  r  rW   rX   rK   ry   r(   r)   r  r	   r  MINrd   r  rF   rD   r9   r   r+   rZ  r   r  r  r  r   r   r   r  r  r  r   r   r   
atomic_min  r  r  c              
   C  sj   t | ||d|\} }}t|}t|}|jj}| rtjjntjj	}t
||| j|j|j|||jS )Nrj   )r  r  r  rW   rX   rQ   r	   r  FADDADDr(   r)   r  rd   )r  r  r  r  r  r!   r  r  r   r   r   
atomic_add  s   $r  c              
   C  N   t | ||d|\} }}t|}t|}t|tjj| j	|j	|j	|||j
S )Nand)r  r  r  r(   r)   r  r	   r  ANDrd   rW   r  r  r  r  r  r!   r   r   r   
atomic_and      r  c              
   C  r  )Nor)r  r  r  r(   r)   r  r	   r  ORrd   rW   r  r   r   r   	atomic_or   r  r   c              
   C  r  )Nxor)r  r  r  r(   r)   r  r	   r  XORrd   rW   r  r   r   r   
atomic_xor  r  r  c              
   C  r  )Nxchg)r  r  r  r(   r)   r  r	   r  XCHGrd   rW   r  r   r   r   atomic_xchg  s    r  c                 C  sH   |   |jjv sJ d|jj d|  |  } | dkrd} ttj| S )Nzinput_precision must be one of z. Got TF32X3TF32x3)lowerrI  allowed_dot_input_precisionsupperr   r	   INPUT_PRECISION)input_precisionr!   r   r   r   _str_to_dot_input_precision  s   r  accr  max_num_imprecise_acc	out_dtypec              
   C  s  dd }| j  r|j  sJ || j|j|j | j s#|j r1t| tj|} t|tj|}|d u r9|jj}t	||}t
| j}t
|j}	||	  krRdkskn ||	  kr]dkskn J d| j d|j d| jd j|jd	 jksJ d
| j d|j d| jd j d|jd	 j d	| jd	 jdkr| jd jdkr|jd jdksJ d| j d|j d| j j r| j jtjksJ d| jd jdksJ d|d}
tj}n.| rtd| j j s| j j r|d}
tj}n| r|dn|d}
|}| j jd	 }|j jd }|dkr"| j jd nd }t||r/|||gn||g}|d u rJ||
|rD|||gn||g}n|j}|j |ksUJ |d u rm| j rk|j rk|jj}nd}t|| j|j||||S )Nc                 S  st  |j s*|  s| rJ d|  r| rd S | |ks(J d|  d| dd S |  s2| rU| |ksAJ d|  d| d|  sQ|  sSJ d|  dd S d S |  s]| r{|jreg d	}nd
dg}dd }|| |d |||d d S |  s|  s| 	 s| 
 sJ d|  | s| s|	 s|
 sJ d| | |ksJ d|  d| dd S )Nz1Dot op does not support fp8e4nv on CUDA arch < 90zFirst input (z) and second input (z) must have the same dtype!z0Both operands must be same type. First operand (z) and second operand (r   z:Both operands must be either int8 or uint8. Operand type ()fp8e4nvfp8e5fp8e4b15r  r  c                   s@   t  fdd|D sd|}td| d| d  dd S )Nc                 3  s"    | ]}t  d |  V  qdS )is_N)r   )r  
dtype_namer6   r   r   r  =  s     zLdot.<locals>.assert_dtypes_valid.<locals>._validate_dtype.<locals>.<genexpr>r"  zOnly supports z. z (r   )anyr  AssertionError)r6   allowed_typesoperand_namesupported_typesr   r  r   _validate_dtype<  s   
z9dot.<locals>.assert_dtypes_valid.<locals>._validate_dtypezFirst operandzSecond operandzUnsupported dtype )rJ  rH  rN  rK   is_int8is_uint8allow_fp8e4b15rG   rI   rE   r   )	lhs_dtype	rhs_dtyperI  r  r  r   r   r   assert_dtypes_valid*  s4   "

"""z dot.<locals>.assert_dtypes_validr      z+Both inputs must be either 2D or 3D; (lhs: z	 vs rhs: r   r  zFirst input shape (z) and second input shape z= are not compatible for matmul (second index of first shape (z0) must be equal to first index of second shape (r  z0All non-batch values in both first input shape (z) and second input shape (z) must be >= 16!zonly int8 supported!r
   r   zsmall blocks not supported!r   zhout_dtype=bfloat16 is unsupported. Please use out_dtype=float32/float16 and cast with `.to(tl.bfloat16)`) rW   r   r6   rI  rK  rY   r(   rH   default_dot_input_precisionr  r   r   r   rX   rK   r  	get_int32r+   rI   r'   rE   get_fp32rF   rG   get_fp16r   r   rd   rN  max_num_imprecise_acc_defaultr)   
create_dot)rS   rT   r  r  r  r  r!   r#  lhs_rankrhs_rankr   ret_scalar_tyMNBr|   
acc_handler   r   r   dot'  sh    


F0 

 
"

r3  	conditionc                 C  s   t | tj|} | j r$t| ||\} }t|||\}}t| ||\} }t|||dd\}}| j s;t| ||\} }|j}t|| j	|j	|j	|S )NT)
rY   r(   r   rW   r   rV   r`   r)   create_selectrd   )r4  r   r   r!   r1  r|   r   r   r   r    s   

r  c                 C  s"   |r	t ||}n|}t | |S rO   )r(   r   r)   )r   ro   r2  res_tyr   r   r   wrap_tensor  s   r7  inputsSequence[tl.tensor]Tuple[tl.tensor, ...]c                   s    d u rt fddD d d jjt} |k s'J d| d fddtD tfddD sAJ d	d
d D  |   t fddttD S )Nc                 3  s&    | ]}t ||jjgd  dV  qdS )Tr  N)r   r   r   r  tr!   r   r   r    s   $ zreduction.<locals>.<genexpr>r   z&reduction axis must be < inputs rank (r   c                   s   g | ]
\}}| kr|qS r   r   )r  r(  r   )r   r   r   r    s    zreduction.<locals>.<listcomp>c                 3  s    | ]	}|j j kV  qd S rO   )rW   r   r;  r  r   r   r    s    z-all reduction inputs must have the same shapec                 S     g | ]}|j qS r   rd   r;  r   r   r   r        c                 3  *    | ]}t | | jjV  qd S rO   r7  
get_resultrW   rX   r  r(  )r8  	reduce_opr2  r   r   r       ( )	tuplerW   r   r   r%  allcreate_reduceverifyr   )r8  r   region_builder_fnr!   rankr   )r   r!   r8  rE  r2  r   r   	reduction  s   "rM  reversec                   s    d j jt}| |  kr|k s!n J d| d| d|dk r)||7 } D ]}|j jks7J dq+|dd  D |||   t fdd	tt D S )
Nr   z
scan axis z must be < inputs rank (r   z(all scan inputs must have the same shapec                 S  r>  r   r?  r;  r   r   r   r    r@  z$associative_scan.<locals>.<listcomp>c                 3  rA  rO   rB  rD  r8  scan_opr   r   r   r    rF  z#associative_scan.<locals>.<genexpr>)rW   r   r   create_scanrJ  rG  r   )r8  r   rK  rN  r!   rL  r<  r   rO  r   associative_scan  s   ."rR  num_binsc                 C  sJ   t | jdksJ d| j sJ dt|| j|ttj	|fS )Nr
   z histogram only supports 1D inputz%histogram only supports integer input)
r   r   r6   rK   r(   r)   create_histogramrd   r   r+   )ra   rS  r!   r   r   r   	histogram  s   "rU  valuesc                 C  s@   t dt| jt|krtd| jdt|| j  | S )Nr
   zAShape of input to multiple_of does not match the length of valuesztt.divisibility)	r   r   r   r'   rd   set_attrr	   	make_attrget_contextr   rV  r   r   r   multiple_of  s   r[  c                 C  :   t | jt |krtd| jdt|| j  | S )NzDShape of input to max_contiguous does not match the length of valuesztt.contiguityr   r   r'   rd   rW  r	   rX  rY  rZ  r   r   r   max_contiguous     r^  c                 C  r\  )NzCShape of input to max_constancy does not match the length of valuesztt.constancyr]  rZ  r   r   r   max_constancy  r_  r`  c                 C  s   t |  t jS rO   )r(   r)   create_barrierr  r=  r   r   r   debug_barrier  s   rb  prefixargsList[tl.tensor]hexc                 C  sx   |  ds|r| d7 } |  ds|r| d d d } t| dkr)| ds)d|  } dd |D }t|| ||tjS )N r#  r  r   c                 S  r>  r   r?  )r  argr   r   r   r    r@  z device_print.<locals>.<listcomp>)endswithr   
startswithr(   r)   create_printr  )rc  rd  rf  r!   new_argsr   r   r   device_print  s   rm  condmsg	file_namelinenoc              	   C  sP   | j }| st|jd}t|| jd|} t|| j||||tj	S )N)r
   )
rW   r   r(   r   rX   r)   r   rd   create_assertr  )rn  ro  rp  	func_namerq  r!   cond_tyr   r   r   device_assert  s
   ru  c                 C  s  t |tr
t|}t |tjrH|r-d|j  krdk s'n J d|j d| |jS d|j  kr8dk sBn J d|j d| |jS t |tjr|jjdksXJ d	|j	
 saJ d
|j	tjkrv|rv| |j|  |j	 S |j	tjkr|sJ d|jS J dt| )Nl         l            z@Block pointers only support 64 bit `shape/strides`, got a value z which is out of the range           zFBlock pointers only support 32 bit `offsets/block_shape`, got a value r
   z*Expected a scalar in shape/strides/offsetsz8Expected an integer scalar type in shape/strides/offsetsFzzBlock pointers only support 32 bit `offsets/block_shape`, add a `.to(tl.int32)` or use regular indexing for 64 bit supportz3Unsupported element type in shape/strides/offsets: )r   r    r(   r  r   r[  r'  r)   r   r6   rK   rZ  rS  rd   get_int64_tyry   r+   rW   )r!   r  r  r   r   r   _convert_elem_to_ir_value  s*   



ry  c                   s,   t |dr fdd|D S t |gS )Nr  c                   s   g | ]}t  |qS r   )ry  r  r!   r  r   r   r  %  s    z)_convert_to_ir_values.<locals>.<listcomp>)r  ry  )r!   	list_liker  r   rz  r   r  #  s   
r  basec              	     s8  t ||}t ||}t ||dd}| j r| jj r td| jjtjkr4t| t	tj
| jj|} t ds< g dd  D  tdd  D sPJ d	t|dsX|g}d
d |D }t|ttt|ksoJ dt fdd||||fD sJ d|| j||| |}t|t	t| jj S )NFr  zMExpected `base` to be a pointer type (but not a block pointer type or others)r  c                 S  r  r   r  r  r   r   r   r  ;  r  z"make_block_ptr.<locals>.<listcomp>c                 s  s2    | ]}t |tod |  kodk n  V  qdS )rv  rw  N)r   r    r  r   r   r   r  <  s   0 z!make_block_ptr.<locals>.<genexpr>zGExpected a list of constant integers (`int32_t` range) in `block_shape`c                 S  r  r   r  r  r   r   r   r  B  r  z<Expected a permutation of (0, 1, ..., len(order)-1) in orderc                 3  s     | ]}t  t |kV  qd S rO   )r   )r  r{  r  r   r   r  F  s    zBExpected shape/strides/offsets/block_shape to have the same length)r  rW   rP   r  r   r'   r(   r   rY   r  r  r  r  rH  r  r  r   r   create_make_block_ptrrd   r)   r   )r|  r   stridesr  r  orderr!   rd   r   r}  r   make_block_ptr)  s,   



  r  c                 C  s&   t ||dd}t|| j|| jS r  )r  r(   r)   create_advancerd   rW   )r|  r  r!   r   r   r   advanceP  s   r  )r   r    r!   r"   r#   r$   )r1   r2   r3   r2   r#   r2   )r1   r2   r3   r2   r?   r@   r#   r2   )r   r2   r   r2   rM   r@   r#   rN   )FFTF)rS   r$   rT   r$   r!   r"   r#   rU   )ra   r$   rb   r$   r!   r"   r#   r$   )
ra   r$   rb   r$   r~   r@   r!   r"   r#   r$   )r   r$   r   r$   r   r   r!   r"   )
r   r$   r   r$   r   r$   r   r   r!   r"   )ra   r$   rb   r$   r!   r"   r#   rU   )ra   r$   r!   r"   )ra   r$   r#   r$   )ra   r$   r!   r"   r#   r$   )ra   r$   r!   r$   r#   r$   )r   r$   r#   r   )r   r    r   r    r!   r"   r#   r$   )r   r   r6   r2   r!   r"   r#   r$   )r   r$   r   r   r!   r"   r#   r$   )
ra   r$   r   r   r   r@   r!   r"   r#   r$   )ra   r$   r   r    r!   r"   r#   r$   )
rS   r$   rT   r$   r   r@   r!   r"   r#   r$   )r
  r$   r  r$   r!   r"   r#   r$   )r
  r$   r!   r"   r#   rU   )ra   r$   r  r  r!   r"   r#   r$   )ra   r$   r   r   r!   r"   r#   r$   )rS   r$   rT   r$   r!   r"   r#   r$   )r5  r6  )ra   r$   r=  r2   r!   r"   r#   r$   rO   )
ra   r$   r=  r2   r!   r"   rE  r6  r#   r$   )r  r$   r  r  rb   r  r  r   r  r,  rk  r,  rz  r,  r  r@   r!   r"   r#   r$   )
r  r$   rk  r,  rz  r,  r!   r"   r#   r$   )r  r$   r   r$   r!   r"   r#   r$   )r  r$   r  r$   r  r  rk  r,  rz  r,  r!   r"   r#   r$   )r  r$   r  r$   r  r$   r  r,  r  r,  r!   r"   r#   r$   )r  r$   r  r$   r  r$   r  r,  r!   r"   r#   r  )r  r$   r  r$   r  r$   r  r,  r  r,  r!   r"   r#   r$   )rS   r$   rT   r$   r  r$   r  r6  r  r    r  r2   r!   r"   r#   r$   )
r4  r$   r   r$   r   r$   r!   r"   r#   r$   )r8  r9  r   r    r!   r"   r#   r:  )
r8  r9  r   r    rN  r@   r!   r"   r#   r:  )ra   r$   rS  r    r!   r"   r#   r$   )r   r$   rV  r   r#   r$   )r!   r"   r#   r$   )
rc  r,  rd  re  rf  r@   r!   r"   r#   r$   )rn  r$   ro  r,  rp  r,  rq  r    r!   r"   r#   r$   )T)r|  r$   r!   r"   r#   r$   )l
__future__r   typingr   r   r   r   r   _C.libtritonr	    r   r(   r   r   	Exceptionr   r-   r0   r>   rL   rR   r`   rj   rp   rt   rx   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rk   r   r   r   r   r   r   r   r   r   r   r   r   r  r	  r  r  r!  r*  rV   r<  r   rY   rm  rr  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r3  r  r7  rM  rR  rU  r[  r^  r`  rb  rm  ru  ry  r  r  r  r   r   r   r   <module>   s    (



:
w:	,	''		`		'