o
    灛inF                     @   s   d Z ddlZddlZddlmZmZ ddlmZ dd Zedej	d	ej	d
ej	dej	fddZ
edej	dej	fddZedej	d	ej	d
ej	dej	dej	dej	fddZedej	d	ej	d
ej	dej	dej	dej	fddZG dd dejjZejZdS )ao  
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)

Sequence Parallel implementation inspired by HazyResearch
(see https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py)
    N   )cdivjit)languagec                   C   s   t jjj jdkS )Nhip)tritonruntimedriveractiveget_current_targetbackend r   r   \/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/triton/ops/flash_attention.pyis_hip   s   r   BLOCK_MBLOCK_DMODELBLOCK_N	IS_CAUSALc           8   	   C   s  t d}t d}|| } | | }!t j|||f||fd|!f||fdd}"t j|||f||f|!df||fdd}#|| t d| }$t d|}%t j|gt jdtd }&t j|gt jd}'t j||gt jd}(|d })t d|}*| |  |$d d d f |  |*d d d f |	  }+t |+},|,|) |j	j
},d}-|r|d | n|}.t|-|.|D ]}/t |"}0t |#}1t j||gt jd}2|rt |$d d d f |/|%d d d f  k|2td	}2|2t |,|07 }2t |&t |2d}3t j|&|3 }4t j|2|3d d d f  }5|(|4d d d f 9 }(|(t |5|j	j
|17 }(|'|4 t |5d }'|3}&t |"d|f}"t |#|df}#q|(|'d d d f  }(|||  |$ }6t |6|&t j|'  t j|||f||f|!||  df||fdd}7t |7|(|j	j
 d S )
Nr      )r   r   baseshapestridesoffsetsblock_shapeorderr   r   dtypeinf/ldG?-inf)tl
program_idmake_block_ptrarangezerosfloat32floatloadtor   
element_tyrangewheredotmaximummaxmathexp2sumadvancestorelog2)8QKVsm_scaleLOut	stride_qz	stride_qh	stride_qm	stride_qk	stride_kz	stride_kh	stride_kn	stride_kk	stride_vz	stride_vh	stride_vn	stride_vk	stride_oz	stride_oh	stride_om	stride_onZHN_CTX	Z_H_N_CTXr   r   r   r   start_moff_hz
qvk_offset	vk_offsetK_block_ptrV_block_ptroffs_moffs_nm_il_iaccqk_scaleoffs_kQ_ptrsqlohistart_nkvqkm_i_newalphapl_ptrsO_block_ptrr   r   r   _fwd_kernel   sx   

	0


2	rk   D_HEADc           
      C   s   t d| t d| }t d|}t | |d d d f |  |d d d f  t j}t ||d d d f |  |d d d f  t j}t j|| dd}	t || |	 d S )Nr   r   )axis)r"   r#   r%   r)   r*   r'   r3   r5   )
r<   DODeltar   rl   off_moff_nododeltar   r   r   _bwd_preprocessu   s   66ru   SEQUENCE_PARALLELCAUSALMMA_V3c.           F   	   C   s  |,r|&|( }.nd}.|$| |#|  | }/|$| |#|  }0|$| |#|  | }1|$| |#|  | }2|+r7|0||& 7 }0|0| }0t ||.|/ df}t ||&|( |1 df}t ||&|( |2 df}t ||.|/ df}t ||.|0 df}t ||&|( |1 df}t ||&|( |2 df}|&|( t d|( }3t d|*}4||%|"  }5|
|%|"  }6t j|(|)gt jd}7t j|(|)gt jd}8t |}9t |}:t|.|'|( |(D ]};|;|4 }<t |}=|,rt |<d d d f |3d d d f ktdtd}>n
t j|(|*gt jd}>|>t 	|=t 
|97 }>|>|9 }>t |6|< }?t j|>|?d d d f  }@t |}A|7t 	t 
|@| jj|A7 }7t |5|< }Bt 	|At 
|:}C|@|C|Bd d d f   | | jj}D|8t 	t 
|D|=7 }8|+st |}E|Et 	|D|97 }Et ||E| jj n'|+r|-rt 	|D|9}Ent 
t 	t 
|9t 
|D}Et ||E| jj t ||(df}t ||(df}t ||(df}qt ||7|jj t ||8|jj d S )Nr   r   g        r!   )r"   r4   r%   r&   r'   r)   r,   r-   r(   r.   transr1   r2   r*   r   r+   r5   )Fr7   r8   r9   r:   r\   r<   rn   DQDKDVr;   DQ_block_ptrrU   rV   DO_block_ptrDQ_block_ptrDK_block_ptrDV_block_ptr
stride_dqar=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rM   rN   rO   off_hoff_zrR   rb   	num_blockr   r   r   rv   rw   rx   r`   Q_offset	DQ_offsetK_offsetV_offsetrX   rW   D_ptrsri   dvdkrc   rd   rQ   offs_m_currr_   re   rZ   rh   rs   Didpdsdqr   r   r   _bwd_kernel_one_col_block   sn   



4
 &
r   c#           0   
   C   s  |d }#t d}$|$| }%|$| }&t j| ||f||fd||fdd}'t j|||f||fd||fdd}(t j|||f||fd||fdd})t j|||f||fd||fdd}*| rit j|||f||fd||fdd}+nt j|||f||fd||fdd}+t j|||f||fd||fdd},t j|||f||fd||fdd}-t ||}.| std|.D ]_}/tg | ||||#||||||	|
|'|(|)|*|+|,|-|||||||||||||||||&|%|$|/|.R |||| |!|"d qd S t d}/tg | ||||#||||||	|
|'|(|)|*|+|,|-|||||||||||||||||&|%|$|/|.R |||| |!|"d d S )Nr    r   )r   r   r   r   )r   r   r   rv   rw   rx   r   )r"   r#   r$   r   r,   r   )0r7   r8   r9   r:   r<   rn   rz   r{   r|   r;   r}   r   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rM   rN   rO   rP   SQ_Z_H_N_CTXr   r   r   rv   rw   rx   r\   rR   r   r   r~   rU   rV   r   r   r   r   num_block_nrb   r   r   r   _bwd_kernel   s  
			 			






 			




r   c                   @   s&   e Zd ZedddZedd ZdS )
_attentionFc                 C   s  t j }|d dk rtdd}d}	|jd |jd |jd }
}}|
|kr,||ks.J |dv s4J t |}t|jd ||jd |jd	  d	f}t j|jd |jd	  |jd f|jt j	d
}|dkridnd}t
| |||||||d|d	|d|d|d|d	|d|d|d|d	|d|d|d|d	|d|d|jd |jd	 |jd |jd |jd	  |jd  f||	|||dd | ||||| || _|| _|| _|| _|| _|S )Nr      zEFlash attention currently only supported for compute capability >= 80   @   >          r   r   r   r   devicer         )r   r   r   r   	num_warps
num_stages)torchcudaget_device_capabilityRuntimeErrorr   
empty_liker   emptyr   r'   rk   stridesave_for_backwardgridr:   r   causalsequence_parallel)ctxr_   rc   rd   r   r:   r   
capabilityr   r   LqLkLvrr   r   r;   r   r   r   r   forwardr  sF   
"
&.    
z_attention.forwardc                  C   s  t j }|d dk}d}t rd}| j\}}}}}	| j}
|jd }| }|
r>t||}|f|j }t j	||j
|jd}nt j||jd}t |}t |}t |	}tt|jd || jd  f ||||| jd	 t| jd |
ryt||ndf |||| j||||||	|| |d|d|d|d
|d|d|d|d
|d|d|d|d
|jd |jd |jd |jd |jd  |jd  t|||jd  |jd  |jd  f||| j|
| j|ddd t|jdkr|jdd}|||d d d fS )Nr   	   r   r   r   r   r   r   )r   rl   r   r   )r   r   r   rv   rw   rx   r   r      )dim)r   r   r   r   saved_tensorsr   r   
contiguousr   r&   r   r   
zeros_liker   ru   r   r   r   r:   numelr   r   lenr3   )r   rs   r   rx   BLOCKr_   rc   rd   rr   r;   r   
seq_len_kvreplicasnew_dq_shaper   r   r   rt   r   r   r   backward  s`   






&  &z_attention.backwardN)F)__name__
__module____qualname__staticmethodr   r   r   r   r   r   r   p  s
    'r   )__doc__r   r    r   r   r   r"   r   	constexprrk   ru   r   r   autogradFunctionr   apply	attentionr   r   r   r   <module>   sj    
		
^c 
b