o
    iu	                     @   s   d dl Z ddlmZ de jdede jfddZ			dd
e jjde jde jde jde jdB dededB de	e jdf fddZ
dS )    N   )PagedAttentionCachehidden_statesn_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
       N)shapeexpandreshape)r   r   batchnum_key_value_headsslenhead_dim r   f/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/sdpa_paged.py	repeat_kv   s
   0r           modulequerykeyvalueattention_maskdropoutscalingc              	   K   s   | dd }|d ur-|j||| j|d |d d\}}|ddd}|ddd}t| dr>t|| j}t|| j}|}	| }| }| }t	j
jj||||	||dd	}
|
dd
 }
|
d fS )Ncache
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   r   r   num_key_value_groupsF)	attn_mask	dropout_pscale	is_causalr   )popupdater   	transpose	unsqueezehasattrr   r    
contiguoustorchnn
functionalscaled_dot_product_attention)r   r   r   r   r   r   r   kwargsr   causal_maskattn_outputr   r   r   sdpa_attention_paged_forward   s:   


r2   )r   N)r+   $generation.continuous_batching.cacher   Tensorintr   r,   Modulefloattupler2   r   r   r   r   <module>   s,    	