o
    i                     @   sn   d dl Z d dl mZ ddlmZ de jdede jfdd	Zd
ejde jde jde jde jdB defddZ	dS )    N)nn   )PagedAttentionCachehidden_statesn_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
       N)shapeexpandreshape)r   r   batchnum_key_value_headsslenhead_dim r   g/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/eager_paged.py	repeat_kv   s
   0r   modulequerykeyvalueattention_maskscalingc                 K   s  | dd }|d ur-|j||| j|d |d d\}}|ddd}|ddd}t| dr>t|| j}t|| j}t|t	rZt
| dd}|dksQ|d u rSd	nd
}	||	 }
n|}
t||dd| }|
d urp||
 }t| dr| jdddd|jd d|jd d}tj||gdd}||jdddj }tjj|dtjd|j}|dd df }ntjj|dtjd|j}t||}|dd }||fS )Ncache
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   r   r   num_key_value_groupssliding_windowfull_attentionsliding_attentionr      sinks)dimT)r'   keepdim)r'   dtype.)popupdater   	transpose	unsqueezehasattrr   r   
isinstancedictgetattrtorchmatmulr$   r   r
   r	   catmaxvaluesr   
functionalsoftmaxfloat32tor)   
contiguous)r   r   r   r   r   r   kwargsr   r    
layer_typecausal_maskattn_weightsr$   attn_outputr   r   r   eager_paged_attention_forward   s@   





*rA   )
r2   r   $generation.continuous_batching.cacher   Tensorintr   ModulefloatrA   r   r   r   r   <module>   s"    