o
    i                     @   sj   d dl Z ddlmZ ddlmZ 						dde jjde jde jde jd	e jdB d
ede jfddZdS )    N   )PagedAttentionCache)!lazy_import_paged_flash_attentionmoduleqkvattention_maskcachereturnc
                 K   s  t | jj}t| ddsdn| jd df}|dkrdnd}|dur2|j||| j|
d	 |
d
 d\}}t|tr?|| }|	| }	d|
v rJd|
	dini }||
ddd | | |tj|tj ||	f| jd|d|}t|tr|d }|dfS )a  Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full k
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full v
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    sliding_windowF)r      r   full_attentionsliding_attentionN
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   s_auxr   T)softmax_scalecausalwindow_size)r   config_attn_implementationgetattrr   updater   
isinstancedictget	transposesqueeze
contiguoustotorchint32clonescalingtuple)r   r   r   r   r	   r
   cu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kkwargsflash_attn_varlen_funcr   
layer_typecustom_kwargsattn_output r3   g/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/flash_paged.pypaged_attention_forward   sB   #

	

r5   )NNNNNN)	r%   generation.continuous_batchingr   modeling_flash_attention_utilsr   nnModuleTensorr5   r3   r3   r3   r4   <module>   s0    