o
    iu'                     @   s  d Z ddlZddlmZ ddlmZmZ ddlZee	Z
ddgZeddd	ed
efddZG dd deZejjdi d	d3dejdejdejdejdejdededed
eejejejf fddZej	d3dejdejdejdejdejdededed
eejejejf fddZ		d4dejdejdejdejdejdededededB d
ejeejejf B fddZded eed!f d"ed
dfd#d$Zejjd%i dd&ejdejdejdejd'ejd(ejdejdejdededed)ejd
eejejejf fd*d+Zejd&ejdejdejdejd'ejd(ejdejdejdededed)ejd
eejejejf fd,d-Zded&ejd.ejd/ejd
eejdB d!f f
d0d1Zejeed2 dS )5z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuplevarlen_attn
AuxRequest   )maxsizedevice_indexreturnc                 C   s   dS )z;Cache device capability check to avoid repeated CUDA calls.F )r	   r   r   [/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/torch/nn/attention/varlen.py_should_use_cudnn   s   r   c                   @   s   e Zd ZU dZdZeed< dS )r   z
    Request which auxiliary outputs to compute from varlen_attn.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    FlseN)__name__
__module____qualname____doc__r   bool__annotations__r   r   r   r   r      s   
 ztorch_attn::_varlen_attn)mutates_argsFquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalc                 C   s   | j ot| jj}|r1td tjj	| ||d||||dd|d}	|	d |	d |	d }
}}ntd	 tjjj
| ||||||d|dd

\}
}}}}tjdtj| jd}|
||fS )z
    Private custom op for variable-length attention.

    This is the internal implementation. Users should use the public varlen_attn function instead.
    #Using cuDNN backend for varlen_attnNT        Fr         -Using Flash Attention backend for varlen_attn)return_debug_mask   dtypedevice)is_cudar   r(   indexloginfotorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r   r   r   r   r   r   r   r   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_r   r   r   _varlen_attn$   sF   



r;   c                 C   sT   t | }| d}	| d}
t j|
|	ft j| jd}t jdt j| jd}|||fS )z
    Fake implementation for meta tensor computation and tracing.

    Based on the 3D varlen path from meta__flash_attention_forward:
    - query shape: (total, num_heads, head_dim)
    - logsumexp shape: (num_heads, total_q)
    r   r    r&   r$   )r-   
empty_likesizeemptyfloatr(   r3   )r   r   r   r   r   r   r   r   r6   total_q	num_heads	logsumexpr8   r   r   r   _varlen_attn_fake^   s   



rC   
return_auxc	              
   C   s<   t jj| |||||||\}	}
}|dur|jr|	|
fS |	S )a9  
    Compute variable-length attention using Flash Attention.
    This function is similar to scaled_dot_product_attention but optimized for
    variable-length sequences using cumulative sequence position tensors.
    Args:
    - query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
    - key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
    - value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
    - cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
    - cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
    - max_q (int): Maximum query sequence length in the batch.
    - max_k (int): Maximum key/value sequence length in the batch.
    - is_causal (bool, optional): If set to True, applies causal masking (default: False).
    - return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.

    Shape legend:
    - :math:`N`: Batch size
    - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
    - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
    - :math:`H`: Number of attention heads
    - :math:`D`: Head dimension

    Returns:
    - Tensor: Output tensor from attention computation
    - If ``return_aux`` is not None and ``return_aux.lse`` is True, returns a tuple of Tensors:
    (output, lse), where lse is the logsumexp

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
        >>> head_dim = embed_dim // num_heads
        >>> seq_lengths = []
        >>> for _ in range(batch_size):
        ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
        ...     seq_lengths.append(min(length, max_seq_len))
        >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
        >>> total_tokens = seq_lengths.sum().item()
        >>>
        >>> # Create packed query, key, value tensors
        >>> query = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> key = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> value = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>>
        >>> # Build cumulative sequence tensor
        >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
        >>> cu_seq[1:] = seq_lengths.cumsum(0)
        >>> max_len = seq_lengths.max().item()
        >>>
        >>> # Call varlen_attn
        >>> output = varlen_attn(
        ...     query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False
        ... )
    N)r-   r.   
torch_attnr;   r   )r   r   r   r   r   r   r   r   rD   outr   r9   r   r   r   r      s   G
ctxinputs.r6   c              
   C   sL   |\}}}}}}}	}
|\}}}|  |||||||| || _|	| _|
| _d S N)save_for_backwardr   r   r   )rG   rH   r6   r   r   r   r   r   r   r   r   rF   r   r8   r   r   r   _setup_context   s   

rK   z!torch_attn::_varlen_attn_backwardgrad_outrF   r   r8   c                 C   s   t jd|jd}|jot|jj}|r0td t jj	
| |||||||||	d|
||\}}}ntd t jj	| |||||||||	d|
||\}}}|||fS )Nr   )r(   r   r   r"   )r-   r>   r(   r)   r   r*   r+   r,   r.   r/   _cudnn_attention_backward_flash_attention_backward)rL   r   r   r   rF   r   r   r   r   r   r   r8   unusedr4   dqdkdvr   r   r   _varlen_attn_backward   sL   



rS   c                 C   s(   t |}t |}t |}|||fS )zF
    Fake implementation for meta tensor computation and tracing.
    )r-   r<   )rL   r   r   r   rF   r   r   r   r   r   r   r8   
grad_querygrad_key
grad_valuer   r   r   _varlen_attn_backward_fake  s   



rW   grad_lsegrad_rngc                 C   sh   | j \}}}}}}	}
}| j}| j}| j}tjj|||||	|
||||||\}}}|||d d d d d d f	S rI   )saved_tensorsr   r   r   r-   r.   rE   rS   )rG   rL   rX   rY   r   r   r   r   r   rF   r   r8   r   r   r   rP   rQ   rR   r   r   r   	_backward,  s&   
r[   )setup_context)F)FN)r   logging	functoolsr   typingr   r   r-   	getLoggerr   r+   __all__intr   r   r   library	custom_opTensortupler;   register_fakerC   r   rK   rS   rW   r[   register_autogradr   r   r   r   <module>   s    

		9		(	

"O	
8	

