o
    iP                     @   sx  U d dl Z d dlmZmZmZ ddlmZ d dlmZm	Z	 d dl
mZ d dl
mZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlZddgZe	dZedZe jjZdd Zi Zeeef e d< dd Z!dVdeeeef geeef f fddZ"e"ej#ddde$fddZ%e"ej&dWde$fddZ'e"ej(dWde$fddZ)e"ej*dWde$fd d!Z+e"ej,					dXde$fd"d#Z-	dVd$e.e$ d%e.e$ d&e.e$ d'e/de$f
d(d)Z0e"ej1ej2ej3ej4ej5gddde$fd*d+Z6e"ej7de$fd,d-Z8d.d/ Z9e"ej:ej;ej<gddde$fd0d1Z=d2d3 Z>dd4dee?e?e$d5f e?e$d5f e?e$d5f e?e$d5f dB f  fd6d7Z@dd4dee?e?e$d5f e?e$d5f e?e$d5f e?e$d5f dB f  fd8d9ZAe"ejBd:d;ddde$fd<d=ZCe"ejDd:d;de$fd>d?ZEd@dA ZFe"ejGejHejIgddde$fdBdCZJe"ejKd:d;de$fdDdEZLe"ejMd:d;de$fdFdGZNi ej#e%ej&e'ej(e)ej*e+ej,e-ej1e6ej2e6ej3e6ej5e6ej4e6ej7e8ej:e=ej;e=ej<e=ejGeJejHeJejIeJejBeCejDeEejKeLejMeNiZdHdI ZOg dJZPdKdL ZQdMdN ZRdeSfdOdPZTdQdR ZUG dSd dZVG dTdU dUeZWdS )Y    N)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyTypeVar)Callable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_Pc                 C   s   t | tjr	| jS | S N)
isinstancetorchTensorshape)i r   Z/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/torch/utils/flop_counter.py	get_shape   s   r   flop_registryc                    s   t  d d fdd
}|S )N)out_valc                    s(   t t||| f\}}} |d|i|S )N	out_shape)r   r   )r    argskwargsr!   fr   r   nf   s   zshape_wrapper.<locals>.nfr   r%   r&   r   r$   r   shape_wrapper   s   r(   Freturnc                    s,   dt ttf dt ttf f fdd}|S )Nflop_formular)   c                    s.   st   d fdd}tjj|  S )Nr)   c                    sH   t | tjjstd|  dt|  | tv rtd|   t| < d S )Nzlregister_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), got z which is of type zduplicate registrations for )r   r   _opsOpOverloadPacket
ValueErrortyper   RuntimeError)targetr*   r   r   register)   s   z=register_flop_formula.<locals>.register_fun.<locals>.register)r)   N)r(   r   utils_pytree	tree_map_)r*   r2   get_rawtargetsr1   r   register_fun%   s
   z+register_flop_formula.<locals>.register_fun)r	   r   r   )r8   r7   r9   r   r6   r   r   $   s   ()r!   c          	      O   s<   | \}}|\}}||krt d| d| || d | S )zCount flops for matmul.z3matmul: inner dimensions must match (k == k2), got  and    AssertionError)	a_shapeb_shaper!   r"   r#   mkk2nr   r   r   mm_flop:   s
   rD   c                 K   
   t ||S )zCount flops for addmm.rD   
self_shaper>   r?   r!   r#   r   r   r   
addmm_flopF   s   
rI   c                 K   sd   | \}}}|\}}}	||krt d| d| ||kr&t d| d| || |	 d | }
|
S )z"Count flops for the bmm operation.z0bmm: batch dimensions must match (b == b2), got r:   z0bmm: inner dimensions must match (k == k2), got r;   r<   )r>   r?   r!   r#   br@   rA   b2rB   rC   flopr   r   r   bmm_flopK   s   

rM   c                 K   rE   )z&Count flops for the baddbmm operation.)rM   rG   r   r   r   baddbmm_flopZ   s   
rN   c	           
      K   s
   t | |S )zCount flops for _scaled_mm.rF   )
r>   r?   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr!   r#   r   r   r   _scaled_mm_flopa   s   
rU   x_shapew_shaper!   
transposedc           
      C   sL   | d }|r| n|dd }|^}}}	 t |t | | | | d }	|	S )a  Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    r   r;   Nr   )
rV   rW   r!   rX   
batch_size
conv_shapec_outc_infilter_sizerL   r   r   r   conv_flop_countr   s   
 r^   c          
      O   s   t | |||dS )zCount flops for convolution.rX   )r^   )
rV   rW   _bias_stride_padding	_dilationrX   r!   r"   r#   r   r   r   	conv_flop   s   rd   c                 C   s   dd }d}	 |
d rt |d }|t| ||| 7 }|
d rIt |d }|r9|t|| ||||dd7 }|S |t|||| ||dd7 }|S )Nc                 S   s    | d | d gt | dd   S )Nr   r   r;   )list)r   r   r   r   t   s    zconv_backward_flop.<locals>.tr   r   Fr_   )r   r^   )grad_out_shaperV   rW   r`   ra   rb   rc   rX   _output_padding_groupsoutput_maskr!   rf   
flop_countgrad_input_shapegrad_weight_shaper   r   r   conv_backward_flop   s   F  rn   c                 C   s   | \}}}}|\}}}	}
|\}}}}||  kr|kr<n t d||  kr+|kr<n t d||
kr<|	|kr<||
ks@t dd}|t|| ||f|| ||	f7 }|t|| ||	f|| |	|f7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    z8sdpa_flop_count: query/key/value shapes are incompatibler   r=   rM   )query_shape	key_shapevalue_shaperJ   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopsr   r   r   sdpa_flop_count  s   ""r   c                O   s   t | ||S )Count flops for self-attention.r   )rp   rq   rr   r!   r"   r#   r   r   r   	sdpa_flop  s   r   c                 C   sR   ddl m} ddlm} t| ||fs| jjdkr|   S |g| 	dd  S )z
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r   devicer.   difftolistsize)offsetsmax_lenr   r   r   r   r   _offsets_to_lengths'  s
   r   )grad_out.c                 c   sB   |durt |jdkrtdt |jdkrtd|dur)|j| jkr)td| j\}}	}
|j\}}}|j\}}}|du rCtd|du rKtd|j|jkrUtdt||}t||}t||d	d
D ]%\}}d|	||
f}d|||f}d|||f}|dur|nd}||||fV  qfdS | j|j|j|dur|jndfV  dS )a;  
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   z7sdpa_flop_count: expected key.shape to be 3-dimensionalz9sdpa_flop_count: expected value.shape to be 3-dimensionalzDsdpa_flop_count: grad_out.shape must match query.shape when providedz+sdpa_flop_count: cum_seq_q must not be Nonez+sdpa_flop_count: cum_seq_k must not be NonezAsdpa_flop_count: cum_seq_q and cum_seq_k must have the same shapeTstrictr   lenr   r=   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_h_qru   h_kd_kh_vr}   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shaper   r   r   %_unpack_flash_attention_nested_shapes3  s6   

&r   c                 c   sH   |durt |jdkrtdt |jdkrtd|dur)|j| jkr)td| j\}}}	}
|j\}}}}|j\}}}}|du rFtd|du rNtd|j|jkrXtdt||}t||}t||d	d
D ]%\}}d|	||
f}d|||f}d|||f}|dur|nd}||||fV  qidS | j|j|j|dur|jndfV  dS )a?  
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   zQ_unpack_efficient_attention_nested_shapes: expected key.shape to be 4-dimensionalzS_unpack_efficient_attention_nested_shapes: expected value.shape to be 4-dimensionalz^_unpack_efficient_attention_nested_shapes: grad_out.shape must match query.shape when providedzH_unpack_efficient_attention_nested_shapes: cu_seqlens_q must not be NonezH_unpack_efficient_attention_nested_shapes: cu_seqlens_k must not be Noneza_unpack_efficient_attention_nested_shapes: cu_seqlens_q and cu_seqlens_k must have the same shapeTr   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   ru   r   r   r   r}   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   r   r   r   )_unpack_efficient_attention_nested_shapesg  s6   

&r   T)r7   c             	   O   s(   t | ||||||d}
tdd |
D S )r   )r   r   r   r   r   r   r   c                 s   $    | ]\}}}}t |||V  qd S r   r   .0rp   rq   rr   r   r   r   r   	<genexpr>  
    


z0_flash_attention_forward_flop.<locals>.<genexpr>r   sum)r   r   r   r   r   r   r   r!   r"   r#   sizesr   r   r   _flash_attention_forward_flop     	r   c              	   O   s(   t | ||||||d}
tdd |
D S )r   )r   r   r   r   r   r   r   c                 s   r   r   r   r   r   r   r   r     r   z4_efficient_attention_forward_flop.<locals>.<genexpr>r   r   )r   r   r   biasr   r   r   r   r"   r#   r   r   r   r   !_efficient_attention_forward_flop  r   r   c                 C   sf  d}|\}}}}|\}	}
}}|\}}}}| \}}}}||	  kr)|  kr)|krFn t d||
  kr=|  kr=|krFn t d||ksJt d||krV||krV||ksZt dd}|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|S )Nr   zFsdpa_backward_flop_count: batch/heads/dimension mismatch among tensorszJsdpa_backward_flop_count: grad_out/value/key/query shapes are incompatiblero   )rg   rp   rq   rr   r~   rJ   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   _b4_h4_s4_d4r   r   r   sdpa_backward_flop_count  s(     """""r   c                O   s   t | |||S )z(Count flops for self-attention backward.r   )rg   rp   rq   rr   r!   r"   r#   r   r   r   sdpa_backward_flop  s   r   c
              
   O   *   t |||| ||||	d}tdd |D S )N)r   r   r   r   r   r   r   r   c                 s   &    | ]\}}}}t ||||V  qd S r   r   r   rp   rq   rr   rg   r   r   r   r     
    

z1_flash_attention_backward_flop.<locals>.<genexpr>r   )r   r   r   r   out	logsumexpr   r   r   r   r"   r#   shapesr   r   r   _flash_attention_backward_flop     
r   c
              
   O   r   )N)r   r   r   r   r   r   r   r   c                 s   r   r   r   r   r   r   r   r   >  r   z5_efficient_attention_backward_flop.<locals>.<genexpr>r   )r   r   r   r   r   r   r   r   r   r   r"   r#   r   r   r   r   "_efficient_attention_backward_flop#  r   r   c                 C   s   t | ts| fS | S r   )r   tuple)xr   r   r   normalize_tuple\  s   
r   ) KMBTc                 C   s0   t dtttd tt| d d }t| S )Nr   r   r;   r   )maxminr   suffixesstr)numberindexr   r   r   get_suffix_stre  s   (r   c                 C   s&   t |}| d|  d}|t |  S )Ni  z.3f)r   r   )r   suffixr   r   r   r   r   convert_num_with_suffixl  s   
r   c                 C   s   |dkrdS | | dS )Nr   0%z.2%r   )numdenomr   r   r   convert_to_percent_strs  s   r   c                    s   t   fdd}|S )Nc                    s   t | \}} | }t||S r   )r   r   )r"   	flat_argsspecr   r$   r   r   r&   y  s   
z)_pytreeify_preserve_structure.<locals>.nfr   r'   r   r$   r   _pytreeify_preserve_structurex  s   r   c                       s   e Zd ZdZ				ddejjeejj B dB dede	de
eef dB d	df
 fd
dZd	efddZd	e
ee
eef f fddZdddZdd Zdd Zdd Z  ZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    Nr;   Tmodsdepthdisplaycustom_mappingr)   c                    st   t    tdd | _|| _|| _d | _|d u ri }|d ur&tjddd i t	dd |
 D | _	t | _d S )Nc                   S   s   t tS r   )r   intr   r   r   r   <lambda>  s    z*FlopCounterMode.__init__.<locals>.<lambda>z<mods argument is not needed anymore, you can stop passing itr;   )
stacklevelc                 S   s*   i | ]\}}|t |d dr|nt|qS )_get_rawF)getattrr(   r   rA   vr   r   r   
<dictcomp>  s   * z,FlopCounterMode.__init__.<locals>.<dictcomp>)super__init__r   flop_countsr   r   modewarningswarnr   itemsr   mod_tracker)selfr   r   r   r   	__class__r   r   r     s   
zFlopCounterMode.__init__c                 C   s   t | jd  S )NGlobal)r   r   valuesr   r   r   r   get_total_flops  s   zFlopCounterMode.get_total_flopsc                 C   s   dd | j  D S )a  Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        c                 S   s   i | ]	\}}|t |qS r   )dictr   r   r   r   r     s    z3FlopCounterMode.get_flop_counts.<locals>.<dictcomp>)r   r   r  r   r   r   get_flop_counts  s   
zFlopCounterMode.get_flop_countsc           
         s  |d u rj }|d u rd}dd l}d|_g d}g }  t d fdd}tj D ]}|dkr;q4|d	d
 }||krGq4|||d
 }|	| q4djv roso|D ]
}	d|	d  |	d< q]|dd| }t
|dkrzg dg}|j||ddS )Ni?B r   T)ModuleFLOPz% TotalFc                    s   t j|   }| kO d| }g }|||  t|t| g j|   D ]\}}||d t| t|t| g q,|S )N z - )r   r   r  appendr   r   r   r   )mod_namer   r~   paddingr  rA   r   global_flopsglobal_suffixis_global_subsumedr   r   r   process_mod  s    z.FlopCounterMode.get_table.<locals>.process_modr  .r   r
  )r  0r   )leftrightr  )headerscolalign)r   tabulatePRESERVE_WHITESPACEr  r   sortedr   keyscountextendr   )
r   r   r  headerr  r  mod	mod_depth
cur_valuesr   r   r  r   	get_table  s6   
zFlopCounterMode.get_tablec                 C   s,   | j   | j  t| | _| j  | S r   )r   clearr   	__enter___FlopCounterModer   r  r   r   r   r%    s
   



zFlopCounterMode.__enter__c                 G   sH   | j d u r	td| j j| }d | _ | j  | jr"t| | j |S )Nz<Internal error: FlopCounter.__exit__ called but mode is None)r   r=   __exit__r   r   printr#  r   )r   r"   rJ   r   r   r   r'     s   

zFlopCounterMode.__exit__c                 C   sV   || j v r)| j | }||i |d|i}t| jjD ]}| j| |  |7  < q|S )Nr    )r   setr   parentsr   )r   func_packetr   r"   r#   flop_count_funcrk   parr   r   r   _count_flops
  s   

zFlopCounterMode._count_flops)Nr;   TNr   )__name__
__module____qualname____doc__r   nnr  re   r   boolr  r   r   r  r   r  r#  r%  r'  r.  __classcell__r   r   r   r   r     s.    
?
c                   @   s<   e Zd ZdZdeddfddZdd Zd	d
 ZdddZdS )r&  Tcounterr)   Nc                 C   s
   || _ d S r   )r6  )r   r6  r   r   r   r     s   
z_FlopCounterMode.__init__c                 C   s`   ddl }| | jj}|  || }W d   n1 sw   Y  | | jj}|| j_||fS )a  Execute a branch function and capture its FLOP counts without
        affecting self.counter.flop_counts

        Args:
            branch_fn: The branch function to execute
            operands: Arguments to pass to the branch function

        Returns:
            Tuple of (result, flop_counts) where result is the branch output
            and flop_counts is a copy of the FLOP counts after execution
        r   N)copyr6  r   )r   	branch_fnoperandsr7  checkpointed_flop_countsresultr   r   r   r   $_execute_with_isolated_flop_counting  s   
z5_FlopCounterMode._execute_with_isolated_flop_countingc                 C   s  |t jjjur	tS |t jjju r|\}}}}| ||\}	}
|	tu r$tS | ||\}}|tu r2tS t|
 t| B }i }|D ]4}|
| }|| }i }t| t| B }|D ]}||d}||d}t	||||< q\|||< qB|
 D ]\}}| jj| | q{|	S d S )Nr   )r   opshigher_ordercondNotImplementedr<  r)  r  getr   r   r6  r   update)r   functypesr"   r#   predtrue_branchfalse_branchr9  true_outtrue_flop_counts	false_outfalse_flop_countsall_mod_keysmerged_flop_counts	outer_keytrue_func_countsfalse_func_countsmerged_func_countsall_func_keysfunc_keytrue_val	false_val
inner_dictr   r   r   _handle_higher_order_ops-  s<   
z)_FlopCounterMode._handle_higher_order_opsr   c                 C   sX  |r|ni }|t jjjjt jjjjt jjjjt jjjjt jjjjt jjj	jt jjj
jt jjjjt jjjjt jjjjt jjjjt jjjjt jjjjt jjjjt jjjjhv rWtS t|t jjrf| ||||S || jjvr|t jjjjur|  |j|i |}|tur|W  d    S W d    n1 sw   Y  ||i |}| j|j|||S r   )r   r=  atensym_is_contiguousdefaultis_contiguousmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutr@  r   r+   HigherOrderOperatorrW  r6  r   r   	decomposer.  _overloadpacket)r   rC  rD  r"   r#   rr   r   r   r   __torch_dispatch__^  s<   













z#_FlopCounterMode.__torch_dispatch__)r   N)	r/  r0  r1  supports_higher_order_operatorsr   r   r<  rW  rm  r   r   r   r   r&    s    1r&  )Fr   )NNNFN)Xr   torch.utils._pytreer   r   r   module_trackerr   typingr   r   collections.abcr	   r
   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r   __all__r   r   r=  rX  r   r   r  __annotations__r(   r   mmr   rD   addmmrI   bmmrM   baddbmmrN   
_scaled_mmrU   re   r4  r^   convolution_convolutioncudnn_convolution_slow_conv2d_forwardconvolution_overrideablerd   convolution_backwardrn   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r   r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r   r   r   r   r   r   r   r&  r   r   r   r   <module>   s&  
*
&g6

96

7
  	

 