o ¡…›iP†ã@sxUddlZddlmZmZmZddlmZddlmZm Z ddl mZddl mZddl mZdd lmZdd lmZddlmZddlmZddlZd dgZe dƒZedƒZejjZdd„ZiZeeefe d<dd„Z!dVdeeeefgeeefffdd„Z"e"ej#ƒddœde$fdd„ƒZ%e"ej&ƒdWde$fdd„ƒZ'e"ej(ƒdWde$fdd„ƒZ)e"ej*ƒdWde$fd d!„ƒZ+e"ej,ƒ dXde$fd"d#„ƒZ- dVd$e.e$d%e.e$d&e.e$d'e/de$f d(d)„Z0e"ej1ej2ej3ej4ej5gƒddœde$fd*d+„ƒZ6e"ej7ƒde$fd,d-„ƒZ8d.d/„Z9e"ej:ej;ejdd4œdee?e?e$d5fe?e$d5fe?e$d5fe?e$d5fdBffd6d7„Z@dd4œdee?e?e$d5fe?e$d5fe?e$d5fe?e$d5fdBffd8d9„ZAe"ejBd:d;ddœde$fdd?„ƒZEd@dA„ZFe"ejGejHejIgƒddœde$fdBdC„ƒZJe"ejKd:d;de$fdDdE„ƒZLe"ejMd:d;de$fdFdG„ƒZNiej#e%“ej&e'“ej(e)“ej*e+“ej,e-“ej1e6“ej2e6“ej3e6“ej5e6“ej4e6“ej7e8“ej:e=“ej;e=“ej.nfr©r%r&rr$rÚ shape_wrappersr(FÚreturncs,dtttfdtttff‡‡fdd„}|S)NÚflop_formular)cs.ˆstˆƒ‰d‡fdd„}tjj |ˆ¡ˆS)Nr)csHt|tjjƒstd|›dt|ƒ›ƒ‚|tvrtd|›ƒ‚ˆt|<dS)Nzlregister_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), got z which is of type zduplicate registrations for )rrÚ_opsÚOpOverloadPacketÚ ValueErrorÚtyperÚRuntimeError)Útarget©r*rrÚregister)sþþÿz=register_flop_formula..register_fun..register)r)N)r(rÚutilsÚ_pytreeÚ tree_map_)r*r2©Úget_rawÚtargetsr1rÚregister_fun%s z+register_flop_formula..register_fun)r rr)r8r7r9rr6rr$s()r!c Os<|\}}|\}}||krtd|›d|›ƒ‚||d|S)zCount flops for matmul.z3matmul: inner dimensions must match (k == k2), got ú and é©ÚAssertionError) Úa_shapeÚb_shaper!r"r#ÚmÚkÚk2ÚnrrrÚmm_flop:s rDcKó t||ƒS)zCount flops for addmm.©rD©Ú self_shaper>r?r!r#rrrÚ addmm_flopFs rIcKsd|\}}}|\}}} ||krtd|›d|›ƒ‚||kr&td|›d|›ƒ‚||| d|} | S)z"Count flops for the bmm operation.z0bmm: batch dimensions must match (b == b2), got r:z0bmm: inner dimensions must match (k == k2), got r;r<)r>r?r!r#Úbr@rAÚb2rBrCÚfloprrrÚbmm_flopKs rMcKrE)z&Count flops for the baddbmm operation.)rMrGrrrÚbaddbmm_flopZs rNc Ks t||ƒS)zCount flops for _scaled_mm.rF) r>r?Ú scale_a_shapeÚ scale_b_shapeÚ bias_shapeÚscale_result_shapeÚ out_dtypeÚuse_fast_accumr!r#rrrÚ_scaled_mm_flopas rUÚx_shapeÚw_shaper!Ú transposedc CsL|d}|r|n|dd…}|^}}} t|ƒt|ƒ|||d} | S)aCount flops for convolution. Note only multiplication is counted. Computation for bias are ignored. Flops for a transposed convolution are calculated as flops = (x_shape[2:] * prod(w_shape) * batch_size). Args: x_shape (list(int)): The input shape before convolution. w_shape (list(int)): The filter shape. out_shape (list(int)): The output shape after convolution. transposed (bool): is the convolution transposed Returns: int: the number of flops rr;Nr) rVrWr!rXÚ batch_sizeÚ conv_shapeÚc_outÚc_inÚfilter_sizerLrrrÚconv_flop_countrs r^c Ost||||dS)zCount flops for convolution.©rX)r^) rVrWÚ_biasÚ_strideÚ_paddingÚ _dilationrXr!r"r#rrrÚ conv_flop˜srdcCs–dd„}d} | drt|dƒ}| t||||ƒ7} | drIt|dƒ}|r9| t||ƒ||ƒ||ƒdd7} | S| t||ƒ||ƒ||ƒdd7} | S)NcSs |d|dgt|dd…ƒS)Nrrr;)Úlist)rrrrÚt²s zconv_backward_flop..trrFr_)rr^)Úgrad_out_shaperVrWr`rarbrcrXÚ_output_paddingÚ_groupsÚoutput_maskr!rfÚ flop_countÚgrad_input_shapeÚgrad_weight_shaperrrÚconv_backward_flop£sF þrncCsÌ|\}}}}|\}}} } |\}}} }||kr|kr¸ó € ÿ ÿz0_flash_attention_forward_flop..©rªÚsum)r–r—r˜r™ršr›rœr!r"r#ÚsizesrrrÚ_flash_attention_forward_flopžóù þr½c Os(t|||||||d} tdd„| DƒƒS)r€)r–r—r˜r¬rr®r¯csrµrrr¶rrrr¸Ør¹z4_efficient_attention_forward_flop..©r´r»)r–r—r˜Úbiasr¬rr®r¯r"r#r¼rrrÚ!_efficient_attention_forward_flop¾r¾rÁcCsfd}|\}}}}|\} } }}|\} }}}|\}}}}|| kr)| kr)|krFntdƒ‚|| kr=|kr=|krFntdƒ‚||ksJtdƒ‚||krV||krV||ksZtdƒ‚d}|t||||f||||fƒ7}|t||||f||||fƒ7}|t||||f||||fƒ7}|t||||f||||fƒ7}|t||||f||||fƒ7}|S)NrzFsdpa_backward_flop_count: batch/heads/dimension mismatch among tensorszJsdpa_backward_flop_count: grad_out/value/key/query shapes are incompatiblero)rgrprqrrr~rJrsrtrurvrwrxryrzr{r|r}Ú_b4Ú_h4Ú_s4Ú_d4rrrÚsdpa_backward_flop_countÞs( ÿÿ"""""rÆcOst||||ƒS)z(Count flops for self-attention backward.©rÆ)rgrprqrrr!r"r#rrrÚsdpa_backward_flopûsrÈc Oó*t|||||||| d}tdd„|DƒƒS)N)r–r—r˜rr™ršr›rœcsó&|]\}}}}t||||ƒVqdSrrÇ©r·rprqrrrgrrrr¸ó € ÿ ÿz1_flash_attention_backward_flop..rº) rr–r—r˜ÚoutÚ logsumexpr™ršr›rœr"r#ÚshapesrrrÚ_flash_attention_backward_flopóø þrÐc OrÉ)N)r–r—r˜rr¬rr®r¯csrÊrrÇrËrrrr¸>rÌz5_efficient_attention_backward_flop..r¿) rr–r—r˜rÀrÍr¬rr®r¯r"r#rÏrrrÚ"_efficient_attention_backward_flop#rÑrÒcCst|tƒs|fS|Sr)rÚtuple)ÚxrrrÚnormalize_tuple\s rÕ)ÚÚKÚMÚBÚTcCs0tdtttƒdtt|ƒƒddƒƒ}t|S)Nrrr;r)ÚmaxÚminr”ÚsuffixesÚstr)ÚnumberÚindexrrrÚget_suffix_stres(rácCs&t |¡}|d|d›}|t|S)Nièz.3f)rÝrà)rßÚsuffixràr˜rrrÚconvert_num_with_suffixls rãcCs|dkrdS||d›S)Nrú0%z.2%r)ÚnumÚdenomrrrÚconvert_to_percent_strssrçcstˆƒ‡fdd„ƒ}|S)Ncst|ƒ\}}ˆ|Ž}t||ƒSr)rr)r"Ú flat_argsÚspecrÍr$rrr&ys z)_pytreeify_preserve_structure..nfrr'rr$rÚ_pytreeify_preserve_structurexsrêcs®eZdZdZ ddejjeejjBdBdede de eefdBd df ‡fd d„ Zd efdd „Z d e ee eefffdd„Zddd„Zdd„Zdd„Zdd„Z‡ZS)raþ ``FlopCounterMode`` is a context manager that counts the number of flops within its context. It does this using a ``TorchDispatchMode``. It also supports hierarchical output by passing a module (or list of modules) to FlopCounterMode on construction. If you do not need hierarchical output, you do not need to use it with a module. Example usage .. code-block:: python mod = ... with FlopCounterMode(mod) as flop_counter: mod.sum().backward() Nr;TÚmodsÚdepthÚdisplayÚcustom_mappingr)csttƒ ¡tdd„ƒ|_||_||_d|_|duri}|dur&tjdddit ¥dd„| ¡Dƒ¥|_ tƒ|_dS)NcSsttƒSr)rÚintrrrrÚsz*FlopCounterMode.__init__..z§s*z,FlopCounterMode.__init__..) ÚsuperÚ__init__rÚflop_countsrìríÚmodeÚwarningsÚwarnrÚitemsrÚmod_tracker)Úselfrërìrírî©Ú __class__rrrø–s ÿþzFlopCounterMode.__init__cCst|jd ¡ƒS)NÚGlobal)r»rùÚvalues©rÿrrrÚget_total_flops«szFlopCounterMode.get_total_flopscCsdd„|j ¡DƒS)aReturn the flop counts as a dictionary of dictionaries. The outer dictionary is keyed by module name, and the inner dictionary is keyed by operation name. Returns: Dict[str, Dict[Any, int]]: The flop counts as a dictionary. cSsi|] \}}|t|ƒ“qSr)Údictrôrrrrö¸sz3FlopCounterMode.get_flop_counts..)rùrýrrrrÚget_flop_counts®s zFlopCounterMode.get_flop_countsc s|durˆj}|dur d}ddl}d|_gd¢}g}ˆ ¡‰tˆƒ‰d‰‡‡‡‡fdd„}tˆj ¡ƒD]}|dkr;q4| d ¡d }||krGq4|||d ƒ}| |¡q4dˆjvroˆso|D] } d| d| d<q]|ddƒ|}t |ƒdkrzgd¢g}|j||d dS)Ni?BrT)ÚModuleÚFLOPz% TotalFcsŽtˆj| ¡ƒ}ˆ|ˆkO‰d|}g}| ||t|ˆƒt|ˆƒg¡ˆj| ¡D]\}}| |dt|ƒt|ˆƒt|ˆƒg¡q,|S)Nú z - )r»rùrÚappendrãrçrýrÞ)Úmod_namerìr~ÚpaddingrrArõ©Úglobal_flopsÚ global_suffixÚis_global_subsumedrÿrrÚprocess_modÊs ýýz.FlopCounterMode.get_table..process_modrÚ.rr )rÚ0rä)ÚleftÚrightr)ÚheadersÚcolalign)rìÚtabulateÚPRESERVE_WHITESPACErráÚsortedrùÚkeysÚcountÚextendr”) rÿrìrÚheaderrrÚmodÚ mod_depthÚ cur_valuesr˜rrrÚ get_tableºs6 zFlopCounterMode.get_tablecCs,|j ¡|j ¡t|ƒ|_|j ¡|Sr)rùÚclearrþÚ __enter__Ú_FlopCounterModerúrrrrr%ùs zFlopCounterMode.__enter__cGsH|jdur tdƒ‚|jj|Ž}d|_|j ¡|jr"t| |j¡ƒ|S)Nzs& *÷õüÿþýü û&üôgþû6 ö9û6 ö7 ÷õõþ ó ó ÿþýüûúùø ÷ öõô óòñðïë