o
    iT                     @   sL  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ e
r6d dlmZ dd	 d
fdefddZe jedd	 ddZe jedd	 d
dZeG dd dZeG dd dZG dd dZG dd dZd*ddZdd	 d dfddZdd  Zd+d"d#ZeG d$d% d%ZeG d&d' d'Zd(d) Z dS ),    N)deque)	dataclass)AnyLiteralTYPE_CHECKINGprofile)
DeviceType)_KinetoEventc                 C      | j S N)childrenx r   W/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/torch/profiler/_utils.py<lambda>       r   Freversec                 c   sX    |rt ndd }t|| }|r*||}|V  |||D ]}|| q|sd S d S )Nc                 S      | S r   r   r   r   r   r   r          z_traverse.<locals>.<lambda>)reversedr   append)treenext_fnchildren_fnr   order	remaining
curr_eventchild_eventr   r   r   	_traverse   s   r    c                 C      |   S r   )popr   r   r   r   r          T)r   r   c                 C   r!   r   )popleftr   r   r   r   r      r#   c                   @   sJ   e Zd ZU dZeed< dZeed< dZeed< dZeed< e	dd Z
dS )	EventMetricsr   duration_time_nsself_time_nsidle_time_nsqueue_depthc                 C   s   | j dkrdS | j| j  S )Nr   g        )r&   r(   selfr   r   r   fraction_idle_time(   s   
zEventMetrics.fraction_idle_timeN)__name__
__module____qualname__r&   int__annotations__r'   r(   r)   propertyr,   r   r   r   r   r%   !   s   
 r%   c                   @   s*   e Zd ZU eed< eed< dZeed< dS )Intervalstartendr   r)   N)r-   r.   r/   r0   r1   r)   r   r   r   r   r3   /   s   
 r3   c                   @   sF   e Zd ZdddZdd Zdd Zdefd	d
Zdee	 fddZ
dS )EventKeyreturnNc                 C   s
   || _ d S r   event)r+   r9   r   r   r   __init__7      
zEventKey.__init__c                 C   s   t | jjS r   )hashr9   idr*   r   r   r   __hash__:   s   zEventKey.__hash__c                 C   s   | j j|j jkS r   )r9   r=   )r+   otherr   r   r   __eq__=   s   zEventKey.__eq__c                 C   s
   | j j S r   )r9   namer*   r   r   r   __repr__@   r;   zEventKey.__repr__	intervalsc           	      C   s   d}t |dd d}|r*t| jj|d j}t| jj|d j}||k r*||| 7 }d\}}|t|k rw|| }|| }|d7 }|j|jkrW|j|jkrQ|d7 }q.|j|_|}t| jj|j}t| jj|j}||k rq||| 7 }|t|k s4|S )Nr   c                 S   r   r   r4   r   r   r   r   r   E   r   z,EventKey.intervals_overlap.<locals>.<lambda>key)r      rG   )	sortedmaxr9   start_time_nsr4   minend_time_nsr5   len)	r+   rC   overlap_timeoverlap_startoverlap_endijprev_intervalcurr_intervalr   r   r   intervals_overlapC   s0   zEventKey.intervals_overlapr7   N)r-   r.   r/   r:   r>   r@   strrB   listr3   rU   r   r   r   r   r6   6   s    
r6   c                   @   sV   e Zd ZdeddfddZdddZdd	 Zdd
dZdd Zdde	de
fddZdS )BasicEvaluationprofr7   Nc                 C   sZ   || _ i | _|   t| j dd d| _dd | jD | _g | _|  | _	| 
  d S )Nc                 S   s   | j jS r   )r9   rJ   r   r   r   r   r   j   r#   z*BasicEvaluation.__init__.<locals>.<lambda>rE   c                 S      g | ]}|j qS r   r8   .0er   r   r   
<listcomp>l       z,BasicEvaluation.__init__.<locals>.<listcomp>)r   metricscompute_self_timerH   keys
event_keyseventscuda_eventscompute_queue_depthqueue_depth_listcompute_idle_time)r+   rZ   r   r   r   r:   e   s   
zBasicEvaluation.__init__c                 C   s   | j jdusJ t| j j }|rS| }|j}|jD ]}||j8 }|| qt|| j	vs<J d|j
 d|j t|d| j	t|< |j| j	t| _|sdS dS )zM
        Computes event's self time(total time - time in child ops).
        NzDuplicate id: z, )r'   )r   kineto_resultsr   experimental_event_treer"   r&   r   r   r6   ra   r=   rA   r%   )r+   stackr   	self_timer   r   r   r   rb   q   s$   

z!BasicEvaluation.compute_self_timec                    s2  | j jdusJ | j j }dd dd tfdd|D dd	 d
}tfdd|D dd	 d
}t|| dd	 d
| _i }d}|D ] t| fdd	|d}|| < |dur\|n|}qEd}d}|| | j }	dd }
g }|	j|
d
 |	D ]}t|dr| d }| |	  d }||v r|| dur|| }t|dr|
 }|
 |  }||v r|| dur|| }nt|dr|j}|j}|t|k r|| 
 |kr|d7 }|t|k r|| 
 |ks|| d }t|d}t|dst|dr|t||| qxt|dr|| jt| _qx|S )z
        Computes queue_depth at each event. This will calculate the queue depth data for
        All the events in the tree.
        This will return a list of Interval of queue depth data of cuda launch and kernels.
        Nc                    s.   h d}t t| d|  t fdd|D S )z+Check if the event is a CUDA launch kernel.>   cudaLaunchKernel__cudaLaunchKernelcudaLaunchKernelExCcudaLaunchCooperativeKernel&cudaLaunchCooperativeKernelMultiDevicerA   c                 3   s    | ]}  |V  qd S r   )
startswithr]   patternrA   r   r   	<genexpr>   s    zUBasicEvaluation.compute_queue_depth.<locals>.is_cuda_launch_kernel.<locals>.<genexpr>)rW   getattrany)r^   launch_patternsr   rv   r   is_cuda_launch_kernel   s   zBBasicEvaluation.compute_queue_depth.<locals>.is_cuda_launch_kernelc                    sF   |   tjkr	dS tt| d|   h d}t fdd|D  S )z,Check if the event is a CUDA runtime kernel.FrA   >   cpymemfreeallocc                 3   s    | ]}| v V  qd S r   r   rt   rv   r   r   rw      s    zNBasicEvaluation.compute_queue_depth.<locals>.is_cuda_kernel.<locals>.<genexpr>)device_typer	   CUDArW   rx   lowerry   )r^   exclude_patternsr   rv   r   is_cuda_kernel   s
   z;BasicEvaluation.compute_queue_depth.<locals>.is_cuda_kernelc                 3       | ]	} |r|V  qd S r   r   r\   )r{   r   r   rw          z6BasicEvaluation.compute_queue_depth.<locals>.<genexpr>c                 S   r!   r   start_nsr   r   r   r   r      r#   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>rE   c                 3   r   r   r   r\   )r   r   r   rw      r   c                 S   r!   r   r   r   r   r   r   r      r#   c                 S   r!   r   r   r   r   r   r   r      r#   r   c                    s   |      kS r   )linked_correlation_idr   )cuda_launch_eventr   r   r      s    rD   c                 S   s@   t | dr|  d S t | dr|  S t | dr| jS td)Nstart_us  r   rJ   zUnknown Event Type)hasattrr   r   rJ   	Exceptionr8   r   r   r   new_old_event_comparator   s   


zEBasicEvaluation.compute_queue_depth.<locals>.new_old_event_comparatorr   r   r   rJ   rG   )r   rj   re   rH   rf   index_of_first_matchsortr   r   duration_usr   duration_nsrJ   rL   rM   rI   r   r3   ra   r6   r)   )r+   cuda_event_listcuda_launch_eventscuda_kernel_eventskernel_mappinglast_mapped_kernelindexcurrent_kernel_indexspawned_kernel_index
all_eventsr   rh   r9   
start_timeend_timecurrent_queue_depthr   )r   r   r{   r   rg      sz   
	




z#BasicEvaluation.compute_queue_depthc                 C   s   d}d}g }| j r(| jr(|t| jd j| j d jt| j d j| jd jg7 }| j D ] }|jdkr9|s9|j}d}|jdkrK|rK|t||j d}q+dd | j	D }|D ]}t
||| j	t
| _qVdS )z4
        Computes idle time of the profile.
        Fr   r   Tc                 S   r[   r   r8   r\   r   r   r   r_     r`   z5BasicEvaluation.compute_idle_time.<locals>.<listcomp>N)rh   re   r3   rJ   r4   r5   rL   r)   r   ra   r6   rU   r(   )r+   idle
idle_startidle_intervals
data_point
event_listr9   r   r   r   ri      s0   
z!BasicEvaluation.compute_idle_timec                    s  ddl }ttj}dd |D }d d}g d}|t|k ru||  kr+|d7 }qt|d t|D ]6}t| fdd|d	}t|||d
}	|	durj||	 |krjt	||	 j
|| j
 |durf|n|} nq4|d7 }|t|k s fddjD }
|
r|jfdd|
D |jd}|jfdd|
D |jd}||| || }||| || }|d|  }dd tt||
ddtdddD }
|
d| }
|
S )a  
        Filter and Rank the events based on some heuristics:
        1) Events that are in the falling phase of the queue depth.
        2) Events that have a high idle_time, self_time difference.

        Parameters:
            length: The number of events to return.
        r   Nc                 S   r[   r   )r)   r\   r   r   r   r_      r`   z/BasicEvaluation.rank_events.<locals>.<listcomp>   rG   c                    s   |  kS r   r   r   )bottom_threasholdr   r   r   .  r#   z-BasicEvaluation.rank_events.<locals>.<lambda>rD   )r4   r5   c                    s   g | ]	}|  r|qS r   )rU   r]   r9   )decrease_intervalr   r   r_   =  s    c                       g | ]} j | jqS r   )ra   r'   r   r*   r   r   r_   D      )dtypec                    r   r   )ra   r,   r   r*   r   r   r_   H  r   g333333?c                 S   s   g | ]\}}|qS r   r   )r]   _r9   r   r   r   r_   P  s    T)strict)rF   r   )torchrX   r   rh   rM   ranger   argmaxr   r3   r4   ra   tensorfloat32meanstdrH   zipoperator
itemgetter)r+   lengthr   rh   	qd_valuestop_threasholdrQ   rR   next_minimum_idxpeak_idxr   rm   	idle_timenormalized_gainnormalized_selfheuristic_score_listr   )r   r   r+   r   rank_events  sf   
zBasicEvaluation.rank_eventsrG   Tr   print_enablec                    sJ     |}|s	|S |rdnd}|d fdd|D 7 }|r#t| |S )NzOptimizable events:
zNo events to optimize

c                    s@   g | ]}d  d| dt |j d j| jd ddd  	qS )zP--------------------------------------------------------------------------------z
Event:                z
Source code location: z
Percentage idle time: d   z.2fz%
)source_code_locationr9   ra   r,   r   r*   r   r   r_   b  s    z:BasicEvaluation.get_optimizable_events.<locals>.<listcomp>)r   joinprint)r+   r   r   r   outputr   r*   r   get_optimizable_events[  s   


z&BasicEvaluation.get_optimizable_eventsrV   )rG   T)r-   r.   r/   r   r:   rb   rg   ri   r   r0   boolr   r   r   r   r   rY   d   s    

oIrY   c                 C   sD   |d u s
|t | krt | }t||D ]}|| | r|  S qd S r   )rM   r   )seq	predicater4   r5   rQ   r   r   r   r   p  s   r   c                 C   r   r   r   r   r   r   r   r   y  r   c                 C   s2   | || } t | dkrd S | t| |d| S )Nr   rE   )rM   r   rI   )r   rF   r4   r5   r   r   r   r   y  s   r   c                 C   s0   | d urt d| j}|d u r| j} q | jS dS )Nz
\.py\(.*\)zNo source code location found)researchrA   parent)r9   matchr   r   r   r     s   r   r7   c                  C   s8   ddl m}  |  	 W d    d S 1 sw   Y  d S )Nr   r   )torch.autograd.profilerr   r   r   r   r   _init_for_cuda_graphs  s   "r   c                   @   sV   e Zd ZU dZeed< ed ed< ed dB ed< eeB dB ed< eee	f ed	< dS )
TimelineEventz-Represents an event in the profiler timeline.	timestamp)r4   r5   regular
event_typefilenamenodeNmarker_type
identifierr9   )
r-   r.   r/   __doc__r0   r1   r   rW   dictr   r   r   r   r   r     s   
 r   c                   @   sF   e Zd ZU dZed ed< eeB ed< edB ed< dZ	edB ed< dS )ContextStackEntryz5Represents a context (filename or node) in the stack.r   context_typer   Nmetadatatid)
r-   r.   r/   r   r   r1   rW   r0   r   r   r   r   r   r   r     s   
 r   c              	      s  ddl m} | dg }g  dd } fdd}|D ]I}d|vs$d	|vr%q||rT|d
 dd }|dr=|d|| qzt|}W n	 tyL   Y nw |d|| q|d } t|ddd| q jdd d g }	 D ]}
|
j	 dkr |
j
dusJ |
jdkrt|
j
tsJ ||
j
}|
jd}|	td|
j
|| qp|
jdkrd}|
jd}t|	D ]}|jdkr|j|kr|j} nq|r|di }|
j
|v r||
j
 }|	td|
j
|| qp dkr tt|	d ddD ]}|	| }|
j|jkr|
j
|j
kr|	|  nqqpdkred}d}|
jd}t|	D ]#}|j|krG|jdkrG|jrG|jdd}|jd
d} nq%|sO|rd|
jdi }|r]||d< |rd||d< 	 qpdS ) an  
    Maps recorded profiler events to their corresponding fx nodes and adds stack traces.

    Builds a timeline of all events (regular ops and FX markers for filenames/nodes),
    sorts by timestamp, then processes chronologically while maintaining a context stack of active
    filename/node scopes. Regular events are augmented with stack traces and node names from the
    innermost active context. Runtime is O(n log n) for n events.

    Args:
        traced_data: Json of profiler events from Chrome trace

    Returns:
        Dict mapping recorded event names to their aten operations with added stack traces
    r   )_FX_METADATA_REGISTRYtraceEventsc                 S   s2   |  ddko|  dddo|  dddS )Ncatcpu_oprA    z## z ##)getrs   endswithr8   r   r   r   is_fx_marker_event  s
   zLmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.is_fx_marker_eventc                    sD   |d }||d  }  t|d| ||   t|d| || d S )Ntsdurr4   r5   )r   r   )r   r   r9   start_tsend_tsevent_timeliner   r   append_fx_marker_event  s   zPmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.append_fx_marker_eventr   r   rA      z.pyr   r   r   Nc                 S   r   r   )r   r   r   r   r   r     r   zBmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.<lambda>rE   r4   r   node_metadatar5   rG   r   stack_tracezNo model stack trace availabler   args	node_name)torch.fx.tracebackr   r   r   r0   
ValueErrorr   r   r   r   r   r   
isinstancerW   r9   r   r   r   r   r   r   rM   r"   
setdefault)traced_datar   trace_eventsr   r   r9   content
node_indexr   context_stacktimeline_eventr   r   current_file_metadata	ctx_entryr   	node_metarQ   current_stack_tracecurrent_node_name	event_tidr   r   r   r   0map_recorded_events_to_aten_ops_with_stack_trace  s   











r  )r   NrV   )!	functoolsr   r   collectionsr   dataclassesr   typingr   r   r   r   r   torch.profilerr	   torch.autogradr
   r   r    partialtraverse_dfstraverse_bfsr%   r3   r6   rY   r   r   r   r   r   r   r  r   r   r   r   <module>   s@   

.  
	

	