o
    灛iG                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 ddl
mZ dd Zd'd
dZ			d(ddZd)ddZG dd dZG dd dZdd Zd*ddZd*ddZdd  Zed+d#d$Zd*d%d&ZdS ),    N)contextmanager)AnyDictList   )languagec                 C   sL   d | } dddd|  dg}t|}|tjjd}dd |D }|S )	N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounitsc                 S   s   g | ]}t |qS  )int.0xr   r   P/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/triton/testing.py
<listcomp>   s    znvsmi.<locals>.<listcomp>)join
subprocesscheck_outputdecodesysstdoutencodingsplit)attrscmdoutretr   r   r   nvsmi
   s   

r      meanc                 C   s  ddl }|dv s
J |j |j krtd|   |dur0|D ]}|  |d d|_q!|j }|j	| |   W d   n1 sIw   Y  |j
  |jjdd}|jjdd}|  |  |  |j
  ||}	tdt||	 }
|j }|j	| t|
D ]}|dur|D ]}d|_q|   qW d   n1 sw   Y  |j
  g }d}t|D ]+}|jjdd}|jjdd}|  |  |  |j
  ||||
 g7 }q||}t||| S )	a+  
    Benchmark the runtime of the provided function.

    :param fn: Function to benchmark
    :type fn: Callable
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    r   Nminmaxr!   medianzQCannot capture graph in default stream. Please use side stream in benchmark code.Tenable_timingr   
   )torchcudacurrent_streamdefault_streamRuntimeErrordetach_requires_grad_grad	CUDAGraphgraphsynchronizeEventrecordreplayelapsed_timer$   r   rangetensorgetattritem)fnrepgrad_to_nonereturn_moder)   r   gstart_event	end_eventestimate_msn_repeatir   	n_retriestimesr   r   r   do_bench_cudagraph   sZ   








rH      d   Tr*   c                    s  |dv sJ ddl }|jj| |      |r&|jtd|j|d}	n|jtd|j|d}	 jdd}
 jdd}|
	  t
d	D ]	}|	  |   qE|	     |
|d	 }td
t|| }td
t|| } fddt
|D }
 fddt
|D }t
|D ]}|   qt
|D ]!}|dur|D ]}d|_q|	  |
| 	  |   || 	  q   |jdd t|
|D |jd}|dur|||j||jd }t|d
kr|d }|S t||| S )a  
    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
    the 20-th and 80-th performance percentile.

    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param quantiles: Performance percentile to return in addition to the median.
    :type quantiles: list[float]
    :param fast_flush: Use faster kernel to flush L2 between measurements
    :type fast_flush: bool
    r"   r   Ng    A)dtypedeviceg    ATr&      r   c                       g | ]} j d dqS Tr&   r4   r   rE   dir   r   r          zdo_bench.<locals>.<listcomp>c                    rN   rO   rP   rQ   rR   r   r   r      rT   c                 S   s   g | ]	\}}| |qS r   )r7   )r   ser   r   r   r      s    )rK   )r)   _dynamodevice_interfaceget_interface_for_devicer3   emptyr   int8r4   r5   r8   zero_r7   r$   r0   r9   zipfloatquantiletolistlenr:   r;   )r<   warmupr=   r>   	quantiles
fast_flushr?   device_typer)   cacherA   rB   _rC   n_warmuprD   rE   r   rG   r   r   rR   r   do_benchR   sN    ri    c                 C   sJ  dd l }dd l}t| |js|| } t||js||}|d u r$d}t|r-|| jn|}|d u r5d}t|r>|| jn|}t| |jrX| j|jkrP|  } | 	 
   } t||jrp|j|jkrh| }|	 
   }| jdksz|jdkr|jj| |||dd d S |j| |||dst| d|  d	| d
| d| d
d S )Nr   g{Gz?g        r   T)atolrtol	equal_nan)rk   rl    z is not close to z (atol=z, rtol=))numpyr)   
isinstanceTensorr9   callablerK   bfloat16r^   cpudetachsizetestingassert_allcloseallcloseAssertionError)r   yrk   rl   err_msgnpr)   r   r   r   assert_close   s4   

&r   c                   @   sl   e Zd ZdZ						ddee dee dedee d	ee d
edeeef dedededefddZ	dS )	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    rj   FNx_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                 C   sL   || _ || _|
| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _dS )a  
        Constructor.
        x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
        of scalars and there are multiple x_names, all arguments will have the same value.
        If x_vals is a list of tuples/lists, each element should have the same length as
        x_names.

        :param x_names: Name of the arguments that should appear on the x axis of the plot.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[Any]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
        :type args: Dict[str, Any]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        N)r   r   r   r   r   r   r   stylesr   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   colorr   r   r   r   __init__   s   .
zBenchmark.__init__)rj   rj   FFNN)
__name__
__module____qualname____doc__r   strr   r   boolr   r   r   r   r   r      s>    
	
r   c                	   @   s>   e Zd Zdd Z		ddedededefd	d
ZdddZdS )Markc                 C   s   || _ || _d S N)r<   
benchmarks)r   r<   r   r   r   r   r   
  s   
zMark.__init__F   bench	save_path
show_plots
print_datac              
      sr  dd l }dd lm}	 dd l}
|j}dd |jD }dd |jD }t|j}|
j|| | | d}|jD ] t	 tt
fsG fdd|D  t t|kr[tdt| d  tt| }g g g }}}|jD ]<}| jdi ||j|i|j|}z|\}}}W n ty   |d d }}}Y nw ||g7 }||g7 }||g7 }qmt | | | |jt|< q5|jrl|	  |	 }|d }t|jD ][\}}||d	  ||d
  }}|jr|j| d nd }|jr|j| d nd }|j|| || |||d |  s*|  s*|t}|t}|j|| ||d|d q|   |!|j"p6| |#|j$ |%|j&rFdnd |'|j(rQdnd |r[|	)  |rl|	*|j+,||j d |||j  }|r|j-d dkr|j./ \}}|| ||  |d< |rt0|jd  t0|1  |r|j2|j+,||j dd| ddd |S )Nr   c                 S      g | ]}| d qS )-minr   r   r   r   r   r         zMark._run.<locals>.<listcomp>c                 S   r   )-maxr   r   r   r   r   r     r   )columnsc                    s   g | ]} qS r   r   )r   rg   r   r   r   r     s    z	Expected z values, got r   r   r   )labelr   lsg333333?)alphar   loglinearz.png   Diff:z.csvz%.fF)float_formatindexr   )3osmatplotlib.pyplotpyplotpandasr   listr   	DataFramer   rq   tuplera   
ValueErrordictr]   r   r<   r   r   	TypeErrorlocr   figuresubplot	enumerater   plotisnullallastyper^   fill_betweenlegend
set_xlabelr   
set_ylabelr   
set_xscaler   
set_yscaler   showsavefigpathr   shaper   r`   print	to_stringto_csv)r   r   r   r   r   diff_colsave_precisionkwragsr   pltpdy_meany_miny_maxr   dfx_argsrow_meanrow_minrow_maxr|   r   axfirst_xrE   colstycol0col1r   r   r   _run  sz   


$

 

"z	Mark._runrj   c                 K   s   t | jt}|r| jgn| j}g }|r)tj|dd ttj|dd}	|	d |D ]}
|	| j
|
|||fi | |rH|	d|
j d q+|rT|	d |	  |r^|r\|d	 S |S d S )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )rq   r   r   r   makedirsopenr   r   writeappendr   r   close)r   r   r   r   	return_dfkwargshas_single_benchr   
result_dfshtmlr   r   r   r   runS  s(   

zMark.runN)Fr   )FFrj   F)	r   r   r   r   r   r   r   r   r   r   r   r   r   r     s    Er   c                        fdd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                    s
   t |  S r   )r   )r<   r   r   r   <lambda>r  s   
 zperf_report.<locals>.<lambda>r   )r   wrapperr   r   r   perf_reportk  s   r   c                 C   s^   ddl }ddlm} | s|j } |jj| d }|jj| d }|| d d d	 }|S )
z return DRAM bandwidth in GB/s r   Nr   drivermem_clock_ratemem_bus_widthr   g    .A   )r)   runtimer   r*   current_deviceactiveutilsget_device_properties)rL   r)   r   mem_clock_khz	bus_widthbw_gbpsr   r   r   get_dram_gbpsv  s   
r  c           	      C   s   dd l }ddlm} |s|j }|jj|d d }|j|}|d dk r2| |j	ks/J d}n+| |j
|jfv r=d}n | |j	|j|jfv rJd}n| |jtjtjtjfv rYd	}ntd
|| | d }|S )Nr   r   r   multiprocessor_count   r      i   i   dtype not supported&.>)r)   r   r   r*   r   r   r   r   get_device_capabilityfloat16float32int32rt   int16r[   tl
float8e4nvfloat8e4b15float8e5r-   	rK   
clock_raterL   r)   r   num_subcores
capabilityops_per_sub_coretflopsr   r   r   get_max_tensorcore_tflops  s$   
r  c                     r   )Nc                    s   t   fdd}|S )Nc            
         s   dd l }|t  }  | k}|rg|dkrgtjjd }tj	d dd}d|v s4J d|d j
jj}| d	j d
| d}tjddd|gd|d}	|	jdks\J ddt|	jv seJ d S | i | d S )Nr   zcuda-memcheck__file__PATH1)r  PYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]pytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodecallspecidr   r   r   
returncoder   r   )
r   r   r!  	ppid_namerun_cuda_memcheckr   r   test_idr   r   )target_kwargstest_fnr   r   r     s   z1cuda_memcheck.<locals>.decorator.<locals>.wrapper)	functoolswraps)r1  r   r0  )r1  r   	decorator  s   z cuda_memcheck.<locals>.decoratorr   )r0  r5  r   r4  r   cuda_memcheck  s   r6  F    c              
   c   s$   zzt g d t dddd|  d|  g t dddd| d| g tdgd	 }td
gd	 }t||  dk sEJ d|  dt|| dk sUJ d| dd|  }d| d }||fV  W t g d t g d t g d d S t g d t g d t g d w )N)r	   r
   r   -pmr  r	   r
   r   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryr(   zGPU SMs must run at z MHzg 3O?i   gMbP?)r	   r
   r   r9  r   )r	   r
   r   z-rgc)r	   r
   r   z-rmc)r   r   r   abs)ref_sm_clockref_mem_clockcur_sm_clockcur_mem_clockr  gbpsr   r   r   set_gpu_clock  s8     r@  c           	      C   s   dd l }ddlm} |s|j }|jj|d d }|j }|d dk r;| |j	kr/d}n#| |j
kr7d}ntd	| |j	krCd}n| |j
|jfv rNd}ntd	|| | d
 }|S )Nr   r   r   r  r  r       @   r  r  )r)   r   r   r*   r   r   r   r   r  r	  r  r-   rt   r  r   r   r   get_max_simd_tflops  s&   




rC  )r    Nr!   )rI   rJ   NNTr!   r*   )NNrj   r   )r7  r8  )r2  r   r   r   
contextlibr   typingr   r   r   rj   r   r  r   rH   ri   r   r   r   r   r  r  r6  r@  rC  r   r   r   r   <module>   s,    
	?

O%Bc

