o
    灛i)                     @   s   d dl Z d dlZd dlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ e  dd Zdd	 Zd
d Zdd Z	dddZdd ZdS )    N   )cdiv)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmic                  C   sT   z
t dgd d W S  ty)   dd l} |   | d}| || jd  Y S w )Nzclocks.max.smr   g     @@)r   FileNotFoundErrorpynvmlnvmlInitnvmlDeviceGetHandleByIndexnvmlDeviceGetMaxClockInfoNVML_CLOCK_SM)r
   handle r   ^/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/triton/ops/matmul_perf_model.pyget_clock_rate_in_khz   s   
r   c                 C   D   |t |d }tjj| d d }t ||| t|t |  }|S z# return compute throughput in TOPS    multiprocessor_count)minr   activeutilsget_device_propertiesr   r   devicenum_ctas	num_warpsdtypetotal_warpsnum_subcorestflopsr   r   r   get_tensorcore_tflops   s   r#   c                 C   r   r   )r   r   r   r   r   r   r   r   r   r   r   get_simd_tflops    s   r$   c                 C   s>   t j| }|d dk r|t jkrt| |||S t| |||S )Nr      )torchcudaget_device_capabilityfloat32r$   r#   )r   r   r   r   
capabilityr   r   r   
get_tflops(   s   r+   Fc           +      K   s  t j }|j}| }t||}t||	}|}|| | }t||t||	}}d| | | d }t||| |}|| }tj	j
|d }td|| }td|d }ttd|d d d}t||d |d	   }|d
 }|| | dd|d    }|| | d |d  }|| | dd|d    } || | d |d  }!||  d }"||! d }#|"| |#|  }$|d }%|| | | d }&|dkr|&|% }'n|%}(|&|( }'|| d d |% })|'|)7 }'t||$|' }*|rtd|* d| d|$ d|' d|d  d |*S )zO return estimated running time in ms
          = max(compute, loading) + store r   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r&   r'   current_devicer   element_sizer   maxr+   r   r   r   r   r   r   print)+r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   dtsize	num_cta_m	num_cta_n	num_cta_kr   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bwstore_c_dramstore_ms	reduce_bwzero_mstotal_time_msr   r   r   estimate_matmul_time/   sT   





r\   c                    s  t j }t j }|d  }|d j}g }| D ]2}|j}	|	d |	d |	d |jf\}
}}}tj	j
|d }|
| | | | }||krK|| q|} |t jt jfvr]dd | D } i }| D ]9}|j}	|	d |	d |	d |	d |j|jf\}
}}}}}|
||||f}||v r|| ||f qa||fg||< qag }| D ]O\}}|\}
}}}}|d	 d
kr|
| | d }|td| d
 }d}||  tjd| fddd}|D ]	}||d	  qq|d	 d	 }d|_|| q|S )Nr6   r<   r=   r>   max_shared_memc                 S   s   g | ]}|j d  dkr|qS )r?   r,   )rA   ).0configr   r   r   
<listcomp>   s    z&early_config_prune.<locals>.<listcomp>r?   r   r%   i   r   i,  r   c                    s0   | d   dk rdt | d    S | d   S )Nr,   r   
   )abs)xoptimal_num_stagesr   r   <lambda>   s   z$early_config_prune.<locals>.<lambda>)key)r&   r'   r1   r(   r2   r   rA   r5   r   r   r   r   appendfloat16r)   r   itemsr   heapq	nsmallest)configs
named_argsrA   r   r*   rB   r   pruned_configsr_   kwr<   r=   r>   r5   max_shared_memoryrequired_shared_memoryconfigs_mapr?   r   rg   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configr   rd   r   early_config_prunep   sX   




"r|   )F)	functoolsrk   r&    r   runtimer   testingr   r   r   r   	lru_cacher   r#   r$   r+   r\   r|   r   r   r   r   <module>   s    
	
A