o
    灛i)$                     @   s  d dl Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
mZ e je je je jgZdd Zd	d
 Zdd Zdd Zeeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddge  g de
edddeddd ied ejd!ejd"ejd#ejd$ejd%ejd&ejd'ejdejd(ejfd)d*ZG d+d, d,e jjZejZdS )-    N   )Configautotunecdiv
heuristicsjit)language   )early_config_pruneestimate_matmul_timec                 C   s   dt | v r	tjS | S )Nfp8)strtorchfloat16)a r   S/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/triton/ops/matmul.pyupcast_if_fp8
   s   r   c                 C   sb   t | } t |}| |u r| S | tv sJ |tv sJ tD ]}| |u r&|  S ||u r.|   S qd S N)r   _ordered_datatypes)r   bdr   r   r   get_higher_dtype   s   r   c                    s    fddS )Nc                    s   |     S r   )zero_)nargsnamer   r   <lambda>!   s    zinit_to_zero.<locals>.<lambda>r   r   r   r   r   init_to_zero    s   r   c                  C   s   g } dD ]?}dD ]:}dD ]5}dD ]0}|dkrdnd}|  t|||dd	||d
 dD ]}|  t||||d	||tdd q+qqqq| S )N)r               )       )r$   @   )r$   r%         r%   r   r    r	   BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r       r#   C)r.   r/   pre_hook)appendr   r   )configsr.   block_mblock_kblock_nr/   split_kr   r   r   get_configs_io_bound$   s,   
r9   r&   r'   r$   r(   r   r0   r-   r%   r    r!   )MNK
   )r
   
perf_modeltop_k)r4   keyprune_configs_byEVEN_Kc                 C   s   | d | d | d   dkS )Nr<   r+   r,   r   r   )argsr   r   r   r   U   s    r   	acc_dtypeinput_precisionfp8_fast_accumr)   r*   r+   GROUP_Mr,   AB_DTYPEc           +      C   s  t d}t d}t ||}t ||}|| }|| }t|||  |}|| ||  }|| | }|| t d| }|| t d| } t t || ||}!t t | | ||}"|| t d| }#| |!d d d f | |#d d d f |   } ||#d d d f | |"d d d f |	   }t j||f|d}$tdt ||| D ]w}%|rt 	| }&t 	|}'n1||%||   }(t jd|j
jd})t j	| |#d d d f |(k |)d}&t j	||#d d d f |(k |)d}'|d ur|&|}&|'|}'|rt j|&|'|$||d}$n|$t j|&|'||d7 }$| || | 7 } ||| | 7 }q|$|j
j}$|| t d| }|| t d| } ||d d d f |
 | d d d f |   }||k d d d f | |k d d d f @ }*|dkrwt j||$|*d d S t j||$|*d d S )Nr   r	   )dtype)r	   r	   )maskother)	out_dtyperE   )rJ   )tl
program_idr   minarangemax_contiguousmultiple_ofzerosrangeloadrI   
element_tytodotstore
atomic_add)+ABr1   r:   r;   r<   	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnrD   rE   rF   r)   r*   r+   rG   r,   rB   rH   pidpid_zgrid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnramrbnrkacckr   r   k_remaining_0rJ   r   r   r   _kernel6   sR   
-
,,
  

,(
ru   c                   @   s.   e Zd ZeZi Zedd ZedddZdS )_matmulc                    s  | j }| ddkr| ddkr|  } |ddkr'|ddkr'| }| jd |jd ks5J d| j\ }|j\}t| j|j}	|d u rL|	}tj f||d}
tjtj	tjftj
tj	tj
ftj	tj	ftjtjfi}|d u r{||	 d }n t|tjsJ d||| j v sJ d|||j v sJ ddd	 }||}||	}	||}| jtjtjfv r|jtjtjfv rd }	 fd
d}t| | ||
 || d| d|d|d|
d|
d|||d|	d |
S )Nr   r	   zincompatible dimensions)devicerI   zacc_dtype must be a torch.dtypez+acc_dtype not compatible with the type of az+acc_dtype not compatible with the type of bc                 S   s   t tt| dd S )N.)getattrrM   r   split)tyr   r   r   
to_tl_type   s   z!_matmul._call.<locals>.to_tl_typec                    s$   t  | d t | d  | d fS )Nr)   r*   r,   )r   )METAr:   r;   r   r   r      s   $ z_matmul._call.<locals>.<lambda>r0   )rD   rE   rF   rG   rH   )rw   stride
contiguousshaper   rI   r   emptyr   float32bfloat16int8int32
isinstancerM   
float8e4nvfloat8e5ru   )r   r   rD   rE   rF   output_dtyperw   r<   _ab_dtypecsupported_acc_dtypesr}   gridr   r   r   _call   sL   

$	z_matmul._callNTc                 C   s   t j||||||dS )N)rD   rE   rF   r   )rv   r   )ctxr   r   rD   rE   rF   r   r   r   r   forward   s   z_matmul.forward)NNTN)	__name__
__module____qualname__ru   kernel_locksstaticmethodr   r   r   r   r   r   rv      s    
9rv   )r    r   r   r   r   r   r   rM   matmul_perf_modelr
   r   r   r   r   r   r   r   r   r   r9   	constexprru   autogradFunctionrv   applymatmulr   r   r   r   <module>   s    >
E