o
    i]                     @   sx  d dl mZ d dlmZ d dlmZmZmZmZ e r5ddl	Z	ddl
mZ ddlZddlmZ ddl
mZ eeZdadd Zd	ee dB d
e	jdefddZe	jZe	ejZ e	ej!Z"ej#dej$fddZ%d9de	j&d	ede'e	j&e	j&f fddZ(ej#dej$dej$dej$dej$fddZ)ej#dej$dej$dej$dej$fddZ*e	j+fde	j&de	j&de	j&d e	j&d	ee d
e	jde	j&fd!d"Z,e	j+fde	j&de	j&de	j&d e	j&d	ee d
e	jde	j&fd#d$Z-e	j.de	j+fd%e	j&d&e	j&d'e	j&d(e	j&d	e'eef dB d
e	jde	j&fd)d*Z/G d+d, d,ej0Z1d-d. Z2G d/d0 d0ej3Z4	1d:d2ee5 dB fd3d4Z6G d5d6 d6eZ7G d7d8 d8eZ8dS );   )ConversionOps)should_convert_module)is_kernels_availableis_torch_accelerator_availableis_torch_availablelogging    N)
functionalc               
   C   sh   t du r.zddlm}  | da W n ty- } ztd| d da W Y d}~nd}~ww t r2t S dS )zALazily load the CUTLASS quantization kernel from HuggingFace Hub.N   )
get_kernelzRedHatAI/quantizationz,Failed to load CUTLASS quantization kernel: . Falling back to Triton.F)_quantization_kernelhub_kernelsr   	Exceptionloggerwarning_once)r   e r   k/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/finegrained_fp8.py_get_quantization_kernel"   s   r   
block_sizeoutput_dtypereturnc                 C   s   t  rtj rt sdS |tjtjfvrdS | du rdS t| dks/| d dks/| d dkr1dS tj }|d d |d  }t	 }|du rIdS z|
|W S  tyY   Y dS w )a;  
    Check if CUTLASS blockwise FP8 matmul is supported for the given block size and output dtype.

    CUTLASS blockwise kernels require:
    - SM90+ (Hopper or newer)
    - Block size [128, 128] for weights
    - Block size [1, 128] for activations (handled implicitly)
    - Output dtype bfloat16 or float16
    FNr   r      r
   
   )r   torchcudais_availabler   bfloat16float16lenget_device_capabilityr   $cutlass_scaled_mm_supports_block_fp8r   )r   r   
capabilitycuda_capabilitykernelr   r   r   _supports_cutlass0   s$   $
r&   
BLOCK_SIZEc           	      C   s   t jdd}|| t d| }t | | t j}t t |d }|| }||jj	}t 
|| | t 
|| | d S )Nr   axisg      |@)tl
program_idarangeloadtofloat32maxabsdtype
element_tystore)	x_ptry_ptrs_ptrr'   pidoffsxsyr   r   r   act_quant_kernel]   s   r=   r   r:   c                    s      sJ  jd | dksJ tj tjd} jg   d d  d| R dtji} fdd}t|  |||d ||fS )Nr   r2   r2   c                    s   t   | d fS )Nr'   )tritoncdivnumel)metar:   r   r   grido   s   zact_quant.<locals>.grid)r'   )	is_contiguousshaper   
empty_likefloat8_e4m3fn	new_emptysizer/   r=   )r:   r   r<   r;   rE   r   rD   r   	act_quanti   s   2rL   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc           6      C   s  t jdd}t ||}t ||}|| }|| }|| }t|| |}|||  }|| | } || t d| | }!| | t d| | }"t d|}#| |!dddf |
 |#dddf |   }$||#dddf | |"dddf |   }%||!|  }&|"| }'||'|  }(t j||ft jd})tdt ||D ]h}*t j|$|#dddf ||*|  k dd}+t j|%|#dddf ||*|  k dd},|*| }-|-|	 }.t |&|.|  }/t |(|.|  }0|)t 	|+|,|/dddf  |0dddf  7 })|$|| 7 }$|%|| 7 }%q|j
jt jkr|)t j}1n|j
jt jkr%|)t j}1n|)t j}1|| t d| }2| | t d| }3|||2dddf   ||3dddf   }4|2dddf |k |3dddf |k @ }5t j|4|1|5d dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.
    r   r(   Nr?           maskotherrS   )r*   r+   rA   minr,   zerosr/   ranger-   dotr2   r3   r   r.   r   r4   )6ABCAsBsMNKgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nrM   rN   rO   rP   r8   	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_maskr   r   r   _w8a8_block_fp8_matmulw   sL   %,,((0,(r   c           -      C   s  t jdd}t ||}t ||}|| }|| }|| }t|| |}|||  }|| | }|| t d| | }|| t d| | }t d|}| |dddf |
 |dddf |   } ||dddf | |dddf |   }!t |}"t |}#t j||ft jd}$tdt ||D ]B}%t j| |dddf ||%|  k dd}&t j|!|dddf ||%|  k dd}'|$t 	|&|'|" |# 7 }$| || 7 } |!|| 7 }!q|j
jt jkr|$t j}(n|j
jt jkr|$t j}(n|$t j}(|| t d| })|| t d| }*|||)dddf   ||*dddf   }+|)dddf |k |*dddf |k @ },t j|+|(|,d dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with per-tensor quantization, and
    store the result in output tensor `C`.
    r   r(   Nr?   rQ   rR   rU   )r*   r+   rA   rV   r,   r-   rW   r/   rX   rY   r2   r3   r   r.   r   r4   )-rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rM   rN   rO   rP   r8   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   scale_ascale_br~   r   r   r   r   r   r   r   r   r   r   r   !_w8a8_block_fp8_matmul_per_tensor   sB   !,,

((,(r   rZ   r[   r]   r^   c                    s  |du r	d\}}nt |dksJ |d |d }}||jd kr,||jd kr,d}d}| jd |jd ks8J | dkrb| jdd |jdd krP|  sRJ t| jd ||jd ksbJ |  | jd   |j\}|jdkry| s{J | dkr|jdksJ t||jd ksJ  d	| d	|j t|||jd ksJ | d	| d	|j | jdd f }	| j|	|d
}
d} |k rt }t	|d}|}|| dksJ |} fdd}| dkr(| dkr(t
| | ||
|| |||| d| d|d|d|
d|
d|||dd |
S t| | ||
|| |||| d| d|d|d|
d|
d|d|d|d|d|||dd |
S )a  This function performs matrix multiplication with block-wise
    quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should
        be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    N)r   r   r   r   r
   r>   r   , r?      c                    s"   t  | d t | d  fS )NrM   rN   )r@   rA   )METAr_   r`   r   r   rE   Q  s   "z*w8a8_block_fp8_matmul_triton.<locals>.grid   )rM   rN   rO   rP   )r    rG   rB   rF   r@   rA   ndimrJ   next_power_of_2r0   r   strider   )rZ   r[   r]   r^   r   r   block_nblock_kra   C_shaper\   rM   rO   rN   rE   r   r   r   w8a8_block_fp8_matmul_triton  s   
( 
00

2r   c              
   C   sF  t ||rt }|durzr| j}|  | jd  }| jd }	|jd }
|	d dks/|
d dkr:td|	 d|
 d| ||	 }|  }||d }|   }|  }|   }||||||d}|dd |
f }||W S  t	y } zt
d| d	 W Y d}~nd}~ww t| |||||S )
a  
    Dispatch to CUTLASS or Triton for block-wise FP8 matmul.

    Uses CUTLASS when:
    - Block size is [128, 128] (the only size CUTLASS supports)
    - Running on SM90+ (Hopper or newer)
    - The CUTLASS kernel is available
    - Output dtype is bfloat16 or float16 (CUTLASS requirement)
    - Tensor dimensions are compatible (divisible by 16)

    Otherwise falls back to Triton.
    Nr>   r   r   zCUTLASS requires K (z	) and N (z) divisible by 16zCUTLASS kernel failed: r   )r&   r   rG   rB   
ValueErrorview
contiguoustcutlass_scaled_mmr   r   r   r   )rZ   r[   r]   r^   r   r   r%   original_shaper_   ra   r`   A_2dB_col_majorAs_2dBs_kmr\   r   r   r   r   r   w8a8_block_fp8_matmul  s0   


r   input_qweight_qinput_scaleweight_scalec              
   C   s  | j dkr| jn
d| jd | jd f\}}}|jd }	| d|}
||jd d}|	|d  }||d  }tj|| |	ftj| jd}t|D ]k}||d  }||d  }t|D ]X}||d  }||d  }|
dd||f }|||||f }|dd||d f }|||f }tj||	 tj
dtj| jd||d| }|dd||f  |7  < qZqH||||	}||S )a  
    Performs blocked matrix multiplication with FP8 quantized matrices.

    Args:
        input_q: Quantized input tensor with 1x128 block quantization
        weight_q: Quantized weight tensor with 128x128 block quantization
        input_scale: Scaling factors for input blocks
        weight_scale: Scaling factors for weight blocks
        block_size: Tuple of (M, N) for weight block dimensions
        output_dtype: Desired output dtype
       r
   r   r>   )r2   deviceN)r   r   	out_dtype)r   rG   r   r   rW   r/   r   rX   
_scaled_mmr   tensorr.   )r   r   r   r   r   r   
batch_sizeseq_len
hidden_dimout_featuresinput_reshapedinput_scale_reshapednum_weight_blocks_mnum_weight_blocks_noutputim_startm_endjn_startn_endinput_blockweight_blockcurr_input_scalecurr_weight_scaleblock_resultr   r   r   w8a8_block_fp8_matmul_compile  s>   ,

r   c                       s\   e Zd Zdejddfdedededeeef dB f fdd	Zd
ej	dej	fddZ
  ZS )	FP8LinearFNdynamicin_featuresr   biasr   c           	         s   t  || || _|| _tjtj|||d| _| jd u r,ttj	dtj
d| _n)|| jd  d | jd  }|| jd  d | jd  }ttj||tj
d| _| jdkrfttj	dtj
d| _|rttt| j| _d S | dd  d S )Nr?         ?r   r
   staticr   )super__init__r   activation_schemer   nn	Parameteremptyweightr   r/   weight_scale_invactivation_scaler   r   register_parameter)	selfr   r   r   r2   r   r   scale_out_featuresscale_in_features	__class__r   r   r     s    	

zFP8Linear.__init__inputr   c           	   	   C   sN  | j  dkrt|| j | jS t| j tjjj	r&| j j
 }| jj
 }n
| j  }| j }t r9tj jnd}tt|tj}||jB | jdkrYt|| jd \}}n| jdkrs| jtj}|| jttdtj}ntdt||||| j|j d}W d    n1 sw   Y  |!  | jd ur|| j }|j|j dS )	Nr
   r   r   r   rV   r0   zNot supportedr   r?   )"r   element_sizeFlinearr   
isinstancer   distributedr   DTensor_local_tensorr   r   r   acceleratorcurrent_acceleratortypegetattrr   r   r   rL   r   r   r.   r/   clamp_FP8_MIN_FP8_MAXrI   NotImplementedErrorr   r2   synchronize)	r   r   r   	scale_invdevice_typetorch_accelerator_moduleqinputscaler   r   r   r   forward;  s:   





zFP8Linear.forward)__name__
__module____qualname__r   rI   intbooltupler   Tensorr   __classcell__r   r   r   r   r     s    "r   c                 C   s   | | d | S )Nr
   r   )r   r   r   r   r   	_ceil_divd  s   r   c                       sf   e Zd Zejf fdd	ZdejdejdejdejfddZd	ejd
ejdejdejfddZ  Z	S )	FP8Expertc                    sB  t    ddlm} || _t|dr|jn|j| _|j| _	t|dr&|j
n|j| _d| j | j	}}| j	| j}}ttj| j|||d| _ttj| j|||d| _| j\}	}
t||	}t||
}ttj| j||tjd| _t||	}t||
}ttj| j||tjd| _| dd  | dd  ||j | _d S )Nr   )ACT2FNnum_local_expertsmoe_intermediate_sizer?   gate_up_bias	down_bias)r   r   activationsr   r   hasattrr   num_expertshidden_sizer   r   intermediate_sizeintermediate_dimr   r   r   rW   gate_up_proj	down_projr   r/   gate_up_proj_scale_invdown_proj_scale_invr   
hidden_actact_fn)r   configr   r2   r   Wg_outWg_inWd_outWd_inbobi
gu_scale_o
gu_scale_i
dp_scale_o
dp_scale_ir   r   r   r   i  s2   





zFP8Expert.__init__hidden_statestop_k_indextop_k_weightsr   c                 C   s0  t |}t  % t jjj|| jd}|ddd}t |j	ddd
 }W d    n1 s1w   Y  |D ]]}|d }|t| jkrFq8t || \}}	||	 }
| |
| j| | j| jddd\}}| || }| || j| | j| }||	|d f }|||j }|d|	||j q8|S )N)num_classesr   r
   r   )r>   r   dimr>   )r   
zeros_likeno_gradr   r	   one_hotr  permutegreatersumnonzeror    r  wherer   r
  chunkr  r	  r  r.   r2   
index_add_)r   r  r  r  final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statesrouting_weightsr   r   r   r     s2   


zFP8Expert.forwardr   r   r   c           	   	   C   s   |  dkrt||d S t rtj jnd}tt|tj	}|
|j
 t|| jd \}}t||||| j|jd}W d    n1 sFw   Y  |  |j|jdS )Nr
   r   r   r?   )r   r   r   r   r   r   r   r   r   r   r   rL   r   r   r2   r   r.   )	r   r   r   r   r   r   r   r   r   r   r   r   r     s"   zFP8Expert.linear)
r   r   r   r   rI   r   r   r   r   r   r   r   r   r   r   h  s    .
* r   Fmodules_to_not_convertc           	   
   C   s   |j r| S d}|  D ]a\}}t||sq|ri nddi}d}td@ |dr7td
| j|jd|}nt	|t
jrQtd
|j|j|jdu|j|jd|}|dur]| || d}W d   n1 sgw   Y  q|sttd	 | S )a  
    A helper function to replace all `torch.nn.Linear` modules by `FP8Linear` modules.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
        quantization_config (`FbgemmFp8Config`):
            The quantization config object that contains the quantization parameters.
        pre_quantized (`book`, defaults to `False`):
            Whether the model is pre-quantized or not
    Fr2   NrC   z.experts)r  r   )r   r   r   r   r   TzYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.r   )
dequantizenamed_modulesr   r   r   endswithr   r  weight_block_sizer   r   Linearr   r   r   r   r   set_submoduler   warning)	modelr4  quantization_configpre_quantizedhas_been_replacedmodule_namemodulemodule_kwargs
new_moduler   r   r   replace_with_fp8_linear  sH   

rD  c                   @   s6   e Zd ZdZdd Zdejdeeejf fddZ	dS )	Fp8Quantizez^
    A quantization operation that creates two tensors, weight and scale out of a weight.
    c                 C   
   || _ d S Nhf_quantizerr   rI  r   r   r   r        
zFp8Quantize.__init__
input_dictr   c                 K   s  t | d \}}|d }d }| jjd ur-t| jjtr%| jjd}nt| jjdd }|d u r;|jd |jd f}|\}}|jd |jd }}	|| dksV|	| dkrit	d| d|	 d| d| d| 
|jd d }
|| }|	| }|j}|
tj}|jg |
||||R  }| jd	d
}t|dk|t|}t| }t|dk|t|}|dd}|| }tj|ttd
t}||}d| 
tj}|dr|ddd d }n|d }||||iS )Nr   r8  r   r>   Matrix dimensions (r   $) must be divisible by block sizes (z). for )r>   r  rO  r   r   r   .r
   z.weight_scale_inv
_scale_inv)r   itemsrI  r=  r   dictgetr   rG   r   r.   r   r/   reshaper1   amaxr&  	ones_liker   	unsqueezer   r   
_FP8_DTYPEr7  rsplit)r   rL  kwargstarget_keysvaluer   block_mr   rowscolsleading_shape
rows_tiles
cols_tilesr   
value_fp32reshapedmax_abssafe_max_absscalesscales_broadcastscaled	quantized
inv_scales	scale_keyr   r   r   convert  sH     

zFp8Quantize.convertN)
r   r   r   __doc__r   r   r   rS  strrn  r   r   r   r   rE    s    "rE  c                	   @   sJ   e Zd ZdZdd Z	d
deeejf dedB deeejf fdd	Z	dS )Fp8DequantizeziInverse operation of :class:`Fp8Quantize`. Takes a pair (weight, scale) and reconstructs the fp32 tensor.c                 C   rF  rG  rH  rJ  r   r   r   r   P  rK  zFp8Dequantize.__init__NrL  full_layer_namer   c              
   K   s  t |dk r||d iS |d d }|d d }|jdd  \}}| jjj}|d u r4|jd |jd f}|\}	}
||	 dksD||
 dkrUtd| d| d	|	 d|
 d
	||j}|d||	 |	||
 |
}|d||	 ||
 }|	d	d}|| }|||jiS )Nr   zweight$r   r   r   r>   rM  r   rN  z).)
r    rG   rI  r=  r8  r   r.   r2   rU  rX  )r   rL  rr  r[  rk  rh  r_  r`  r   r^  r   re  expanded_scalesdequantizedr   r   r   rn  S  s(   
zFp8Dequantize.convertrG  )
r   r   r   ro  r   rS  rp  r   r   rn  r   r   r   r   rq  M  s    rq  )r   )NNF)9core_model_loadingr   quantizers.quantizers_utilsr   utilsr   r   r   r   r   torch.nnr   r@   triton.languagelanguager*   r	   r   
get_loggerr   r   r   r   listr   r2   r   r&   rI   rY  finforV   r   r0   r   jit	constexprr=   r   r   rL   r   r   r/   r   r   compiler   r9  r   r   Moduler   rp  rD  rE  rq  r   r   r   r   <module>   s   
 '&TN
{
LALg

5J