o
    i7                     @   s  d dl mZ ddlmZ ddlmZ ddlmZmZ ddl	m
Z
mZmZmZmZ e r5d dlZd dlmZ e
 r>d d	lmZ e Ze rJesJd dlZeeZG d
d deZG dd dejjZG dd dejZedddd Z	ddee  dB fddZ!dS )    )	lru_cache   )ACT2FN)ConversionOps)get_module_from_nameshould_convert_module)is_accelerate_availableis_fbgemm_gpu_availableis_torch_availableis_torch_xpu_availableloggingN)nn)init_empty_weightsc                	   @   sT   e Zd Zdd Z	d	deeejeej B f dej	j
dB deeejf fddZdS )
FbgemmFp8Quantizec                 C   s
   || _ d S N)hf_quantizer)selfr    r   f/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/fbgemm_fp8.py__init__-   s   
zFbgemmFp8Quantize.__init__N
input_dictmodelreturnc                 K   sH  t | d \}}|d }ddlm} t||\}}t||r|dkrR|dd}	|	j}
|	d|
d }t	|\}}||
}|dd}||
d d|
d }nE|dkr|dd}	|	j}
|	d|
d }t	|\}}||
}|dd}||
d |
d d}nt	|\}}t
j||jd d}|t
j|| d|iS )	Nr   r   )FbgemmFp8Llama4TextExpertsgate_up_proj   	down_proj_scale)tupleitemsintegrationsr   r   
isinstance	transposeshapereshapequantize_fp8_per_rowtorchr   	Parameterview)r   r   r   kwargs
target_keyvaluer   moduletensor_nametransposed_paramoriginal_shapeflattened_paramnew_value_flatweight_scale_flat	new_valueweight_scaler   r   r   convert0   s2   


zFbgemmFp8Quantize.convertr   )__name__
__module____qualname__r   dictstrr'   Tensorlistr   Moduler6   r   r   r   r   r   ,   s    
r   c                       *   e Zd Zejf fdd	Zdd Z  ZS )FbgemmFp8Linearc                    s   t  ||| || _|| _tjtj||f|d| _tjtj|dftj	d| _
| jdtjdgtjddd |rLtjtj| jtj	d| _d S d | _d S )Ndtyper   input_scale_ubF
persistent)superr   in_featuresout_featuresr'   r   r(   zerosweightfloat32r5   register_bufferfloatbias)r   rG   rH   rN   rB   	__class__r   r   r   f   s    
zFbgemmFp8Linear.__init__c                 C   s   g |j d d dR }t|d|j d  | jd\}}| jtj}t	r=tj
|| j |d| |j| jd}ntjjj|| j||dd}| jd urT|| j n|}||j}||}~~|S )Nr   )scale_ub)scale_ascale_b	out_dtyperN   Tuse_fast_accum)r$   r&   r)   
contiguousrC   r5   tor'   rK   _is_torch_xpu_available
_scaled_mmrJ   t	unsqueezerB   rN   opsfbgemmf8f8bf16_rowwisedevicer%   )r   xoutput_shapex_quantizedx_scaleweight_scale_float32outputr   r   r   forwardt   s(   $	
zFbgemmFp8Linear.forward)r7   r8   r9   r'   float8_e4m3fnr   rg   __classcell__r   r   rO   r   r@   e   s    r@   c                       r?   )r   c                    s   t    |j| _|j| _|j| _| j| _t|j | _	t
jt
j| j| jd| j ft
jd| _t
jt
j| jd| jd ft
jd| _t
jt
j| j| j| jft
jd| _t
jt
j| j| jdft
jd| _| jdt
jdgt
jddd d S )Nr   rA   r   rC   FrD   )rF   r   num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimr   
hidden_actact_fnr'   r   r(   rI   rh   r   rK   gate_up_proj_scaler   down_proj_scalerL   rM   )r   configrB   rO   r   r   r      s&   
"z#FbgemmFp8Llama4TextExperts.__init__c              	   C   s  | | jd| j}d}t|}t| jD ]-}|| }|d| j}t||| j\}}| j	j
d d }	| jtj}
trtj|| j	| ddd|	   |d|
| d d|	  dd  |jd}tj|| j	| dd|	d   |d|
| d |	d  dd  |jd}nNtjjj|| j	| ddd|	  ||
| d d|	  dd dd}tjjj|| j	| dd|	d  ||
| d |	d  dd dd}|| | }t||| j\}}| jtj}tr"tj|| j| dd |d||  dd  |jd}ntjjj|| j| dd |||  dd dd}|||< q||j}| d| jS )	z
        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
        Returns:
            torch.Tensor: (batch_size * token_num, hidden_size)
        r   Nr   r   r   )rR   rS   rT   TrU   )r)   rk   rm   r'   
empty_likeranger%   r&   rC   r   r$   rq   rX   rK   rY   rZ   r#   rW   r[   r\   rB   r]   r^   r_   rp   rr   r   r`   )r   hidden_states
num_tokensnext_statesiexpert_hiddenexpert_hidden_reshapedexpert_quantizedexpert_scalesharded_expert_dimgate_up_proj_scale_float32gateup	activatedactivated_quantizedactivated_scaledown_proj_scale_float32expert_outputr   r   r   rg      sz   
 " "
z"FbgemmFp8Llama4TextExperts.forward)r7   r8   r9   r'   rK   r   rg   ri   r   r   rO   r   r      s    r   r   )maxsizec                  C   s$   t rddlm}  | djS tjjjS )Nr   
get_kernelzkernels-community/fp8-fbgemm)rY   hub_kernelsr   r&   r'   r]   r^   r   r   r   r   get_quantize_fp8_per_row   s   

r   Fmodules_to_not_convertc              	   C   s   t  ad}|r	i nddi}|  D ]^\}}t||sqd}	tdd8 |jjdkr9t| jd| j}
t	|
p6| j}	nt
|tjrSt|j|j|jdufi |}	|	d W d   n1 s]w   Y  |	du rgq| ||	 d}q|swtd | S )	a  
    A helper function to replace all `torch.nn.Linear` modules by `FbgemmFp8Linear` modules.
    This will enable running your models using high performance fp8 kernel from FBGEMM library.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
        quantization_config (`FbgemmFp8Config`):
            The quantization config object that contains the quantization parameters.
        pre_quantized (`book`, defaults to `False`):
            Whether the model is pre-quantized or not
    FrB   NT)include_buffersLlama4TextExpertstext_configzYou are loading your model using FP8 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r   r&   named_modulesr   r   rP   r7   getattrrs   r   r"   r   Linearr@   rG   rH   rN   requires_grad_set_submoduleloggerwarning)r   r   quantization_configpre_quantizedtp_planhas_been_replacedmodule_kwargsmodule_namer-   
new_moduler   r   r   r   replace_with_fbgemm_fp8_linear	  s>   

r   )NNFN)"	functoolsr   activationsr   core_model_loadingr   quantizers.quantizers_utilsr   r   utilsr   r	   r
   r   r   r'   r   
accelerater   rY   fbgemm_gpu.experimental.gen_ai
fbgemm_gpu
get_loggerr7   r   r   r   r@   r>   r   r   r=   r;   r   r   r   r   r   <module>   s.   	

9/l
	
