from typing import TYPE_CHECKING

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    is_accelerate_available,
    is_kernels_available,
    is_torch_available,
    is_triton_available,
    logging,
)
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

from ..core_model_loading import WeightConverter


logger = logging.get_logger(__name__)


class Mxfp4HfQuantizer(HfQuantizer):
    """
    MXFP4 (microscaling FP4) quantization for GPT-OSS-style models, relying on Triton
    kernels fetched from the Hugging Face kernels hub (not fbgemm).
    """

    requires_calibration = False

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.triton_kernels_hub = None

    def _lazy_import_kernels(self):
        """Lazy import and initialize kernels only when needed."""
        if self.triton_kernels_hub is None:
            try:
                from ..integrations.hub_kernels import get_kernel

                self.triton_kernels_hub = get_kernel("kernels-community/gpt-oss-triton-kernels")
            except ImportError:
                raise ImportError("kernels package is required for MXFP4 quantization")
        return self.triton_kernels_hub
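    # Note on the lazy pattern above (explanatory comment, my reading of the design):
    # `get_kernel` pulls the "kernels-community/gpt-oss-triton-kernels" repo from the
    # Hugging Face Hub and imports Triton on first use, so that cost is deferred until
    # a quantized forward path actually needs it, instead of being paid at quantizer
    # construction or on machines that end up on the dequantize-to-bf16 fallback.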
    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch. "
                "Please install the latest version of torch ( pip install --upgrade torch )"
            )

        if self.quantization_config.dequantize:
            return

        if not torch.cuda.is_available() and not torch.xpu.is_available():
            if self.pre_quantized:
                logger.warning_once(
                    "Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
            else:
                raise RuntimeError("Quantizing a model using MXFP4 requires a GPU")

        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        if torch.xpu.is_available():
            gpu_is_supported = True
            kernels_available = is_triton_available("3.5.0") and is_kernels_available()
        else:
            compute_capability = torch.cuda.get_device_capability()
            gpu_is_supported = compute_capability >= (7, 5)
            kernels_available = is_triton_available("3.4.0") and is_kernels_available()

        if self.pre_quantized:
            if not gpu_is_supported:
                logger.warning_once(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g. T4, A100, L4, H100, or B200) or XPUs (e.g. Intel® Data Center GPU Max Series). "
                    "We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not kernels_available:
                logger.warning_once(
                    "MXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, "
                    "XPU requires Triton >= 3.5.0; we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
        else:
            if not gpu_is_supported:
                raise ValueError(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g. T4, A100, L4, H100, or B200) or XPUs (e.g. Intel® Data Center GPU Max Series)"
                )
            if not kernels_available:
                raise ValueError(
                    "MXFP4 quantization requires Triton and kernels installed: "
                    "CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0"
                )

        if not self.pre_quantized:
            self._lazy_import_kernels()

        device_map = kwargs.get("device_map")
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set "
                "your model on a GPU/XPU device in order to run your model. To remove this warning, "
                "pass device_map = 'cuda' or device_map = 'xpu'."
            )
        elif isinstance(device_map, dict) and not self.pre_quantized:
            if "cpu" in device_map.values() or "disk" in device_map.values():
                raise ValueError(
                    "You are attempting to load an FP4 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. Please use a quantized checkpoint "
                    "or remove the CPU or disk device from the device_map."
                )

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        from ..integrations import Mxfp4GptOssExperts

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, Mxfp4GptOssExperts):
            # Expert biases stay in full precision; only the packed weights are quantized.
            if tensor_name in ("down_proj_bias", "gate_up_proj_bias"):
                return False
            return True
        return False

    def _process_model_after_weight_loading(self, model, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif torch.xpu.is_available():
            torch.xpu.empty_cache()
    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        use_kernels: bool = False,
        **kwargs,
    ):
        from ..integrations import replace_with_mxfp4_linear

        if use_kernels:
            logger.warning_once(
                "You are using full precision kernels, we will dequantize the model to bf16. To use the quantized "
                "model with quantization kernels, please set use_kernels=False"
            )
            self.quantization_config.dequantize = True

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules
        )
        model = replace_with_mxfp4_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
        )

    def update_tp_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__ and getattr(config, "base_model_tp_plan", None) is not None:
            config.base_model_tp_plan.update(
                {
                    "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                }
            )
        return config

    def update_ep_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__ and getattr(config, "base_model_ep_plan", None) is not None:
            config.base_model_ep_plan.update(
                {
                    "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                }
            )
        return config

    def get_state_dict_and_metadata(self, model, safe_serialization=False):
        # Unswizzle the Triton kernel weight layout back into the checkpoint's
        # MXFP4 block/scale format before saving.
        from ..integrations import Mxfp4GptOssExperts

        state_dict = model.state_dict()
        num_local_experts = getattr(model.config, "num_local_experts", 32)
        hidden_size = getattr(model.config, "hidden_size", 2880)
        for name, module in model.named_modules():
            if (
                isinstance(module, Mxfp4GptOssExperts)
                and hasattr(module, "gate_up_proj")
                and hasattr(module, "down_proj")
            ):
                state_dict[name + ".gate_up_proj_blocks"] = (
                    module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(num_local_experts, -1, 90, 16)
                )
                state_dict[name + ".gate_up_proj_scales"] = (
                    module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                        module.gate_up_proj_precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                )
                state_dict[name + ".down_proj_blocks"] = (
                    module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(num_local_experts, hidden_size, -1, 16)
                )
                state_dict[name + ".down_proj_scales"] = (
                    module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                        module.down_proj_precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                )
        metadata = {}
        return state_dict, metadata
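    # Shape arithmetic behind the reshapes above, as a sketch (the constants mirror the
    # GPT-OSS fallbacks wired in via getattr: hidden_size=2880, num_local_experts=32).
    # MXFP4 stores 32 FP4 values per scaled block and packs two FP4 values per byte, so
    # one block occupies 32 / 2 = 16 bytes and a 2880-wide row splits into
    # 2880 / 32 = 90 blocks; hence block tensors unswizzle to (..., 90, 16), with one
    # shared scale per block landing in the matching `_scales` tensor.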
    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        logger.warning_once(
            "MXFP4 quantization doesn't support training, please consider dequantizing the model first by "
            "passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()"
        )
        return False

    def get_quantize_ops(self):
        from ..integrations.mxfp4 import Mxfp4Quantize

        return Mxfp4Quantize(self.quantization_config)

    def get_weight_conversions(self):
        from ..integrations.mxfp4 import Mxfp4Dequantize, Mxfp4Deserialize

        if self.pre_quantized:
            if self.quantization_config.dequantize:
                return [
                    WeightConverter(
                        source_patterns=["_blocks", "_scales"],
                        target_patterns="",
                        operations=[Mxfp4Dequantize(self.quantization_config)],
                    )
                ]
            return [
                WeightConverter(
                    source_patterns=["_blocks", "_scales"],
                    target_patterns="",
                    operations=[Mxfp4Deserialize(self.quantization_config)],
                )
            ]
        return []
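# Usage sketch (illustration only, not part of this module). The quantizer is driven
# through `quantization_config` in `from_pretrained`; the checkpoint id below is one
# example of an MXFP4-quantized GPT-OSS model:
#
#     from transformers import AutoModelForCausalLM, Mxfp4Config
#
#     # Run quantized (needs a supported GPU/XPU plus Triton and the `kernels` package):
#     model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", device_map="cuda")
#
#     # Or opt into the dequantize-to-bf16 fallback that validate_environment() warns about:
#     model = AutoModelForCausalLM.from_pretrained(
#         "openai/gpt-oss-20b",
#         quantization_config=Mxfp4Config(dequantize=True),
#         device_map="cuda",
#     )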