o
    i&c                     @   sV  d dl mZmZ e rddlZddlmZ ddlmZ d dlmZ d dl	m
Z
mZ eeZg dZed	d
 ZG dd deZG dd deZG dd deZdd Zdd ZejdddejdedejfddZejdddejdedejfddZG dd dejZd d! Zd"d# Zd$d% Z d&d' Z!d(d) Z"d*d+ Z#d/d,e$e% dB fd-d.Z&dS )0   )is_torch_availablelogging    N)nn)contextmanager)ConversionOps)get_module_from_nameshould_convert_module)g        g      ?g      ?g      ?g       @g      @g      @g      @g       g      g      g      g       g      g      g      c                 c   s    t  redd l}t| |jr| j} n
t| tr|| } t| dd }|dkrA|j|  d V  	 W d    d S 1 s<w   Y  |dkret|dre|j	|  d V  	 W d    d S 1 s`w   Y  d V  d S )Nr   typecudaxpu)
r   torch
isinstanceTensordevicestrgetattrr   hasattrr   )devr   dev_type r   a/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/mxfp4.py	on_device1   s&   

  
r   c                   @   sb   e Zd Zdd Z			ddeeejf dejj	dB de
e dB dedB deeejf f
d	d
ZdS )Mxfp4Quantizec                 C   
   || _ d S Nhf_quantizerselfr   r   r   r   __init__H      
zMxfp4Quantize.__init__N
input_dictmodelmissing_keysfull_layer_namereturnc              	   K   s*  t | d \}}t|tr|d n|}t||\}}t|ji t|trt|	ddt
\}	}
t
jjt
jjt
jj}}}t|	|
t
\}	}
d|v rOdnd}||jv rZ|j|= t|||	 t|| d||
|| dd ||  d	|_i W  d    S W d    d S 1 sw   Y  d S )
Nr   gate_up_proj	down_proj_precision_configrhs_dataweight_scaleflex_ctxT)tupleitemsr   listr   r   r   Mxfp4GptOssExpertsquantize_to_mxfp4	transposetriton_kernels_hub
matmul_ogsPrecisionConfigFlexCtx
InFlexDataswizzle_mxfp4_parameterssetattrdiscard_is_hf_initialized)r   r"   r#   r$   r%   kwargs_valuemoduletriton_weight_tensorr/   r9   r:   r;   projr   r   r   convertK   s8   


"zMxfp4Quantize.convertNNN__name__
__module____qualname__r    dictr   r   r   r   Moduler3   rG   r   r   r   r   r   G   s     

r   c                   @   sV   e Zd Zdd Z			d
deeejf dejj	dB dedB deeejf fdd	Z
dS )Mxfp4Dequantizec                 C   r   r   r   r   r   r   r   r    x   r!   zMxfp4Dequantize.__init__Nr"   r#   r%   r&   c           	      K   sr   d|  v rt|d tr|d d }n|d }d|  v r0t|d tr,|d d }n|d }t||}||iS )N_blocksr   _scales)keysr   r3   dequantize_convertops)	r   r"   r#   r%   r$   rA   blocksscalesdequantizedr   r   r   rG   {   s   
zMxfp4Dequantize.convertrH   )rJ   rK   rL   r    rM   r   r   r   r   rN   rG   r   r   r   r   rO   w   s    
rO   c                   @   sb   e Zd Zdd Z			ddeeejf dejj	dB dedB de
e dB deeejf f
d	d
ZdS )Mxfp4Deserializec                 C   r   r   r   r   r   r   r   r       r!   zMxfp4Deserialize.__init__Nr"   r#   r%   r$   r&   c           
      K   s   i }d|  v rt|d tr|d d |d< n|d |d< d|  v r:t|d tr4|d d |d< n|d |d< t||\}}d|v rGdnd}	t|d |d ||	|d jt ||  d|_i S )NrP   r   rQ   r)   r*   T)	rR   r   r3   r   swizzle_mxfp4_convertopsr   r7   r?   r@   )
r   r"   r#   r%   r$   rA   
param_datarD   rB   rF   r   r   r   rG      s,   zMxfp4Deserialize.convertrH   rI   r   r   r   r   rW      s     

rW   c                 C   s.   |j jj}|| tjtjdd\} }| |fS )N   )axis)numerics_detailsmxfpdowncast_to_mxfp_torchtor   bfloat16uint8)wr7   r^   w_scaler   r   r   r5      s   
r5   c           
      C   sn   |j j|j j|j j}}}|jj}|jjj}|jdd\}}	||| |d|fi |	} ||||}| |fS )zE
    Changes the layout of the tensors depending on the hardware
    rZ   )mx_axisdtype)tensorFP4convert_layoutwrap_torch_tensortensor_detailslayoutStridedLayout"make_default_matmul_mxfp4_w_layout)
rb   rc   r7   rh   ri   rj   rl   rm   value_layoutvalue_layout_optsr   r   r   r<      s   

r<   i   rf   rows_per_chunkrf   rr   r&   c                C   s  ddl }| tj} |tjd }| jdd |jks,J d| jdd d|jtjt|| jd}| j^ }}}|	|| }	| 
|	|} |
|	d}tj|	|d	 || jd}
td|	|D ]R}t|| |	}| || }||| }|
|| }|d
@ tj}|| |ddddd	f< ~|d? tj}|| |ddddd	f< ~tj|||d ~~~qa|
j
g |||d	 R  jg ||| d	 R  }
|
dd	 S )w
    Convert the mxfp4 weights again, dequantizing and makes them compatible with the forward
    pass of GPT_OSS.
    r   N   r'   zblocks.shape[:-1]=z does not match scales.shape=)rf   r   rZ   r         )out)mathr_   r   ra   int32shaperg   
FP4_VALUESr   prodreshapeemptyrangeminintldexpviewr6   
contiguous)rT   rU   rf   rr   rx   lutprefix_shapeGB
rows_totalrw   r0r1blkexpsubidx_loidx_hir   r   r   _convert_moe_packed_tensors   s2   44r   c             	   C   sL   z	t | |||dW S  tjy%   | d} |d}t | |||d Y S w )rs   rq   cpu)r   r   OutOfMemoryErrorr_   )rT   rU   rf   rr   r   r   r   convert_moe_packed_tensors  s   

r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )r4   c                    s   t    |j| _|j| _|j| _tjtj	| jd| j | jd dtj
ddd| _tjtj	| jd| j tjddd| _tjtj	| j| j| jd dftj
ddd| _tjtj	| j| jtjddd| _d| _t|dd	| _d | _d | _t|dd	| _d S )
Nr          re   Frequires_gradgZd;?swiglu_limitg      @)superr    num_local_expertsnum_expertsintermediate_sizehidden_sizer   	Parameterr   zerosra   r)   float32gate_up_proj_biasr*   down_proj_biasalphar   limitgate_up_proj_precision_configdown_proj_precision_config)r   config	__class__r   r   r    )  s.   
" zMxfp4GptOssExperts.__init__hidden_statesr&   c                 C   s   t jjt jjt jj}}}t jj}t|j= ||d|d| j| j	fd}	||| j
| jtj||| jd |	d}
||
| j| jtj||| j|jd}W d    |S 1 sWw   Y  |S )Nswiglu)r   r   r   )gather_indxprecision_configgammasfused_activation)scatter_indxr   r   )r7   r8   FnSpecsFusedActivationr   	swiglu_fnr   r   r   r   r)   r   r_   r   r   r   r*   r   r   	gate_scal)r   r   routing_data
gather_idxscatter_idxr   r   r8   r   actintermediate_cache1intermediate_cache3r   r   r   forwardG  s<   

zMxfp4GptOssExperts.forward)rJ   rK   rL   r    r   r   r   __classcell__r   r   r   r   r4   (  s    r4   c                 C   s
  dd l }tjjtjjtjjtjjf\}}}}t| j t	j
 }t|jdd}d}	| jd }
| jd }|| }|| }|d | }|
| }dd }|| |\}}t	j|dd}t	j|dd\}}t	|d|}|d}t	j|||d d	|| }|dt	j}d
}t	||k ||}t	j|ddt	j}t	|t	j}t	||k ||	}t	||k||	}t	||	k|	|}|| }t	|| |	k|	|}|| | d}|| | d}||||}|}W d    n1 sw   Y  ||||||||fS )Nr   
LOCAL_RANK0r'   rZ   c                 S   sF   t j|  dddd d d |f }| }t j| |dd}|| fS )NrZ   T)dimstabler   )r   argsortlongtake_along_dimr   )valsktk_indxtk_valr   r   r   topk  s   "z routing_torch_dist.<locals>.topkr   )binsmaxi  T)r   )src_indxdst_indx)osr7   routing
GatherIndxRoutingDataScatterIndxcompute_expt_data_torchr   r   r   distributedget_world_sizer   environgetrz   softmaxsortgatherr}   histcr   r_   ry   wherer   )logitsn_expts_actr   r   r   r   r   
world_sizerankreplace_valuen_tokensn_expts_totn_local_expertslocal_expert_startlocal_expert_endn_gates_padr   	expt_scal	expt_indxsort_indiceshistvar	topk_indx	gate_indxr   r   r   	expt_datahit_expertsr   r   r   routing_torch_distk  sN   



4r   c           
      C   s   dd l m} | r| rt| drt}ntjj}|jd }|	d| j
j}tj|| j
j| j
j}t|j ||| j
j\}}}W d    n1 sMw   Y  | j||||d}	|		|d| j
j}	|	|fS )Nr   
_is_hookedr'   )r   )torch.distributedr   is_availableis_initializedr   r   r7   r   rz   r}   router
hidden_dimr   
functionallinearweightbiasr   r   top_kexperts)
r   r   distr   
batch_sizerouter_logitsr   r   r   
routed_outr   r   r   mlp_forward  s   
r  c              
   K   s   ddl m} |d}|d}|d}	|d}
|d}|d}d	D ]W}||v r}|d ur;||||||	|
||}| d
}| d}t| |ddd | t| |r}t| |r}tt| |t| |}t| |tj	
|| t| | t| | q&d S )Nr   shard_and_distribute_moduler#   empty_paramcasting_dtypeto_contiguousr   device_mesh)r)   r*   rP   rQ   .rZ   )integrations.tensor_parallelr  r   r>   rsplitr   r   r   r   r   r   r_   delattr)rD   
param_nameparam_valuetarget_devicedq_param_namerA   r  r#   r  r  r  r   r	  rF   blocks_attrscales_attrrV   r   r   r   
dequantize  s<   










r  c                 C   s   t | |}tj|S r   )r   r   r   r   )rT   rU   rV   r   r   r   rS     s   
rS   c              	   K   s  |j j|j j|j j}}}ddlm}	 |d}
|d}|d}|d}|d}|d}d	|v rB|d
d dd }d|v rR|d
d dd }|durb|	|
||||||| nt| |	d
dd t
jj|dd | d}| d}t| |}t| |}|jjdkr=|jjdkr?|d}|dkr||| jd d}n
||d| jd }t|d|dkrtt
drt
j jnd}|| }|| }t| t|dd|dd|\}}W d   n1 sw   Y  |dkrt
|| j| jd g|_nt
|| j| jg|_t| || t| | d|||| dd t| | t| | ~dS dS dS )q
    This transforms the weights obtained using `convert_gpt_oss.py` to load them into `Mxfp4GptOssExperts`.
    r   r  r#   r  r  r  r   r	  rT   r
  r'   rP   r   rU   rQ   NrZ   Fr   metar)   r
   r   acceleratorr   r(   r+   r,   r.   )r8   r9   r:   r;   r  r  r   splitr>   r  r   r   r   r   r   r
   sizer}   r   r   r  current_acceleratorr_   r   r   r<   r6   Sizer   rz   r  )rD   r  r  r  r7   rA   r9   r:   r;   r  r#   r  r  r  r   r	  rF   r  r  rT   rU   local_expertsrE   r/   r   r   r   load_and_swizzle_mxfp4  sf   






$









r  c                 C   sx  |j j|j j|j j}}}| d}	t|d|dkr(ttdr&tj	 j
nd}| | } || }|dkrE| |	|jd d} n
| |	d|jd } t|d|dkrYd}t| t| d	d|d	d|\}
}W d
   n1 sxw   Y  |dkrt|	|j|jd g|
_nt|	|j|jg|
_||jv r|j|= t|||
 t|| d|||| dd d
S )r  r   r
   r   r  r   r)   r   r'   r(   Nr+   r,   r.   )r8   r9   r:   r;   r  r   r   r   r  r  r
   r_   r   r}   r   r   r<   r6   r  r   rz   r=   r>   )rT   rU   rD   rF   r  r7   r9   r:   r;   r  rE   r/   r   r   r   rX   ,  s<   




rX   modules_to_not_convertc              	   C   s   |j r| S ddlm} |dad}|  D ]H\}}t||sq|jjdkrH|j sHt	d | 
|t| j d}W d   n1 sCw   Y  |jjd	kr]|j s]d
dlm} |t||_q|setd | S )aD  
    Public method that replaces the expert layers of the given model with mxfp4 quantized layers.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`Mxfp4Config`, defaults to `None`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*, defaults to `None`):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
    rZ   )
get_kernelz(kernels-community/gpt-oss-triton-kernelsFGptOssExpertsr  TN	GptOssMLPr   )
MethodTypezYou are loading your model using mixed-precision FP4 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r  hub_kernelsr  r7   named_modulesr	   r   rJ   r   r   set_submoduler4   r   typesr"  r  r   loggerwarning)r#   quantization_configr  r  has_been_replacedmodule_namerD   r"  r   r   r   replace_with_mxfp4_linear[  s,   
r,  )NN)'utilsr   r   r   r   
contextlibr   core_model_loadingr   quantizers.quantizers_utilsr   r	   
get_loggerrJ   r'  r{   r   r   rO   rW   r5   r<   r`   rf   r   r   r   r   rN   r4   r   r  r  rS   r  rX   r3   r   r,  r   r   r   r   <module>   sT   

0,
:
CD!C/