o
    ieB                     @   s  d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	 e	 r%d dl
Z
eeZ		d'd	e
jd
e
jde
jdB dede
jf
ddZde
jjde
jde
jde
jde
jf
ddZ	d(d	e
jd
e
jde
jdB de
jfddZ			d)d	e
jd
e
jde
jdB de
jdB dede
jfddZde
jjde
jde
jde
jde
jf
ddZG dd deZe Zde
jde
jfd d!Z	d(ddd"d#ee
jj dB ded$edee
jj fd%d&ZdS )*    )Callable)wraps   )logging)GeneralInterface)is_torch_availableNFinputweightbiasis_transposedreturnc                 C   sJ   |rt | d|d}nt || dd}|dur#|| }|S )a  Batched linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (batch_size, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
            else of shape (batch_size, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (batch_size, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
       N)torchbmm	unsqueezesqueeze)r   r	   r
   r   out r   _/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/moe.py_batched_linearD   s   r   selfhidden_statestop_k_indextop_k_weightsc                 C   sF  |j }|d}|d}|d}tj||ddd|d}|d}	|d}
|
| jk }|
d| jd }|| }| j	| }| j
| }| jrQ| j| nd }| jr[| j| nd }t|||| jd}| |}t|||| jd}|	j|jkr|	d|}	||	d }||d|j }||||jdd}||jS )Nr   r   devicer   r   dim)r   sizer   aranger   expandreshapenum_expertsclampgate_up_proj	down_projhas_biasgate_up_proj_biasdown_proj_biasr   r   _apply_gateshapegathertodtypeviewsum)r   r   r   r   r   	num_top_k
num_tokens
hidden_dim	token_idxsample_weights
expert_ids
valid_maskexpert_ids_clampedselected_hidden_statesselected_gate_upselected_downselected_gate_up_biasselected_down_biasgate_up_out	gated_outout_per_samplefinal_hidden_statesr   r   r   batched_mm_experts_forwardf   s6   


"







rC   offsc                 C   sT   t tjjdrtjjj| |j||dS t tdr&tj| |j||dS td)a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.
    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, output_dim, input_dim).
        offs (`torch.Tensor`, *optional*):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    Raises:
        ImportError: If neither `torch.nn.functional.grouped_mm` nor `torch._grouped_mm` is available, indicating that the PyTorch version is incompatible.
    
grouped_mmrD   _grouped_mmzNeither torch.nn.functional.grouped_mm nor torch._grouped_mm is available. Please make sure you are using a PyTorch version that includes grouped_mm (2.9+).)	hasattrr   nn
functionalrE   r.   r/   rG   ImportError)r   r	   rD   r   r   r   rG      s   
rG   c                 C   s>   |r
t | ||d}nt | |dd|d}|dur|| }|S )a*  Grouped linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, output_dim, input_dim) if transposed is `False`,
            else of shape (num_experts, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (num_experts, output_dim). Default is `None`.
        offs (`torch.Tensor`, *optional*):
            Offsets tensor indicating the boundaries of each group in the input tensor.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    rF   r   N)rG   	transpose)r   r	   r
   rD   r   r   r   r   r   _grouped_linear   s   rN   c                 C   sr  |j }|d}|d}|d}tj||ddd|d}|d}	|d}
|| }t|
}t|}|
| }|	| }|| }| j}| j	}| j
rU| j| nd }| j
r_| j| nd }|jdkrj| n| }tj|| jd| jd d}tj|dtjd}t||||| jd}| |}t||||| jd}||d }|| }||||jdd	}||jS )
Nr   r   r   r   cpu)binsminmax)r   r/   r   r   )r   r    r   r!   r   r"   r#   argsortr&   r'   r(   r)   r*   typefloatinthistcr$   cumsumint32rN   r   r+   r0   r1   r.   r/   )r   r   r   r   r   r2   r3   r4   r5   r6   r7   r:   perminv_permexpert_ids_gsample_weights_gselected_hidden_states_gr;   r<   r=   r>   histc_inputnum_tokens_per_expertoffsetsr?   r@   out_per_sample_grA   rB   r   r   r   grouped_mm_experts_forward   s>   


"




rc   c                       s8   e Zd ZdZeedZdededef fddZ	  Z
S )ExpertsInterfacez9Interface for registering custom experts implementations.)
batched_mmrE   experts_implementationdefaultr   c                    sB   |du r
t d n|dkr|| vrtd| dt ||S )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.Na
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r   rf   rg   	__class__r   r   get_interface4  s   
zExpertsInterface.get_interface)__name__
__module____qualname____doc__rC   rc   _global_mappingstrr   rq   __classcell__r   r   ro   r   rd   ,  s    "rd   r?   c                 C   s    |j ddd\}}| || S )a  
    Default gating mechanism: splits the gate_up_out into gate and up parts,
    applies the activation function to the gate part, and multiplies it with the up part.
    Args:
        gate_up_out (`torch.Tensor`):
            The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
    Returns:
        `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
    r   r   r   )chunkact_fn)r   r?   gateupr   r   r   _default_apply_gateF  s   
r}   )r   r(   experts_classr(   c                   s<   dt tjj dt tjj f fdd}| dur|| S |S )aV  Decorator to modify experts class to support different experts implementations.

    Args:
        experts_class (`type[torch.nn.Module]`, *optional*):
            The experts class to modify. If not provided, returns a decorator that can be applied to the class.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the expert weights are stored in transposed format.
        has_bias (`bool`, *optional*, defaults to `False`):
            Whether the expert layers include bias terms.

    Returns:
        `type[torch.nn.Module]`: The modified experts class.
    r~   r   c                    sX   | j | j tfdd}t  fdd}t| ds$t| _|| _ || _| S )Nc                    s.   | |g|R i | || _  | _| _d S N)configr(   r   )r   r   argskwargs)r(   r   original_initr   r   __init__i  s   
z=use_experts_implementation.<locals>.wrapper.<locals>.__init__c                    s&   t | jj }|| g|R i |S r   )ALL_EXPERTS_FUNCTIONSrq   r   _experts_implementation)r   r   r   experts_forward)original_forwardr   r   forwardp  s   z<use_experts_implementation.<locals>.wrapper.<locals>.forwardr+   )r   r   r   rH   r}   r+   )r~   r   r   r(   r   )r   r   r   wrappere  s   
z+use_experts_implementation.<locals>.wrapperN)rT   r   rI   Module)r~   r   r(   r   r   r   r   use_experts_implementationT  s   (r   )NFr   )NNF)collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr   r   
get_loggerrr   rj   Tensorboolr   rI   r   rC   rG   rN   rc   rd   r   r}   rT   r   r   r   r   r   <module>   s   
-
"
<
$
&
F
