o
    ‚›ix3  ã                   @   s†  d dl Z d dlZd dlmZ d dlZd dlmZmZ ddlmZ ddl	m
Z
 ddlmZ e
 e¡ZedƒG d	d
„ d
ejƒƒZeZedƒG dd„ dejƒƒZedƒG dd„ dejƒƒZedƒG dd„ dejƒƒZedƒG dd„ dejƒƒZedƒG dd„ dejƒƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG d d!„ d!ejƒZG d"d#„ d#ejƒZG d$d%„ d%ejƒZG d&d'„ d'eƒZG d(d)„ d)ejƒZi d*e“d+ed,d-d.œf“d/e“d0e“d1ed2d3if“d4e“d5ed6d3if“d7e“d8e“d9ej “d:e“d;e“d<e“d=ej!“d>e“d?ej"“d@ej#“eej$ej%ej&edAœ¥Z'ee'ƒZ(dBdC„ Z)e)d1ƒZ*e)d0ƒZ+e)d*ƒZ,e)d/ƒZ-e)d<ƒZ.e)dDƒZ/e)d;ƒZ0e)d:ƒZ1dS )Eé    N)ÚOrderedDict)ÚTensorÚnné   )Úuse_kernel_forward_from_hub)Úlogging)Úis_torchdynamo_compilingÚGeluTanhc                       óL   e Zd ZdZddef‡ fdd„Zdedefdd	„Zdedefd
d„Z‡  Z	S )ÚGELUTanha&  
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    FÚuse_gelu_tanh_pythonc                    s2   t ƒ  ¡  |r| j| _d S tjtjjdd| _d S )NÚtanh)Úapproximate)	ÚsuperÚ__init__Ú_gelu_tanh_pythonÚactÚ	functoolsÚpartialr   Ú
functionalÚgelu)Úselfr   ©Ú	__class__© úZ/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/activations.pyr   (   s   
zGELUTanh.__init__ÚinputÚreturnc                 C   s6   |d dt  t dtj ¡|dt  |d¡   ¡  S ©Nç      à?ç      ð?ç       @ç÷Hmâä¦?g      @©Útorchr   ÚmathÚsqrtÚpiÚpow©r   r   r   r   r   r   /   ó   6zGELUTanh._gelu_tanh_pythonc                 C   ó
   |   |¡S ©N©r   r)   r   r   r   Úforward2   ó   
zGELUTanh.forward©F)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úboolr   r   r   r.   Ú__classcell__r   r   r   r   r      s
    r   ÚNewGELUc                   @   ó"   e Zd ZdZdedefdd„ZdS )ÚNewGELUActivationzÎ
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    r   r   c                 C   s6   d| dt  t dtj ¡|dt  |d¡   ¡  S r   r#   r)   r   r   r   r.   A   r*   zNewGELUActivation.forwardN©r1   r2   r3   r4   r   r.   r   r   r   r   r9   :   s    r9   ÚGeLUc                       r
   )ÚGELUActivationa³  
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    FÚuse_gelu_pythonc                    s(   t ƒ  ¡  |r| j| _d S tjj| _d S r,   )r   r   Ú_gelu_pythonr   r   r   r   )r   r=   r   r   r   r   N   s   
zGELUActivation.__init__r   r   c                 C   s    |d dt  |t d¡ ¡  S )Nr   r    r!   )r$   Úerfr%   r&   r)   r   r   r   r>   U   s    zGELUActivation._gelu_pythonc                 C   r+   r,   r-   r)   r   r   r   r.   X   r/   zGELUActivation.forwardr0   )
r1   r2   r3   r4   r5   r   r   r>   r.   r6   r   r   r   r   r<   E   s
    r<   ÚSiLUc                   @   r8   )ÚSiLUActivationaè  
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    r   r   c                 C   s   t j |¡S r,   )r   r   Úsilur)   r   r   r   r.   f   s   zSiLUActivation.forwardNr:   r   r   r   r   rA   \   s    rA   ÚFastGELUc                   @   r8   )ÚFastGELUActivationz}
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 C   s*   d| dt  |d dd| |   ¡  S )Nr   r    g€ÑÓ3Eˆé?r"   )r$   r   r)   r   r   r   r.   p   s   *zFastGELUActivation.forwardNr:   r   r   r   r   rD   j   ó    rD   Ú	QuickGELUc                   @   r8   )ÚQuickGELUActivationzr
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    r   r   c                 C   s   |t  d| ¡ S )Ng¬Zd;û?)r$   Úsigmoidr)   r   r   r   r.   z   s   zQuickGELUActivation.forwardNr:   r   r   r   r   rG   t   rE   rG   c                       s<   e Zd ZdZdedef‡ fdd„Zdedefdd	„Z‡  ZS )
ÚClippedGELUActivationa’  
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    ÚminÚmaxc                    s8   ||krt d|› d|› dƒ‚tƒ  ¡  || _|| _d S )Nzmin should be < max (got min: z, max: ú))Ú
ValueErrorr   r   rJ   rK   )r   rJ   rK   r   r   r   r   ‹   s
   

zClippedGELUActivation.__init__Úxr   c                 C   s   t  t|ƒ| j| j¡S r,   )r$   Úclipr   rJ   rK   )r   rN   r   r   r   r.   “   ó   zClippedGELUActivation.forward)	r1   r2   r3   r4   Úfloatr   r   r.   r6   r   r   r   r   rI   ~   s    rI   c                       s2   e Zd ZdZ‡ fdd„Zdedefdd„Z‡  ZS )ÚAccurateGELUActivationzÙ
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    c                    s    t ƒ  ¡  t dtj ¡| _d S )Né   )r   r   r%   r&   r'   Úprecomputed_constant©r   r   r   r   r   Ÿ   s   
zAccurateGELUActivation.__init__r   r   c                 C   s,   d| dt  | j|dt  |d¡   ¡  S )Nr   r   r"   é   )r$   r   rT   r(   r)   r   r   r   r.   £   s   ,zAccurateGELUActivation.forward)r1   r2   r3   r4   r   r   r.   r6   r   r   r   r   rR   —   s    rR   c                       sD   e Zd ZdZ‡ fdd„Zdedefdd„Zdedefdd	„Z‡  ZS )
ÚMishActivationzÙ
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    c                    s   t ƒ  ¡  tjj| _d S r,   )r   r   r   r   Úmishr   rU   r   r   r   r   ­   s   
zMishActivation.__init__r   r   c                 C   s   |t  tj |¡¡ S r,   )r$   r   r   r   Úsoftplusr)   r   r   r   Ú_mish_python±   rP   zMishActivation._mish_pythonc                 C   r+   r,   r-   r)   r   r   r   r.   ´   r/   zMishActivation.forward)	r1   r2   r3   r4   r   r   rZ   r.   r6   r   r   r   r   rW   §   s
    rW   c                   @   r8   )ÚLinearActivationz[
    Applies the linear activation function, i.e. forwarding input directly to output.
    r   r   c                 C   s   |S r,   r   r)   r   r   r   r.   ½   s   zLinearActivation.forwardNr:   r   r   r   r   r[   ¸   s    r[   c                   @   s   e Zd ZdZddd„ZdS )ÚLaplaceActivationzû
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    ç»¹øÛž æ?ç ^×/ØÒ?c                 C   s*   ||   |t d¡ ¡}ddt |¡  S )Nr!   r   r    )Údivr%   r&   r$   r?   )r   r   ÚmuÚsigmar   r   r   r.   É   s   zLaplaceActivation.forwardN)r]   r^   ©r1   r2   r3   r4   r.   r   r   r   r   r\   Á   s    r\   c                   @   s   e Zd ZdZdd„ ZdS )ÚReLUSquaredActivationz^
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668
    c                 C   s   t j |¡}t |¡}|S r,   )r   r   Úrelur$   Úsquare)r   r   Úrelu_appliedÚsquaredr   r   r   r.   Ó   s   
zReLUSquaredActivation.forwardNrb   r   r   r   r   rc   Î   s    rc   c                       s   e Zd Z‡ fdd„Z‡  ZS )ÚClassInstantierc                    s4   t ƒ  |¡}t|tƒr|n|i f\}}|di |¤ŽS )Nr   )r   Ú__getitem__Ú
isinstanceÚtuple)r   ÚkeyÚcontentÚclsÚkwargsr   r   r   ri   Ú   s   zClassInstantier.__getitem__)r1   r2   r3   ri   r6   r   r   r   r   rh   Ù   s    rh   c                       sf   e Zd ZdZddddejdf‡ fdd„	Zded	efd
d„Zded	efdd„Z	ded	efdd„Z
‡  ZS )ÚXIELUActivationzî
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import xIELU CUDA
    Otherwise, we emit a single warning and use xIELU Python
    gš™™™™™é?r   gíµ ÷Æ°¾Fc              
      sn  t ƒ  ¡  t t t tj||d¡¡ d¡¡| _	t t t tj|| |d¡¡ d¡¡| _
|  dtj||d¡ |  dtj||d¡ || _t|ƒ| _t|ƒ| _d | _zFdd l}tjj ¡ | _d}zddlm}	 |	| jƒ| _|d7 }W n ty“ }
 z|d|
› d	7 }| j| _W Y d }
~
nd }
~
ww t |¡ W d S  ty¶ }
 zt d
t|
ƒ¡ W Y d }
~
d S d }
~
ww )N)Údtyper   ÚbetaÚepszUsing experimental xIELU CUDA.)Úallow_in_graphz& Enabled torch._dynamo for xIELU CUDA.z+ Could not enable torch._dynamo for xIELU (z*) - this may result in slower performance.u¡   CUDA-fused xIELU not available (%s) â€“ falling back to a Python version.
For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`)r   r   r   Ú	Parameterr$   ÚlogÚexpm1ÚtensorÚ	unsqueezeÚalpha_pÚalpha_nÚregister_bufferÚwith_vector_loadsrQ   Ú_beta_scalarÚ_eps_scalarÚ_xielu_cuda_objÚ	xielu.opsÚclassesÚxieluÚXIELUÚtorch.compilerrt   Ú_xielu_cudaÚ_xielu_cuda_fnÚ	ExceptionÚloggerÚwarning_onceÚstr)r   Úalpha_p_initÚalpha_n_initrr   rs   rq   r}   rƒ   Úmsgrt   Úerrr   r   r   r   è   s@   
	("ÿ

€þý€ÿzXIELUActivation.__init__rN   r   c              
   C   sh   t j | j¡}| jt j | j¡ }t |dk|| | | j|  t t 	|| j
¡¡| | | j|  ¡S )Nr   )r   r   rY   rz   rr   r{   r$   Úwhererw   rJ   rs   )r   rN   rz   r{   r   r   r   Ú_xielu_python  s   $ýzXIELUActivation._xielu_pythonc                 C   sœ   |j }| ¡ dk r| d¡}| ¡ dk s	| ¡ dkr$| dd| d¡¡}||j kr1t d||j ¡ | j || j	 
|j¡| j 
|j¡| j| j| j¡}| |¡S )zDFirewall function to prevent torch.compile from seeing .item() callsrV   r   éÿÿÿÿr   z_Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).)ÚshapeÚdimry   ÚviewÚsizer‰   rŠ   r€   r.   rz   Útorq   r{   r~   r   r}   )r   rN   Úoriginal_shapeÚresultr   r   r   r†     s*   
ÿ
ýù
	zXIELUActivation._xielu_cudar   c                 C   s4   | j d ur|jrtƒ s|  |¡S t d¡ |  |¡S )Nz:torch._dynamo is compiling, using Python version of xIELU.)r€   Úis_cudar   r‡   r‰   rŠ   r‘   r)   r   r   r   r.   5  s
   


zXIELUActivation.forward)r1   r2   r3   r4   r$   Úbfloat16r   r   r‘   r†   r.   r6   r   r   r   r   rp   à   s    	ù+	rp   r   Úgelu_10iöÿÿÿé
   )rJ   rK   Ú	gelu_fastÚgelu_newÚgelu_pythonr=   TÚgelu_pytorch_tanhÚgelu_python_tanhr   Úgelu_accurateÚlaplaceÚ
leaky_reluÚlinearrX   Ú
quick_gelurd   Úrelu2Úrelu6rH   )rB   Úswishr   Úprelurƒ   c                 C   s,   | t v rt |  S td| › dtt  ¡ ƒ› ƒ‚)Nz	function z not found in ACT2FN mapping )ÚACT2FNÚKeyErrorÚlistÚkeys)Úactivation_stringr   r   r   Úget_activationY  s   r±   rB   )2r   r%   Úcollectionsr   r$   r   r   Úintegrations.hub_kernelsr   Úutilsr   Úutils.import_utilsr   Ú
get_loggerr1   r‰   ÚModuler   ÚPytorchGELUTanhr9   r<   rA   rD   rG   rI   rR   rW   r[   r\   rc   rh   rp   Ú	LeakyReLUÚReLUÚReLU6ÚSigmoidr@   ÚTanhÚPReLUÚACT2CLSr¬   r±   r    rŸ   r   rž   r§   rB   rX   Ú
linear_actr   r   r   r   Ú<module>   s¢   

			^ÿþýüûúùø	÷
öõôóòñðïê