o
    i*                     @   s"  d dl Z d dlZd dlmZ d dlmZ d dlZejjj	ejjj
ejjjejjjejjjejjjejjjejjjejjjejjjejjjejjjejjjejjjdZ	dGdejded	ed
ejdB dejf
ddZ		dGdejdeded
ejdB dejf
ddZ
dejdedejfddZdejdejfddZdejdejfddZdejdejfddZdHdejdedejfddZdIdejded
ejdB dejfd d!ZdIdejded
ejdB dejfd"d#Z	 	$	%	dJdejded&ed'ed
ejdB dejfd(d)Z	 	$	%	dJdejded&ed'ed
ejdB dejfd*d+Z			,	-	dKdejdededed	ed
ejdB dejfd.d/Z		dLdejded
ejdB dejfd0d1Z	dMdejd3eded
ejdB dejf
d4d5Zdejd6ejdejfd7d8ZdNd:d;Zd<d= Zd>d? Z d@Z!edAdB Z"edCdD Z#edEdF Z$dS )O    N)defaultdict)contextmanager)uniform_normal_	constant_ones_zeros_eye_dirac_xavier_uniform_xavier_normal_kaiming_uniform_kaiming_normal_trunc_normal_orthogonal_sparse_              ?tensorab	generatorreturnc                 C   $   t | ddstd | |||dS | S )N_is_hf_initializedFr   )r   r   r   getattrTORCH_INIT_FUNCTIONS)r   r   r   r    r   ]/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/initialization.pyr   *      r   meanstdc                 C   r   )Nr   Fr   )r!   r"   r   r   )r   r!   r"   r   r   r   r   r   2   r    r   valc                 C       t | ddstd | |dS | S )Nr   Fr   )r#   r   )r   r#   r   r   r   r   :      r   c                 C      t | ddstd | S | S )Nr   Fr   r   r   r   r   r   r   @      r   c                 C   r&   )Nr   Fr   r   r'   r   r   r   r   F   r(   r   c                 C   r&   )Nr   Fr	   r   r'   r   r   r   r	   L   r(   r	      groupsc                 C   r$   )Nr   Fr
   )r*   r   )r   r*   r   r   r   r
   R   r%   r
   gainc                 C   "   t | ddstd | ||dS | S )Nr   Fr   r+   r   r   r   r+   r   r   r   r   r   X      r   c                 C   r,   )Nr   Fr   r-   r   r.   r   r   r   r   ^   r/   r   fan_in
leaky_relumodenonlinearityc                 C   &   t | ddstd | ||||dS | S )Nr   Fr   r   r2   r3   r   r   r   r   r2   r3   r   r   r   r   r   d   
   
r   c                 C   r4   )Nr   Fr   r5   r   r6   r   r   r   r   r   r7   r                 @c                 C   s(   t | ddstd | |||||dS | S )Nr   Fr   )r!   r"   r   r   r   r   )r   r!   r"   r   r   r   r   r   r   r      s   r   c                 C   r,   )Nr   Fr   r-   r   r.   r   r   r   r      s   r   {Gz?sparsityc                 C   r   )Nr   Fr   )r;   r"   r   r   )r   r;   r"   r   r   r   r   r      r    r   otherc                 C   sB   t | ddst  | |W  d    S 1 sw   Y  | S )Nr   F)r   torchno_gradcopy_)r   r<   r   r   r   r?      s
   
 r?   normalc                 C   s   t jj| \}}|dkr|}n|dkr|}n
|dkr!|| d }d| }|dkr6t| t|d d d S |d	krEt| t|d d S |d
krYtd| }t| | | d S t	d| )Nr0   fan_outfan_avg   r   truncated_normalg۶%?)r"   r@   uniform   zinvalid distribution )
r=   nninit_calculate_fan_in_and_fan_outr   mathsqrtr   r   
ValueError)r   r2   distributionr0   rA   denomvarianceboundr   r   r   _variance_scaling   s    rQ   c                 C      t | ddst| ddd | S )Nr   Fr0   rD   r2   rM   r   rQ   r'   r   r   r   lecun_normal_      rU   c                 C   rR   )Nr   Fr0   r@   rS   rT   r'   r   r   r   default_flax_embed_init_   rV   rW   )
ztorch.nn.initztorch.nn.modules.activationztorch.nn.modules.transformerztorch.nn.modules.linearztorch.nn.modules.lossztorch.nn.modules.batchnormztorch.nn.modules.convztorch.nn.modules.normalizationztorch.nn.modules.rnnztorch.nn.modules.sparsec                  c   s    t t} zKtD ]*}|tjv r2tj| }t D ]}t||r1t||| | |< t	||t
 |  qqdV  W |  D ]\}}| D ]
\}}t	||| qCq;dS |  D ]\}}| D ]
\}}t	||| q]qUw )a  
    Guard the `torch.nn.init` primitive functions to behave exactly like the functions in this file, i.e. be
    protected against the `_is_hf_initialized` flag to avoid re-init if the param was already loaded.

    Usually, all models are using the init from `transformers` which are already guarded, but just to make extra sure
    and for remote code, we also use this context manager.
    N)r   dictTORCH_MODULES_TO_PATCHsysmodulesr   keyshasattrr   setattrglobalsitems)	originalsmodule_namemodule	func_name	functionsfuncr   r   r   guard_torch_init_functions   s*   	


rg   c            	      c   s    ddl m}  dd }tt}zQtD ]'}|tjv r9tj| }t D ]}t	||r8t
|||| |< t||| q"q| j}|| _dV  W | D ]\}}| D ]
\}}t||| qPqH|| _dS | D ]\}}| D ]
\}}t||| qmqe|| _w )ac  
    Disable weight initialization both at the torch-level, and at the transformers-level (`init_weights`).
    This is used to speed-up initializing an empty model with deepspeed, as we do not initialize the model on meta device
    with deepspeed, but we still don't need to run expensive weight initializations as we are loading params afterwards.
    r)   PreTrainedModelc                  _      d S Nr   argskwargsr   r   r   
empty_func      z#no_init_weights.<locals>.empty_funcN)modeling_utilsri   r   rX   rY   rZ   r[   r   r\   r]   r   r^   init_weightsr`   )	ri   ro   ra   rb   rc   rd   original_init_weightsre   rf   r   r   r   no_init_weights   s4   



rt   c                  c   s>    ddl m}  dd }z| j}|| _dV  W || _dS || _w )a  
    Disable weight tying during loading with `from_pretrained`. This is needed as we want to have access to ALL
    weights in the state_dict during `from_pretrained`, and otherwise tying them would remove them from it, as it's
    called in `post_init` when instantiating.
    r)   rh   c                  _   rj   rk   r   rl   r   r   r   ro   $  rp   z"no_tie_weights.<locals>.empty_funcN)rq   ri   tie_weights)ri   ro   original_tie_weightsr   r   r   no_tie_weights  s   rw   )r   r   N)r)   )r   N)r   r0   r1   N)r   r   r8   r9   N)r)   N)r:   N)r0   r@   )%rJ   rZ   collectionsr   
contextlibr   r=   rG   rH   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   Tensorfloat	Generatorintstrr?   rQ   rU   rW   rY   rg   rt   rw   r   r   r   r   <module>   s  
	
&&







$