o
    i                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZ ddlm	Z	m
Z
 e
eZe	 r,d dlZer4ddlmZ dd Z				d%d	ed
 ded dedB dedB dedef f
ddZ				d%d	ed
 ded dedB dedB dedef f
ddZ			d&d	d
ded dedB dedB dedef f
ddZ			d&d	d
ded dedB dedB dedef f
ddZ			d&d	d
ded dedB dedB dedef f
ddZeeeeedZG dd deddZG d d! d!Zd'd	ed"edB fd#d$ZdS )(    Nwraps)TYPE_CHECKINGOptional	TypedDict   )is_torch_availablelogging)PreTrainedConfigc                    s2   dddddd t d fdd	}|S )	ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    Nc                 S   s  t |d }|du r| j}| j}d}| jjd }n| j| }t| | d}| d}| jj| d }||krgt| | dsQt| }	|	| j||d |d\}
}| j	| d	|
d
d t
| | d|
 dS ||}| j	| d	|d
d t
| | d| dS )zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r   N  original_max_position_embeddings_original_inv_freq__long_inv_freqseq_len
layer_typeinv_freqF
persistentlong_inv_freqoriginal_inv_freq)torchmax	rope_typer   configrope_parametersgetattrhasattrROPE_INIT_FUNCTIONSregister_buffersetattrto)selfposition_idsdevicer   r   r   r   prefixr   rope_init_fnr   r    r(   b/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_update.   s4   




z6dynamic_rope_update.<locals>.longrope_frequency_updatec                 S   s  t |d }|du r| j}| j}| j}d}n| j| }t| | d| j}t| | d}| d}||kr[t| }	|	| j|||d\}
| _| j	| d|
d	d
 t
| | d| || jk r|| jkr||}| j	| d|d	d
 t
| | d| t
| | d| j dS dS dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   Nr   _max_seq_len_cachedr   r   r   r   Fr   r   )r   r   r   max_seq_len_cachedr   r   r   r   attention_scalingr    r!   original_max_seq_lenr"   )r#   r$   r%   r   r   r   r,   r   r&   r'   r   r(   r(   r)   dynamic_frequency_updateQ   s4   


z5dynamic_rope_update.<locals>.dynamic_frequency_updatec                    s   |d u r| j n| j | }|d urd|ini }d|v r' | |fd|ji| n|dkr7| |fd|ji| | ||fi |S )Nr   dynamicr%   longrope)r   r%   )r#   xr$   r   r   kwargsr/   r*   rope_forwardr(   r)   wrapperw   s   z$dynamic_rope_update.<locals>.wrapperNr   )r5   r6   r(   r4   r)   dynamic_rope_update!   s
   

#&	r8   r   r
   r%   ztorch.devicer   r   returnztorch.Tensorc                 C   s   |    |dur| j| n| j}|d }|d }|dd}t| ddp)| j| j }t|| }	d}
d|tjd|	dtj	d	j
|tjd
|	   }|| }||
fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Nfactor
rope_thetapartial_rotary_factor      ?head_dimr      dtyper%   rA   )standardize_rope_paramsr   getr   hidden_sizenum_attention_headsintr   arangeint64r"   float)r   r%   r   r   rope_parameters_dictr:   baser<   r>   dimattention_factorr   r(   r(   r)   '_compute_linear_scaling_rope_parameters   s   !,rO   c                 C   s  |    |dur| j| n| j}|d }|dd}t| d| j| j }t|| }|d }	d}
|du r8| j}nt|t	j
rNt	|t	j| j|j|jd}nt|| j}||	| | j |	d  ||d	    }d|t	jd
|d	t	jdj|t	jd|   }||
fS )a	  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Nr;   r<   r=   r>   r:   rA   r%   r   r?   r   r@   rB   )rC   r   rD   r   rE   rF   rG   max_position_embeddings
isinstancer   TensormaximumtensorrA   r%   r   rH   rI   r"   rJ   )r   r%   r   r   rK   rL   r<   r>   rM   r:   rN   r   r(   r(   r)   _compute_dynamic_ntk_parameters   s&   +&,rV   c                    s  |    |dur| j| n| j}|d }|dd}t| d| j| j }t|| }|d }	|d}
|d}|d	}|d
 }|	du rJ| j| }	ddd}|
du rh|rd|rdt||	|||	| }
n||	}
|dpnd}|dpud}dd   fdd}dd }|t	
d|dj|t	jd|  }d| }d|	|  }| jdd}|||||||\}}d||||d j|t	jd }|d|  ||  }||
fS )a	  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as available.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used during pretraining.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Nr;   r<   r=   r>   r:   rN   mscalemscale_all_dimr   r   c                 S   s"   | dkrdS d| t |  d S )Nr   r=   g?)mathlog)scalerW   r(   r(   r)   
get_mscaleN  s   z,_compute_yarn_parameters.<locals>.get_mscale	beta_fast    	beta_slowc                 S   s*   |t || d t j   dt |  S )zPInverse dimension formula to find the dimension based on the number of rotationsr?   )rY   rZ   pi)num_rotationsrM   rL   rQ   r(   r(   r)   find_correction_dim`  s   *z5_compute_yarn_parameters.<locals>.find_correction_dimc                    sL    | |||} ||||}|rt |}t |}t|dt||d fS )z.Find dimension range bounds based on rotationsr   r   )rY   floorceilr   min)low_rothigh_rotrM   rL   rQ   truncatelowhighrb   r(   r)   find_correction_ranged  s   

z7_compute_yarn_parameters.<locals>.find_correction_rangec                 S   s>   | |kr|d7 }t j|t jd|  ||   }t |dd}|S )NgMbP?r@   r   r   )r   rH   float32clamp)re   r   rM   linear_func	ramp_funcr(   r(   r)   linear_ramp_factorm  s
   z4_compute_yarn_parameters.<locals>.linear_ramp_factorr   r?   rB   rh   T)r   )rC   r   rD   r   rE   rF   rG   rQ   rJ   r   rH   r"   )r   r%   r   r   rK   rL   r<   r>   rM   r:   rN   rW   rX   r   r\   r]   r_   rl   rq   	pos_freqsinv_freq_extrapolationinv_freq_interpolationrh   ri   rj   inv_freq_extrapolation_factorr   r(   rk   r)   _compute_yarn_parameters   sD   :




	"
 
rv   c                 C   s.  |    |dur| j| n| j}|d }|dd}t| d| j| j }t|| }|d }	|d }
|d}|d	}|d
 }|du rI| j| }|du rc|dkrTd}nt	dt
|t
|  }|rs||krstj|	tj|d}n	tj|
tj|d}tjd|dtj|d | }d|||   }||fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Nr;   r<   r=   r>   long_factorshort_factorr:   rN   r   r   rP   r   r?   )rC   r   rD   r   rE   rF   rG   rQ   rY   sqrtrZ   r   rU   rm   rH   rI   rJ   )r   r%   r   r   rK   rL   r<   r>   rM   rw   rx   r:   rN   r   ext_factorsinv_freq_shaper   r(   r(   r)   _compute_longrope_parameters  s.   2


r|   c                 C   s2  |    |dur| j| n| j}|d }|dd}t| ddp%| j| j }t|| }d}	d|tjd|dtj	dj
|tjd	|   }
|d
 }|d }|d }|d }|| }|| }dtj |
 }t||k|
| |
}|| | ||  }d| | | ||  }||k  ||k  }t|||}||	fS )au
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Nr;   r<   r=   r>   r   r?   r@   rB   r:   low_freq_factorhigh_freq_factorr   r   )rC   r   rD   r   rE   rF   rG   r   rH   rI   r"   rJ   rY   r`   where)r   r%   r   r   rK   rL   r<   r>   rM   rN   r   r:   r}   r~   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqr(   r(   r)   _compute_llama3_parameters  s*   -,r   )linearr0   yarnr1   llama3c                   @   s   e Zd ZU dZeed< edB ed< edB ed< edB ed< edB ed< edB ed< edB ed	< edB ed
< ee dB ed< ee dB ed< edB ed< edB ed< dS )RopeParametersaY
  
    Args:
        rope_theta (`float`):
            The base period of the RoPE embeddings.
        rope_type (`str`, *optional*, defaults to "default"):
            The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
            'llama3'], with 'default' being the original RoPE implementation.
        partial_rotary_factor (`float`, *optional*):
            The percentage of the query and key head embedding on which RoPE will be applied.
        factor (`float`, *optional*):
            Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
            most scaling types, a `factor` of x will enable the model to handle sequences of length x *
            original maximum pre-trained length.
        original_max_position_embeddings (`int`, *optional*):
            Used with 'yarn', 'longrope' and 'llama3'. The original max position embeddings used during
            pretraining.
        attention_factor (`float`, *optional*):
            Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
            computation. If unspecified, it defaults to value recommended by the implementation, using the
            `factor` field to infer the suggested value.
        beta_fast (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
            ramp function. If unspecified, it defaults to 32.
        beta_slow (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
            ramp function. If unspecified, it defaults to 1.
        short_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to short contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        long_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to long contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        low_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
        high_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    r;   Nr   r<   r:   r   rN   r]   r_   rx   rw   r}   r~   )	__name__
__module____qualname____doc__rJ   __annotations__strrG   listr(   r(   r(   r)   r   :  s   
 (r   F)totalc                   @   s  e Zd ZdZdZd!dedB fddZdd Zd!d	d
dedB fddZd!de	dedB fddZ
d!de	dedB fddZd!de	dedB fddZd!de	dedB fddZd!de	dedB fddZd!de	dedB fddZe		d"dededededB dedB f
dd ZdS )#RotaryEmbeddingConfigMixinz[
    A Mixin containing the functionality to standardize and validate RoPE parameters.
    g     @Nignore_keys_at_rope_validationc                 K   s   | dd }|p
| j| _| jd ur| jni | _| dt| d| j}| jd| |dt| dd }|d urL| jd| |d u rEt n|}|dhB }|   | j|d |S )Nrope_scalingr;   r<   ignore_keys)	popr   r   default_theta
setdefaultrD   setrC   validate_rope)r#   r   r3   r   r;   r<   r(   r(   r)   convert_rope_params_to_dictx  s   
z6RotaryEmbeddingConfigMixin.convert_rope_params_to_dictc                 C   sN  t | dd}t | dd}t | ddpi }t | dd}|s%|s%td dS |du s6|i ks6t| |si|d|dd	 |d| |durN||d< |d d
v rht| dr`| j	| j
d< nB| j
d| j n9t|D ]4}|| d|| dd	 || d| |dur||| d< || d d
v r| j
| d| j qm|| _
dS )z
        Helper to standardize the config's rope params field by ensuring the params are defined for each
        later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility)
        r;   Nr<   r   layer_typeszG`standardize_rope_params` was called but no RoPE parameters were found.r   typedefault)r   r   r1   r   )r   loggerwarningr   keysissubsetr   rD   r   r   r   rQ   )r#   r;   r<   r   r   r   r(   r(   r)   rC     s8   
"


z2RotaryEmbeddingConfigMixin.standardize_rope_paramsr#   r
   r   c                 C   s   | j }|du r	dS t| dddurt| | jrnd|i}| D ].}|d|dd}t| d| dd}||d< |durI|||d	 q$t	d
| d q$dS )zY
        Validate the RoPE config arguments, given a `"PreTrainedConfig"` object
        Nr   full_attentionr   r   r   
_validate__rope_parametersr   zMMissing validation function in 'RotaryEmbeddingConfigMixin' for 'rope_type'='')
r   r   r   r   r   r   valuesrD   r   r   )r#   r   rK   r   r   validation_fnr(   r(   r)   r     s$   
z(RotaryEmbeddingConfigMixin.validate_roper   c                 C   s2   ddh}t | }|d }| j||||d d S )Nr   r;   r   )r   r   _check_received_keys)r#   r   r   required_keysreceived_keysr   r(   r(   r)   !_validate_default_rope_parameters  s   z<RotaryEmbeddingConfigMixin._validate_default_rope_parametersc                 C   sh   h d}t | }|d }| j||||d |d }|d u s(t|tr(|dk r2td|  d S d S )N>   r:   r   r;   r   r   r:   r=   ;`rope_parameters`'s factor field must be a float >= 1, got r   r   r   rR   rJ   r   r   r#   r   r   r   r   r   r:   r(   r(   r)    _validate_linear_rope_parameters     z;RotaryEmbeddingConfigMixin._validate_linear_rope_parametersc                 C   sh   ddh}t | }|d }| j||||d |d }|d u s(t|tr(|dk r2td|  d S d S )Nr   r:   r   r=   r   r   r   r(   r(   r)   !_validate_dynamic_rope_parameters  r   z<RotaryEmbeddingConfigMixin._validate_dynamic_rope_parametersc              	   C   sl  h d}h d}t | }|d }| j|||||d |d }|d u s-t|tr-|dk r5td|  |d}|d urOt|trG|d	k rOtd
|  |d}	|	d uret|	tsetd|	  |d}
|
d ur{t|
ts{td|
  |	p~d|
pdk rtd|	 d|
 d | jd }| j	| }||kr|dkrt
d| d| d| d d S d S d S )N>   r:   r   r;   r   >   rW   rh   r]   r_   rX   rN   r   r   r:   r=   r   rN   r   O`rope_parameters`'s attention_factor field must be a float greater than 0, got r]   z9`rope_parameters`'s beta_fast field must be a float, got r_   z9`rope_parameters`'s beta_slow field must be a float, got r^   r   zR`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zKThe explicitly set RoPE scaling factor (config.rope_parameters['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'original_max_position_embeddings' fields in the model config.)r   r   r   rR   rJ   r   r   rD   r   rQ   warning_once)r#   r   r   r   optional_keysr   r   r:   rN   r]   r_   r   implicit_factorr(   r(   r)   _validate_yarn_rope_parameters  sH   




z9RotaryEmbeddingConfigMixin._validate_yarn_rope_parametersc                 C   s  h d}ddh}t | }|d }| j|||||d |dd}t| d| j| j }t|| }	|d	}
t|
t	sMt
d
d |
D rMtd|
  t|
|	d krdtd|	d  dt|
  |d}t|t	st
dd |D rtd|  t||	d krtd|	d  dt|  |d}|d }|d u r|d urtd n|d u r|d u rtd nt|tr|dk rtd|  |d}|d urt|tr|dk rtd|  d S d S d S )N>   r   r;   rw   rx   r   rN   r:   r   r   r<   r=   r>   rx   c                 s       | ]
}t |ttfV  qd S r7   rR   rG   rJ   .0r2   r(   r(   r)   	<genexpr>5      zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>zF`rope_parameters`'s short_factor field must be a list of numbers, got r?   z8`rope_parameters`'s short_factor field must have length z, got rw   c                 s   r   r7   r   r   r(   r(   r)   r   =  r   zE`rope_parameters`'s long_factor field must be a list of numbers, got z7`rope_parameters`'s long_factor field must have length r   av  This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.z4Missing required keys in `rope_parameters`: 'factor'r   g        r   )r   r   r   rD   r   rE   rF   rG   rR   r   allr   r   lenr   rJ   )r#   r   r   r   r   r   r   r<   r>   rM   rx   rw   r:   r   rN   r(   r(   r)   "_validate_longrope_rope_parameters)  sL   



z=RotaryEmbeddingConfigMixin._validate_longrope_rope_parametersc           
      C   s&  h d}|d }t | }| j||||d |d }|d u s(t|tr(|dk r0td|  |d }|d }|d u sAt|tsItd	|  |d u sRt|tsZtd
|  ||kritd| d|  |d }	|	d u svt|	ts~td|	  |	| jkrtd|	 d| j  d S d S )N>   r:   r   r;   r}   r~   r   r   r   r:   r=   r   r}   r~   z?`rope_parameters`'s low_freq_factor field must be a float, got z@`rope_parameters`'s high_freq_factor field must be a float, got zf`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zS`rope_parameters`'s original_max_position_embeddings field must be an integer, got zj`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)	r   r   r   rR   rJ   r   r   rG   rQ   )
r#   r   r   r   r   r   r:   r}   r~   r   r(   r(   r)    _validate_llama3_rope_parameters[  sJ   
z;RotaryEmbeddingConfigMixin._validate_llama3_rope_parametersr   r   r   r   c                 C   s   d|v r|dh8 }| d |pt }d|vr| d |dur$||8 }|| }|r4td|  d| || | }|rItd|  d|  dS dS )z\Compare the received keys in `config.rope_parameters` against the expected and optional keysr   r   r<   Nz<Missing required keys in `rope_parameters` for 'rope_type'='z': z8Unrecognized keys in `rope_parameters` for 'rope_type'=')addr   KeyErrorr   r   )r   r   r   r   r   missing_keysunused_keysr(   r(   r)   r     s   




z/RotaryEmbeddingConfigMixin._check_received_keysr7   )NN)r   r   r   r   r   r   r   rC   r   dictr   r   r   r   r   r   staticmethodr   r   r(   r(   r(   r)   r   q  s4    0

32)r   r   c                 C   s$   t dt |   | j|d dS )zq
    This is a deprecated function.
    It has been kept for backward compatibility with custom code models.
    aX  `rope_config_validation` is deprecated and has been removed. Its functionality has been moved to RotaryEmbeddingConfigMixin.validate_rope method. PreTrainedConfig inherits this class, so please call self.validate_rope() instead. Also, make sure to use the new rope_parameters syntax. You can call self.standardize_rope_params() in the meantime.r   N)warningswarnFutureWarningrC   r   )r   r   r(   r(   r)   rope_config_validation  s   r   )NNNN)NNNr7   ) rY   r   	functoolsr   typingr   r   r   utilsr   r	   
get_loggerr   r   r   configuration_utilsr
   r8   rG   r   tuplerJ   rO   rV   rv   r|   r   r   r   r   r   r   r(   r(   r(   r)   <module>   s   
d

7

H

 


Z

S	7  4