o
    ij                     @   s`   d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ G dd dZdefdd	Zd
S )    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)bytes_to_unicode)PreTrainedTokenizerFastc                   @   sF   e Zd ZdZ				dddZdefdd	Zd
d ZdefddZ	dS )MistralConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                 K   s   || _ || _|| _|| _d S )N)vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   kwargs r   c/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/mistral.py__init__   s   
zMistralConverter.__init__r   c           
         s  | t  fddg }i }t  D ]\\}\}}|| jvrm|||< t|dkr-qg }tdt|D ]%}|d | ||d  }}	| v r[|	 v r[||	  v r[|||	|f q6t| fdddd}|| q|||< qt|dd dd}fd	d
|D }||fS )Nc                    s   d  fdd| dD S )N c                    s   g | ]} t | qS r   )ord).0charbyte_encoderr   r   
<listcomp>       zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)joindecode)br   r   r   token_bytes_to_string   s   zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string   c                    s    | d   | d  fS )Nr   r!   r   x)	bpe_ranksr   r   <lambda>-   r   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>F)keyreversec                 S      | d S )N   r   )valr   r   r   r%   1       c                    s$   g | ]} |d   |d fqS )r   r!   r   )r   r*   )r    r   r   r   2   s   $ zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)	r   	enumerateitemsr   lenrangeappendsortedextend)
r   r   mergesidxtokenranklocalindexpiece_lpiece_rr   )r$   r   r    r   extract_vocab_merges_from_model   s,   

z0MistralConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdrd|j_|S )NF)fuse_unkignore_mergesT)r;   r   r   r   hasattrmodelr=   )r   vocab_scoresr3   	tokenizerr   r   r   rA   5   s
   zMistralConverter.tokenizerreturnc                 C   s^   |   }ttjt| jdddtj| jddg|_t	 |_
|| j tjdd|_|S )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)rA   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   rA   r   r   r   	converted<   s   
zMistralConverter.converted)Nr   FN)
__name__
__module____qualname____doc__r   strr;   rA   r   rO   r   r   r   r   r
      s    
r
   tokenizer_filec                 C   s   ddl m} ddlm} || }|jjj}t|jjj	dd d}dd |D }d	d
 t
|D }|| |}|jjjj}tt|||d d}	|	d|i |jj|jj|jj|jjd}
|
 D ]\}}||v rs|	||i qd|	S )z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )SpecialTokens)MistralTokenizerc                 S   r(   )Nr6   r   r"   r   r   r   r%   X   r+   z*convert_tekken_tokenizer.<locals>.<lambda>)r&   c                 S   s   g | ]}|d  qS )	token_strr   )r   r5   r   r   r   r   Y   s    z,convert_tekken_tokenizer.<locals>.<listcomp>c                 S   s   i | ]\}}||qS r   r   )r   r4   r5   r   r   r   
<dictcomp>[   s    z,convert_tekken_tokenizer.<locals>.<dictcomp>)r   r   r   )tokenizer_objectr   )	bos_token	eos_token	pad_token	unk_token)%mistral_common.tokens.tokenizers.baserV   (mistral_common.tokens.tokenizers.mistralrW   	from_fileinstruct_tokenizerrA   _tekken_token2id_nospecialr1   _all_special_tokensr,   update_model_pat_strr	   r
   rO   rM   bosvalueeospadunkr-   )rU   rV   rW   mistral_tokenizerr   sorted_tokensall_specialspecials_tokensr   rA   
MAP_SPECALspecial_keyspecial_tokenr   r   r   convert_tekken_tokenizerL   s6   


rt   N)
tokenizersr   r   r   r   r   tokenizers.modelsr   #transformers.convert_slow_tokenizerr   *transformers.tokenization_utils_tokenizersr	   r
   rT   rt   r   r   r   r   <module>   s    D