o
    i                     @   sz  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4 e45e6Z7dZ8dZ9dZ:dZ;dZ<e*d7 Z*ee e!e"dZ=e8e;dZ>e3e*G dd de-Z?e?Z@dS )z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)copyfile)Any)is_offline_mode)
AddedToken
processors)Encoding)	Tokenizer)Decoder)BPEUnigram)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainercached_file   )convert_gguf_tokenizer)load_gguf_checkpoint)INIT_TOKENIZER_DOCSTRINGBatchEncodingPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategygenerate_merges)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)r   r   	WordLevel	WordPiece)tokenizer_file
vocab_filec                )       s:  e Zd ZdZeZdZdZedmddZ	 fddZ
edefd	d
ZedefddZdndededB dee fddZdd Zedd Zedd Zejdd Zejdd Zdd ZedefddZdeeef fddZedeeef fdd Zedeeef fd!d"Zedeeef fd#d$ZeZeZ deeef fd%d&Z!defd'd(Z"defd)d*Z#ede$fd+d,Z%ede&fd-d.Z'							/dod0e(d1edB d2edB d3ed4ed5ed6ed7edeeee)f e*e( f fd8d9Z+d:edefd;d<Z,d=ededB fd>d?Z-dmd@e*eeB  defdAdBZ.dmdCedefdDdEZ/dmdFee*e B dGedee*e B fdHdIZ0dpdJedCedB dKede*e fdLdMZ1dNe2dOe3dPedQedRedB dSedB fdTdUZ4dd/e2j5e3j6ddVddddddddddd/dfdJe7e8B e*e7 B e*e8 B dWe7e8B e*e7 B e*e8 B dB dKedNe2dOe3dPedB dQedXedRedB dSedB dYedB d1edB d2edB d3ed4ed5ed6ed7edZedB de9f(d[d\Z:d]e*e defd^d_Z;		dqd`ee*e B dGedaedB defdbdcZ<		drdee=j>B ddeedef dfedB dedB deedef f
dgdhZ?			dsdidjZ@e							dtdkdlZA  ZBS )uTokenizersBackendaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    NFc              
      s  t |}|dd}|dur(tj|r(| tu sd| jvs|r(t||d< |S |durtj|rt|j	}t
|dd}t|}W d   n1 sNw   Y  |di dd}| jdu rot|trnttt|}nC| jjd	kr|rt|d
 ttfrdd |D }n*| jjdkrdd t|D }n| jjdks| jjdkrt|trdd t|D }||d< t| dd}	d|di v r|	r|	jdkr|d d }
dd |
D }
|
|d< |dur||d< |S |d}|d}|d}|d}
t|tr#|dr#tj|r#ddlm} ||d|\|d< |d< |S t|trtj|r|drzhddlm} ||j| jfi |}z!ddlm} || j}|durit|dri|jd0i |}W n  t y } zt!"d | j d!| d" W Y d}~nd}~ww t| d#r| j#d0i |}W |S W |S  t y } z+t!"d$| d%| d& dd'lm$} |||d(d)|\|d< |d< W Y d}~|S d}~ww |du rt|trtj|r||d< |d }|
du rt|trtj|r||d< |d }
|
du rX| jdurX| jjdkrXt|t rXd*t%t& d+tt f fd,d- g d.}t' }|D ]}||v rL|( || g q;t)||d/}
|
|d< |S )1zs
        Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
        models, tekken.json, vocab/merges).
        r#   N__init__tokenizer_objectutf-8encodingmodelvocabr   r   c                 S      g | ]}t |qS  )tuple).0itemr.   r.   l/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/tokenization_utils_tokenizers.py
<listcomp>       z>TokenizersBackend.convert_to_native_format.<locals>.<listcomp>r!   c                 S   s   i | ]\}}||qS r.   r.   r0   itokenr.   r.   r2   
<dictcomp>       z>TokenizersBackend.convert_to_native_format.<locals>.<dictcomp>r   r"   c                 S   s(   i | ]\}}t |tr|d  n||qS r   )
isinstancelistr5   r.   r.   r2   r8      s   ( mergesc                 S   s,   g | ]}t |trt|d nt|qS ) )r;   strr/   split)r0   merger.   r.   r2   r3      s   , post_processorr$   merges_fileztekken.jsonr   )MistralConverter)r$   .model)SentencePieceExtractor)SLOW_TO_FAST_CONVERTERSconvert_from_spmz,Could not reorder vocab using converter for z due to z/. Falling back to raw SentencePiece extraction.convert_from_spm_modelz+Could not extract SentencePiece model from z$ using sentencepiece library due to z%. Falling back to TikToken extractor.)TikTokenConverterextra_special_tokens)r$   rK   valuesreturnc                    sH   g }| D ]}|d u rqt |ttfr| | q|t| q|S N)r;   r<   r/   extendappendr?   )rL   	collectedval_iter_special_tokensr.   r2   rT      s   zHTokenizersBackend.convert_to_native_format.<locals>._iter_special_tokens)		pad_token	unk_token	bos_token	eos_token	sep_token	cls_token
mask_tokenadditional_special_tokensrK   )skip_tokensr.   )*dictpopospathisfiler%   __dict__TokenizerFast	from_filerB   openjsonloadgetr+   r;   r<   mapr/   __name__	enumerategetattrr?   endswithconvert_slow_tokenizerrD   extract_vocab_merges_from_modelrF   extractrG   hasattrrH   	ExceptionloggerwarningrI   rJ   r   r   setupdater   )clstrust_remote_codekwargslocal_kwargsfast_tokenizer_file	processortokenizer_handletokenizer_jsonr,   
model_typer=   r$   rC   rD   rF   rG   converter_classerJ   special_tokens_keysr]   keyr.   rS   r2   convert_to_native_formatd   s   







&&
$$0
z*TokenizersBackend.convert_to_native_formatc                    s  | dd }| dd }| dd }|di }|dd}|d}|d}	|d	}
d }|d ur9t|}n|d urItj|rIt|}n|d urt	|d
d|fi |}t
|}|d d }|d }|d }t||\}}|| t|dkr|| n^| jd u r|	d ur|
d urt|	tr|	ndd t|	D }tt||
dd d}n6t|	trtt|	g dd d}n&t|	tr|	rt|	d ttfrtt|	|ddd}n	| jd u rtd|d u r|d u r| jd u r|dd |dd |d ur|| _| jd u rtd| jj}|d ur@| jjd<i | |d|d  |d|d  |d |d   |d!|d"  n| j  | jj}|d ur| jjd<i | |d#|d#  |d$|d%  |d&|d  |d|d'  |d(|d(  d)|vrd*|d)< d+|v pd,|v }|d+d| _|d,d| _| d-d  }r|| j_|p| jjd u | _ t! j"d<i | |d ur|| _#|| _$| j%| j_&d.d/ | j'D   fd0d1t(|) d2d3 d4D }t| j*+ d5d1 |D  }| j,- D ]}|d u rqt.||vr||vr|/| q| j0D ]}t.||vr-||vr-|/| qt|dkrvg }d6d1 | j,- D }|D ])}t|t.rSt1|dd7}nt|t1rg|j2sgt.||v rgd|_2|/| qD|rv| 3| z| j4 }W n t5y   d}Y nw |d8krt6| jd9d d ur| dd  | j7| j| j8d
d f| j8|d:d;|| _| j p| jjd u | _ | j r| 9  d S d S )=Nr'   	gguf_filer#   added_tokens_decoderadd_prefix_spaceFr$   r,   r=   name_or_path configr   	tokenizertokenizer_configr   c                 S   s   i | ]	\}\}}||qS r.   r.   )r0   r6   w_r.   r.   r2   r8         z.TokenizersBackend.__init__.<locals>.<dictcomp>T)r,   r=   fuse_unkdropoutunk_id)r,   r   a9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.rW   z<s>rX   z</s>z3The backend tokenizer is not correctly initialized.
max_lengthtruncation_side	directionstridetruncation_strategystrategyrU   pad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofbackend
tokenizersadd_bos_tokenadd_eos_tokenrB   c                 S   s   h | ]}t t|qS r.   hashreprr0   r7   r.   r.   r2   	<setcomp>K      z-TokenizersBackend.__init__.<locals>.<setcomp>c                    s$   g | ]\}}t t| vr|qS r.   r   )r0   indexr7   added_tokens_decoder_hashr.   r2   r3   L  s
    z.TokenizersBackend.__init__.<locals>.<listcomp>c                 S      | d S Nr   r.   )xr.   r.   r2   <lambda>N      z,TokenizersBackend.__init__.<locals>.<lambda>r   c                 S   r-   r.   r?   r   r.   r.   r2   r3   Q  r4   c                 S   s   g | ]}|rt |qS r.   r   )r0   tr.   r.   r2   r3   b  r   )speciali pre_tokenizerfix_mistral_regex)init_kwargsr   r.   ):r_   ri   copydeepcopyr`   ra   rb   rd   re   r   r   r   rw   len
_tokenizerr;   r^   rl   r   r<   r/   r   
ValueError
setdefault
truncationenable_truncationno_truncationpaddingenable_padding_add_bos_token_add_eos_tokenrB   _should_update_post_processorsuperr&   r$   r   split_special_tokensencode_special_tokensr   sorteditemsadded_tokens_encoderkeys_special_tokens_maprL   r?   rP   _extra_special_tokensr   r   
add_tokensget_vocab_sizeNotImplementedErrorrm   _patch_mistral_regexr   update_post_processor)selfargsrz   r'   r   r|   r   r   r$   r,   r=   fast_tokenizer	gguf_path
gguf_paramarchitecturetokenizer_dictr   additional_kwargs
vocab_dict_truncation_paddingexplicit_bos_eos_in_kwargsrB   tokens_to_addencoderspecial_token_valuer7   tokensall_named_tokens
vocab_size	__class__r   r2   r&      s   




 
 













	zTokenizersBackend.__init__rM   c                 C      dS )NTr.   r   r.   r.   r2   is_fast  s   zTokenizersBackend.is_fastc                 C   s@   d| j v r| j d drt| dr| jrtj| jS dS dS )z
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        r$   rE   FT)vocab_files_namesrn   rr   r$   r`   ra   rb   r   r.   r.   r2   can_save_slow_tokenizer  s
   z)TokenizersBackend.can_save_slow_tokenizersave_directoryfilename_prefixc                 C   sp   t j|std| d d S t j||r|d ndtd  }t j| jt j|kr5t	| j| |fS )NzVocabulary path (z) should be a directory-r   r$   )
r`   ra   isdirrt   errorjoinVOCAB_FILES_NAMESabspathr$   r   )r   r   r   out_vocab_filer.   r.   r2   save_vocabulary  s   z!TokenizersBackend.save_vocabularyc                 C   s   | j }| j}|du r| jrtd| j}| j}|du r#| jr#d| _dS | jr*|d nd d| jr6d| d nd }| | jrEd| d	 nd d
| jrQd| d	 nd }g }| jra|||f | jrk|||f tj	|||d| j
_dS )ze
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        Nz)add_bos_token = True but bos_token = NoneFz:0 r   z$A:0r>   z:0z:1z $B:1)singlepairspecial_tokens)rW   bos_token_idr   r   rX   eos_token_idr   rP   r   TemplateProcessingr   rB   )r   bosr   eosr   r   r   r   r.   r.   r2   r     s&   .6z'TokenizersBackend.update_post_processorc                 C      t | ddS )Nr   Frm   r   r.   r.   r2   r        zTokenizersBackend.add_eos_tokenc                 C   r   )Nr   Fr   r   r.   r.   r2   r     r  zTokenizersBackend.add_bos_tokenc                 C      t | d| |   d S )Nr   object__setattr__r   r   valuer.   r.   r2   r        c                 C   r  )Nr   r  r  r.   r.   r2   r     r  c                 C   s   g }| j  D ]!}|du rqt|tr|| qt|tr(|t|ddd q| jD ]}t|tr9|| q,t|trH|t|ddd q,|rR| j|dd t| dds^| j	j
du rd|   dS dS )a[  
        Post-initialization hook that runs after the tokenizer is fully set up.
        This is called by from_pretrained() after loading the tokenizer, which allows
        us to add any special tokens that may have been passed as AddedToken objects.

        Child classes should call super()._post_init() if they override this method.
        NTF)r   
normalized)r   r   )r   rL   r;   r   rP   r?   r   r   rm   r   rB   r   )r   r   token_valuer7   r.   r.   r2   
_post_init  s(   




zTokenizersBackend._post_initc                 C      | j jddS )zP
        `int`: Size of the base vocabulary (without the added tokens).
        Fwith_added_tokensr   r   r   r.   r.   r2   r     s   zTokenizersBackend.vocab_sizec                 C   r  )NTr  )r   	get_vocabr   r.   r.   r2   r    s   zTokenizersBackend.get_vocabc                 C   s   |   S rN   )r  r   r.   r.   r2   r,     s   zTokenizersBackend.vocabc                 C       dd t | j dd dD S )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                 S      i | ]\}}|j |qS r.   contentr0   vkr.   r.   r2   r8     r   z:TokenizersBackend.added_tokens_encoder.<locals>.<dictcomp>c                 S   r   r   r.   r1   r.   r.   r2   r     r   z8TokenizersBackend.added_tokens_encoder.<locals>.<lambda>r   r   r   r   r   r.   r.   r2   r     s    z&TokenizersBackend.added_tokens_encoderc                 C   s
   | j  S )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        )r   get_added_tokens_decoderr   r.   r.   r2   r   
  s   
z&TokenizersBackend.added_tokens_decoderc                 C   r  )z
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        c                 S   r  r.   r  r  r.   r.   r2   r8      r   z5TokenizersBackend.get_added_vocab.<locals>.<dictcomp>c                 S   r   r   r.   r  r.   r.   r2   r      r   z3TokenizersBackend.get_added_vocab.<locals>.<lambda>r   r  r   r.   r.   r2   get_added_vocab  s    z!TokenizersBackend.get_added_vocabc                 C   r   )zN
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        Tr.   r   r.   r.   r2   __bool__"  s   zTokenizersBackend.__bool__c                 C   r  )zD
        Size of the full vocabulary with the added tokens.
        Tr  r  r   r.   r.   r2   __len__(  s   zTokenizersBackend.__len__c                 C   s   | j S )zc
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        )r   r   r.   r.   r2   backend_tokenizer.  s   z#TokenizersBackend.backend_tokenizerc                 C   s   | j jS )zU
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        )r   decoderr   r.   r.   r2   r  5  s   zTokenizersBackend.decoderTr*   return_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosec	                 C   s   |du r	d| j v }|du rd| j v }|r |jdur |g|j }	n|g}	tt}
|	D ]>}|
d |j |r=|
d |j |rG|
d |j |rQ|
d |j |r[|
d |j	 |rg|
d t
|j q)|
|	fS )a  
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        Ntoken_type_idsattention_mask	input_idsspecial_tokens_maskoffset_mappingr   )model_input_namesoverflowingr   r<   rP   idstype_idsr(  r*  offsetsr   )r   r*   r   r!  r"  r#  r$  r%  r&  	encodingsencoding_dictr   r.   r.   r2   _convert_encoding<  s,   

z#TokenizersBackend._convert_encodingr7   c                 C   s   | j |}|d u r| jS |S rN   )r   token_to_idunk_token_id)r   r7   r   r.   r.   r2   #_convert_token_to_id_with_added_vock  s   z5TokenizersBackend._convert_token_to_id_with_added_vocr   c                 C   s   | j t|S rN   )r   id_to_tokenint)r   r   r.   r.   r2   _convert_id_to_tokenq  s   z&TokenizersBackend._convert_id_to_token
new_tokensc                 C   s   |r| j |S | j |S rN   )r   add_special_tokensr   )r   r:  r   r.   r.   r2   _add_tokenst  s   zTokenizersBackend._add_tokensr   c                 C   s   | j |S )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        )r   num_special_tokens_to_add)r   r   r.   r.   r2   r=  z  s   z+TokenizersBackend.num_special_tokens_to_addr.  skip_special_tokensc                 C   s`   t |tr| j|S g }|rt| jnt }|D ]}t|}||v r$q|| j| q|S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        )r;   r8  r   r7  rv   all_special_idsrP   )r   r.  r>  r   ids_to_skipr   r.   r.   r2   convert_ids_to_tokens  s   
z'TokenizersBackend.convert_ids_to_tokenstextr;  c                 K   s   | j d|||d| S )N)rB  	text_pairr;  r.   )_encode_plusr   )r   rB  r   r;  rz   r.   r.   r2   tokenize  s   zTokenizersBackend.tokenizepadding_strategyr   r   r   r   r   c                    s   | j j | j j}|tjkr dur| j   n&|||j| jd} du r'd}	n	 fdd|D }	|	|kr=| j jdi | |t	j
krO|durM| j   dS dS |t	jkrV|nd}
|
|dur_|n| j| j| j| j|d}||krz| j jdi | dS dS )a  
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        N)r   r   r   r   c                    s   i | ]	}|  |d qS rN   ri   )r0   r  r   r.   r2   r8     r   z@TokenizersBackend.set_truncation_and_padding.<locals>.<dictcomp>)r   r   pad_idrU   r   r   r.   )r   r   r   r   DO_NOT_TRUNCATEr   r  r   r   r   
DO_NOT_PAD
no_padding
MAX_LENGTHr   pad_token_idrU   r   r   )r   rF  r   r   r   r   r   r   targetcurrentr   r.   rH  r2   set_truncation_and_padding  s>   !


z,TokenizersBackend.set_truncation_and_paddingr   rC  is_split_into_wordsreturn_tensorsr   c           #         sH  dd }||st d|d ur||st d|r-t|ttfo+|o+t|d ttf}nt|ttf}|rht|tr?td|d urZt|t|krZt dt| dt| d|d urett||n|}n
|ro||fgn|g}t|ttfstd	t| d
j	|||||	|
d |d u rj
}jj|kr|j_jj|||d}fdd|D }i }|d d D ]  fdd|D }|| < qdd |D }rg }t|D ]\}\}} ||gt|d  7 }q||d< |d D ]	}!|!| qt|||d}"|s"|d u r"s"tdd |" D |"j}"|"S )Nc                 S   s   t | trdS t | ttfr_t| dkrdS t | d trdS t | d ttfr]t| d dks9t | d d tr;dS t | d d ttfr[t| d d dkpZt | d d d tS dS dS dS )NTr   F)r;   r?   r<   r/   r   )r   r.   r.   r2   _is_valid_text_input  s   
"*z<TokenizersBackend._encode_plus.<locals>._is_valid_text_inputztext input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs).r   zdwhen tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.zbatch length of `text`: z- does not match batch length of `text_pair`: .z:batch_text_or_text_pairs has to be a list or a tuple (got ))rF  r   r   r   r   r   )r;  is_pretokenizedc                    s&   g | ]}j | d qS ))r*   r   r!  r"  r#  r$  r%  r&  )r3  )r0   r*   )r!  r%  r$  r"  r#  r   r   r&  r.   r2   r3   d  s    z2TokenizersBackend._encode_plus.<locals>.<listcomp>c                    s"   g | ]\}}|  D ]}|q
qS r.   r.   )r0   r1   r   r   r   r.   r2   r3   u  s   " c                 S   s   g | ]\}}|D ]}|qqS r.   r.   )r0   r   r1   r   r.   r.   r2   r3   w  s    r)  overflow_to_sample_mapping)tensor_typec                 S   s8   i | ]\}}|t |d krt|d  tr|d  n|qS r:   )r   r;   r<   )r0   r   r  r.   r.   r2   r8     s    &z2TokenizersBackend._encode_plus.<locals>.<dictcomp>)r   r;   r<   r/   r?   	TypeErrorr   ziptyperQ  r   r   r   encode_batchrl   &_eventual_warn_about_too_long_sequencer   r   r1  )#r   rB  rC  r;  rF  r   r   r   rR  r   r   rS  r   r!  r"  r#  r$  r%  r&  r   rz   rT  
is_batchedbatch_text_or_text_pairsr1  tokens_and_encodingssanitized_tokensstacksanitized_encodingsrX  r6   toksr   r)  batched_outputr.   )	r   r!  r%  r$  r"  r#  r   r   r&  r2   rD    s   &


zTokenizersBackend._encode_plusr   c                 C   s$   | j jd ur| j j|S d|S )Nr>   )r  r  decoder   )r   r   r.   r.   r2   convert_tokens_to_string  s
   z*TokenizersBackend.convert_tokens_to_string	token_idsclean_up_tokenization_spacesc                 K   s   | dd  t|tr|g}t|tr|d }| jj||d}|d ur%|n| j}|ret| dr;t| j	r;| 	|}|S |
dd
dd
d	d

dd
dd
dd
dd
dd
dd
dd}|S )Nuse_source_tokenizerr)  )r>  clean_up_tokenizationz .rU  z ??z !!z ,,z ' 'z n'tzn'tz 'mz'mz 'sz'sz 'vez'vez 'rez're)r_   r;   r8  r^   r   rg  rj  rr   callablerl  replace)r   ri  r>  rj  rz   rB  r.   r.   r2   _decode  s4   



zTokenizersBackend._decode
file_names.legacy_formatc                 C   s@   t |}tj||r|d ndt }| j| ||f }|S )Nr   r   )r?   r`   ra   r   TOKENIZER_FILEr  save)r   r   rt  ru  r   r#   r.   r.   r2   _save_pretrained  s   
z"TokenizersBackend._save_pretrainedc              
      s  t | j }|d}|d}	d}
|d d dkr)i |d d< g |d d< nW|d d d	kre|d d
 durd|d d
 }|d d | d }
 durU|
 v rU |
 }
d|d d
< |
dgg|d d< n|d d dv rti |d d< ntd|d d  d durd|d v r|d d  v r |d d  |d d< tt |g }|D ]5}|dd}|dd}|d d d	kr|sq dur|d  v rՈ |d  |d< |	t
d,i | q|dur|| |d d dkrd|vr|d d dur|d d |d< |d d dkr'd|vr'|d d dur'|d d |d< |d d d	kr9|
dur9|
|d< |d durn|d d dksg|d d dkrnd|d v rntdd |d d D rntj |d< t|d d  }|d,||d|}j|||d |	dur!t  }d|	v r|	d D ]D}|	d | d  } dur fd!d"|D }||	d | d < |D ]}|}|du rtd#qfd$d"|D |	d | d%< qd&D ]0}||	v r|	| \}} dur| v r | }|}|du rtd#||g|	|< q|	|d< tt || j }tjD ]A}t| |durit| |} durF| v rF | }| j|d}t|t
ret
||j|j|j|jd'd(||< q)|||< q)| jrt| j ng }|dur|| t |dkr||d)< |d*< z	| j!d,i |W S  t"y } z!d+t#|v r|d*d | j!d,i |}|_|W  Y d}~S  d}~ww )-uf  
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        added_tokensrB   Nr+   r\  r   r,   r=   r   r   r   g        )r!   r"   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.rV   r   idr  continuing_subword_prefixend_of_word_suffixr   	ByteLevelSequencepretokenizersc                 s   s    | ]	}|d  dkV  qdS )r\  r}  Nr.   )r0   pretokenizerr.   r.   r2   	<genexpr>;  s
    

z<TokenizersBackend.train_new_from_iterator.<locals>.<genexpr>initial_alphabet)r   r   )r   trainerr   r   c                    s   g | ]}  ||qS r.   rG  r   )special_tokens_mapr.   r2   r3   M  r   z=TokenizersBackend.train_new_from_iterator.<locals>.<listcomp>zQAttempted to set a token in the post processor that does not exist in the mappingc                    s   g | ]}  |qS r.   )r4  r   )r   r.   r2   r3   V  r9   r.  )rx   sepT)single_wordlstriprstripr	  r   rK   r'   z7multiple values for keyword argument 'tokenizer_object'r.   )$rg   loadsr   to_strr_   r   rd   from_strdumpsrP   r   rO   anypre_tokenizers_fastr}  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorr4  r   r   r   SPECIAL_TOKENS_ATTRIBUTESrm   r   ri   r;   r  r  r  r	  rK   r   r   rZ  r?   )r   text_iteratorr   r   new_special_tokensr  rz   r   ry  rB   rV   r   r   added_tokenr   r   trainer_classr  trained_tokenizer_jsonr   r   r7   token_idspecial_tokenspecial_token_fullrK   r   new_tokenizerr.   )r  r   r2   train_new_from_iterator  s   "










"





	

z)TokenizersBackend.train_new_from_iteratorc
              
      s0  ddl ddlm  ddlm} ddlm} dtdtf fdd	}t	 r'd
}|dur|s6|s||r||d|||dd|d}d}|durt
|dd}t|}W d   n1 s^w   Y  |d}|d}|r|||dkr|r|dur|dvr|S n|r|||dkr|S d
}|s|s||r|rd|v rt|d|d  |	du rt|ddst|dd td| d |S |	d
u st|ddrt|dd
 ddl}|jj|ddd}|jj}t||jjr||jjd< |S t||jjr|jjddd}|j||g|j_|S )af  
        Patches mistral related tokenizers with incorrect regex if detected
            1) Local file with an associated config saved next to it
                >> Model type one of the mistral models (on older versions)
            2) Remote models on the hub from official mistral models
                >> Tags including `base_model:.*mistralai`
        r   N)
model_info)versionr   model_idrM   c                    s.    | }|j d urdd|j rdS dS )Nzbase_model:.*mistralair   TF)tagssearchr   )r  r+   r  rer.   r2   is_base_mistral  s
   
z?TokenizersBackend._patch_mistral_regex.<locals>.is_base_mistralTzconfig.jsonF)	cache_dirr7   local_files_only%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors_commit_hashr(   r)   transformers_versionr   z4.57.2)mistralmistral3voxtral	ministralpixtralz4.57.3r   z$The tokenizer you are loading from 'a  ' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.z[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+isolated)patternbehavior)r   	use_regex)r  huggingface_hubr  	packagingr  transformers.utils.hubr   r?   boolr   rf   rg   rh   ri   parsesetattrrm   rt   ru   r   pre_tokenizersSplitRegexr  r   r;   r~  	Metaspacer}  )rx   r   pretrained_model_name_or_pathr7   r  r  r  is_localr   r   rz   r  r   r  _config_filemistral_config_detectedf_configr  transformers_model_typer   split_pretokenizercurrent_pretokenizerr.   r  r2   r     s   


	
$z&TokenizersBackend._patch_mistral_regex)FrN   )NNFFFFT)NF)FN)NN)NNN)NNFNFNN)Crk   
__module____qualname____doc__r   r   r+   r   classmethodr   r&   propertyr  r   r   r?   r/   r   r   r   r   setterr  r8  r   r^   r  r,   r   r   r   _added_tokens_encoder_added_tokens_decoderr  r  r  rd   r  DecoderFastr  EncodingFastr   r<   r3  r6  r9  r<  r=  rA  rE  r   r   rQ  rK  rJ  r   r   r   rD  rh  rs  r`   PathLikerx  r  r   __classcell__r.   r.   r   r2   r%   R   sv      



 			

/($
N	

 


.



 Fr%   )Ar  r   rg   r`   collectionsr   collections.abcr   shutilr   typingr   tokenizers.pre_tokenizersr  r  r  r   r   r   r   r	   r  r
   rd   tokenizers.decodersr   r  tokenizers.modelsr   r   tokenizers.trainersr   r   r   r   r  r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utils_baser   r   r   r   r   r   r   utilsr   r   r    
get_loggerrk   rt   rv  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILEADDED_TOKENS_FILEr  r   r%   PreTrainedTokenizerFastr.   r.   r.   r2   <module>   s\   $	


         P