o
    il&                    @   s,  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZ d	d
lmZmZmZmZ d	dlmZ eeZg dZeg d Z dddZ!de"de#fddZ$ddee# dB fddZ%G dd dZ&G dd de&Z'de#de"fddZ(G dd  d Z)G d!d" d"e)Z*G d#d$ d$e)Z+G d%d& d&e)Z,G d'd( d(e)Z-G d)d* d*e)Z.G d+d, d,e)Z/G d-d. d.e)Z0G d/d0 d0e)Z1G d1d2 d2e)Z2G d3d4 d4e)Z3G d5d6 d6e)Z4G d7d8 d8e)Z5G d9d: d:e5Z6G d;d< d<e5Z7G d=d> d>e5Z8G d?d@ d@e5Z9G dAdB dBe5Z:G dCdD dDe5Z;G dEdF dFe5Z<G dGdH dHe5Z=G dIdJ dJe5Z>G dKdL dLe5Z?G dMdN dNe5Z@G dOdP dPe5ZAG dQdR dRe5ZBG dSdT dTe5ZCG dUdV dVe5ZDG dWdX dXe5ZEG dYdZ dZe)ZFG d[d\ d\e5ZGG d]d^ d^e)ZHG d_d` d`e)ZIG dadb dbe)ZJG dcdd dde5ZKG dedf dfe5ZLG dgdh dhe5ZMG didj dje)ZNG dkdl dle5ZOG dmdn dne5ZPG dodp dpe5ZQdqdr ZRG dsdt dtZSG dudv dvZTi dwe6dxe2dye7dze*d{eGd|eJd}e8d~eHde/de*de4de9de*de*de*de*de*i de6de,de/de0de*de*de2de>de2de2de*deNde:de;de-de*de2i de<de.deCde1de@deAde2de3de=de*deDdeEdeFde>de?de+deKeMeMeLeMdZUddefddZVdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)
Collection)	lru_cache)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece)tqdm   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CN)af_ZAaz_AZbn_INfa_IRhe_ILhr_HRid_IDka_GEkm_KHmk_MKml_INmn_MNmr_INpl_PLps_AFpt_XXsv_SEsw_KEta_INte_INth_THtl_XXuk_UAur_PKxh_ZAgl_ESsl_SI c                 C   sj   t  rddlm} |S t r.dd l}t|jjtdk r&ddl	m} |S ddl	m
} |S tt| )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecerK   r   google.protobufr   parseprotobuf__version__transformers.utilsrL   ImportErrorr   format)error_messagerK   google rW   e/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf_   s   rY   add_prefix_spacereturnc                 C   s$   | rd}t |ddsd}|S d}|S )NalwayslegacyTfirstnever)getattr)rZ   original_tokenizerprepend_schemerW   rW   rX   _get_prepend_schemep   s   rc   skip_tokensc                    s   |d urt |nt  }|d u}|rt|n }g }| D ]J\}}||v r&qg }tdt|D ](}|d | ||d  }	}
|	|v sF|
|v rGq/|	 v rW|
 v rW||	|
|f q/t| fddd}|| qt|dd |d}dd |D }|S )	Nr   c                        | d   | d  fS Nr   r   rW   xvocabrW   rX   <lambda>       z!generate_merges.<locals>.<lambda>keyc                 S   s   | d t | d t | d fS )N   r   r   )lenvalrW   rW   rX   rk      s    rn   reversec                 S   s   g | ]
}|d  |d fqS r   r   rW   .0rr   rW   rW   rX   
<listcomp>       z#generate_merges.<locals>.<listcomp>)setdictitemsrangerp   appendsortedextend)rj   vocab_scoresrd   rt   mergesmergepiece_scorelocalindexpiece_lpiece_rrW   ri   rX   generate_mergesz   s(   r   c                   @   s@   e Zd ZdZdefddZdeeeef e	e f fddZ
dS )	SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 C   sd   t | d t | d t }| }t|d}||  W d    n1 s(w   Y  || _d S )NrM   rP   rb)r   rY   
ModelProtoopenParseFromStringreadproto)selfr   	model_pb2mfrW   rW   rX   __init__   s   


zSentencePieceExtractor.__init__r[   c           	      K   s   | j jj |du rddlm}m} | j jjdkr|n|}dd | j jD }|jdkr6| j jj|d< ||d	< ndd
l	m
} dd t|D }||}||d	< ||d< dd t| j jD }dd t|dd dD |d< |S )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        Nr   )r   r   r   c                 S      g | ]}|j |jfqS rW   piecescorerw   r   rW   rW   rX   rx      rl   z2SentencePieceExtractor.extract.<locals>.<listcomp>r   unk_idrj   )r   c                 S      i | ]	\}\}}||qS rW   rW   rw   iwordr   rW   rW   rX   
<dictcomp>       z2SentencePieceExtractor.extract.<locals>.<dictcomp>r   c                 S   s,   g | ]\}}|j d v r||j|j dkfqS )      r   )typer   rw   idprW   rW   rX   rx      s   , c                 S       g | ]\}}}t |d |dqS F
normalizedspecialr   rw   r   tokenr   rW   rW   rX   rx          c                 S      | d S Nr   rW   rg   rW   rW   rX   rk          z0SentencePieceExtractor.extract.<locals>.<lambda>rm   additional_special_tokens)r   trainer_specr   tokenizers.modelsr   r   
model_typepieces__name__tokenization_utils_baser   	enumerater   )	r   r   kwargsr   r   rj   r   r   spm_added_tokensrW   rW   rX   extract   s$   



zSentencePieceExtractor.extractN)r   
__module____qualname____doc__strr   tupler{   intlistr   rW   rW   rW   rX   r      s    &r   c                   @   s0   e Zd Zddeeeef ee f fddZdS )GemmaSentencePieceExtractorNr[   c                    sH   | j   fddt  D }d|vr|d|d< t||}||fS )r   c                    s   i | ]}  ||qS rW   )id_to_piece)rw   r   sprW   rX   r      rl   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>	<0x09>)r   r}   GetPieceSizegetr   )r   r   rj   r   rW   r   rX   r      s   
z#GemmaSentencePieceExtractor.extractN)	r   r   r   r   r{   r   r   r   r   rW   rW   rW   rX   r      s    (r   r   c                 C   s&   t | dk p| d dkp| d   S )Nro   ,)rp   isdigit)r   rW   rW   rX   check_number_comma   s   &r   c                   @   s"   e Zd Zdd ZdefddZdS )	Converterc                 C   s
   || _ d S r   )ra   )r   ra   rW   rW   rX   r      s   
zConverter.__init__r[   c                 C   s   t  r   )NotImplementedErrorr   rW   rW   rX   	converted   s   zConverter.convertedN)r   r   r   r   r   r   rW   rW   rW   rX   r      s    r   c                   @      e Zd ZdefddZdS )BertConverterr[   c           
      C      | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixra   rj   r   r   r   r   hasattrr   tokenize_chinese_charsr   do_lower_caser	   BertNormalizer
normalizerr
   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr   decoder
r   rj   	tokenizerr   r   r   clssepr   r   rW   rW   rX   r      :   



zBertConverter.convertedNr   r   r   r   r   rW   rW   rW   rX   r          r   c                   @   r   )SplinterConverterr[   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkrx| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )ra   rj   r   r   r   r   r   r   r   r   r   r	   r   r   r
   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r   r   )r   rj   r  r   r   r   r  r  questiondotr   r   r  dot_token_idr   rW   rW   rX   r     sL   



$"
zSplinterConverter.convertedNr  rW   rW   rW   rX   r  
  r  r  c                   @   r   )FunnelConverterr[   c           
      C   r   )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   rW   rW   rX   r   =  r  zFunnelConverter.convertedNr  rW   rW   rW   rX   r  <  r  r  c                   @   r   )MPNetConverterr[   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   rW   rW   rX   r   d  s:   



zMPNetConverter.convertedNr  rW   rW   rW   rX   r  c  r  r  c                   @   r   )OpenAIGPTConverterr[   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d ur/|
t|g tjdd|_t |_tjdd|_|S )N</w>F)rj   r   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)ra   encoderr   	bpe_rankskeysr   r   r   r   token_to_idadd_special_tokensr	   r   r   r
   r   r   r   
BPEDecoderr   r   rj   r   r   r  rW   rW   rX   r     s&   
zOpenAIGPTConverter.convertedNr  rW   rW   rW   rX   r    r  r  c                   @   @   e Zd Zddeeef dB deeeef  dB defddZ	dS )GPT2ConverterNrj   r   r[   c              	   C   s   |s| j j}|st| j j}tt||d dddd}t| j dd}tj|d|_	t
 |_t| j ddrP| j j}| j j}tj| d| d||fgd	|_|S tjdd
|_|S )NrJ   Frj   r   r  continuing_subword_prefixr  r  rZ   rZ   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)ra   r  r   r  r   r   r`   r
   	ByteLevelr   r   r   	bos_tokenbos_token_idr   r   r   )r   rj   r   r  rZ   bosr,  rW   rW   rX   r     s:   
zGPT2Converter.convertedNN
r   r   r   r{   r   r   r   r   r   r   rW   rW   rW   rX   r#        8r#  c                   @   r   )HerbertConverterr[   c                 C   s   d}d}| j j}t| j j }||d d v r|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r  r   r   )r  r   r  F)r   r   r  )r  r  )ra   r  r   r  r  r   r   r   r	   r   r   r
   r   r   r   r   r   r   BertProcessingr   r   r   r   r   )r   tokenizer_info_strtoken_suffixrj   r   r  rW   rW   rX   r     s.   

zHerbertConverter.convertedNr  rW   rW   rW   rX   r1    r  r1  c                   @   r"  )Qwen2ConverterNrj   r   r[   c                 C   s   |s| j j}|st| j j }tt||d d ddddd}t |_	t
t
jtddddt
jt| j ddddg|_t |_tjdd	|_|S )
NrJ   F)rj   r   r  r   r%  r  r  byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertrZ   rZ   	use_regexr(  )ra   r  r   r  r  r   r   r	   NFCr   r
   SequenceSplitr   r*  r`   r   r   r   r   r   )r   rj   r   r  rW   rW   rX   r     sD   

zQwen2Converter.convertedr.  r/  rW   rW   rW   rX   r5    r0  r5  c                   @   r   )RobertaConverterr[   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )NrJ   Fr$  r&  Tr  r  rZ   r)  )ra   r  r   r  r  r   r   r
   r*  rZ   r   r   r   r   RobertaProcessingr   r   r   r   r   r   otrj   r   r  rW   rW   rX   r     s,   


zRobertaConverter.convertedNr  rW   rW   rW   rX   r@    r  r@  c                   @   r   )RoFormerConverterr[   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdr*| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsrF  ra   rj   r   r   r   r   r   r   r   r   r	   r   r   r
   PreTokenizercustomr   r   r   r   r   r   r   r   r   r   )
r   rF  rj   r  r   r   r  r  r   r   rW   rW   rX   r   4  s8   

zRoFormerConverter.convertedNr  rW   rW   rW   rX   rE  3  r  rE  c                   @   r   )DebertaConverterr[   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
NrJ   Fr$  r&  [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )ra   r  r   r  r  r   r   r
   r*  rZ   r   r   r   r   r   r  r   rC  rW   rW   rX   r   [  s.   
	zDebertaConverter.convertedNr  rW   rW   rW   rX   rJ  Z  r  rJ  c                       s|   e Zd ZdZeZi ZedddZ fddZ	dd Z
d	d
 Zdd Zdd Zdd Zdd Zdd ZdefddZ  ZS )SpmConverterFNc                 K   s   |dur||d< |S )z
        Hook used when converting directly from a SentencePiece model without a slow tokenizer instance.
        By default, return kwargs unchanged.
        Nrj   rW   )r  rj   r   rW   rW   rX   convert_from_spm~  s   zSpmConverter.convert_from_spmc                    s   t | d t j|  t }| }t| jjd}||	  W d    n1 s+w   Y  || _
| j
jjrB| jsDtd d S d S d S )NrP   r   a  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr   rY   r   r   ra   
vocab_filer   r   r   r   r6  handle_byte_fallbackwarningswarn)r   argsr   r   r   	__class__rW   rX   r     s   
zSpmConverter.__init__c                 C      dd |j D S )Nc                 S   r   rW   r   r   rW   rW   rX   rx     rl   z&SpmConverter.vocab.<locals>.<listcomp>r   r   r   rW   rW   rX   rj        zSpmConverter.vocabc                 C   s   |j jS r   )r   r   r[  rW   rW   rX   r        zSpmConverter.unk_idc           	   	      s   |j j} |}|dkrtt| | jd}n-|dkrD  jj	
|\}}dd t|D }tt|||j jd jd d}ntd fd	d
t|jD }|dd
 t|dd dD  |S )Nr   r   r6  ro   c                 S   r   rW   rW   r   rW   rW   rX   r     r   z*SpmConverter.tokenizer.<locals>.<dictcomp>Tr   r  r6  r  z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                    8   g | ]\}}|j d v r||j|j dkp|j jv fqS r   r   r   r   r   r   rW   rX   rx     
    
z*SpmConverter.tokenizer.<locals>.<listcomp>c                 S   r   r   r   r   rW   rW   rX   rx     r   c                 S   r   r   rW   rg   rW   rW   rX   rk     r   z(SpmConverter.tokenizer.<locals>.<lambda>rm   )r   r   rj   r   r   r   rS  SpmExtractorra   rR  r   r   r   	unk_piece	Exceptionr   
add_tokensr   )	r   r   r   r   r  _r   	bpe_vocabr   rW   r   rX   r    sF   

zSpmConverter.tokenizerc                 C   sJ   |j j}tjdddttddg}|st|S tt|g| S )NFT)leftr	   {2,}   ▁)normalizer_specprecompiled_charsmapr	   StripReplacer   r>  Precompiledr   r   rm  _normalizersrW   rW   rX   r     s   
zSpmConverter.normalizerc                 C      t || j}tj||dS Nreplacementrb   )rc   ra   r
   	Metaspacer   rv  rZ   rb   rW   rW   rX   r        zSpmConverter.pre_tokenizerc                 C      d S r   rW   r   rW   rW   rX   r        zSpmConverter.post_processorc                 C   rs  rt  )rc   ra   r   rw  rx  rW   rW   rX   r     ry  zSpmConverter.decoderr[   c                 C   s   |  | j}| | j}|d ur||_d}d}t| jdr!| jj}| ||}|d ur.||_| |||_|  }|r>||_|S )Nrk  TrZ   )	r  r   r   r   ra   rZ   r   r   r   )r   r  r   rv  rZ   r   r   rW   rW   rX   r     s    zSpmConverter.convertedr   )r   r   r   rS  r   rc  r   classmethodrP  r   rj   r   r  r   r   r   r   r   r   __classcell__rW   rW   rW  rX   rO  y  s    	2rO  c                   @   $   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   rY  )Nc                 S   2   g | ]}t |jr|j|jfn|j|jd  fqS d   r   r   r   r   rW   rW   rX   rx         $z)AlbertConverter.vocab.<locals>.<listcomp>rZ  r[  rW   rW   rX   rj        zAlbertConverter.vocabc                 C      t ddt ddg}| jjs|t   |t   | jjr)|t   |j	j
}|r7|t | |t tdd t |S Nz``"z''rj  r
  r	   ro  ra   keep_accentsr~   NFKDStripAccentsr   	Lowercaserl  rm  rp  r   r>  r   r   list_normalizersrm  rW   rW   rX   r        


zAlbertConverter.normalizerc                 C   ,   t jddd| jdfd| jdfgdS NrK  rL  rM  rN  r   r   r   ra   r  r   rW   rW   rX   r        zAlbertConverter.post_processorNr   r   r   rj   r   r   rW   rW   rW   rX   r        r  c                   @      e Zd Zdd Zdd ZdS )BarthezConverterc                 C      d}|S Nr   rW   r   r   r   rW   rW   rX   r   *     zBarthezConverter.unk_idc                 C   r  Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r  r   rW   rW   rX   r   .  r  zBarthezConverter.post_processorN)r   r   r   r   r   rW   rW   rW   rX   r  )  s    r  c                   @   2   e Zd Zdd Zdd Zdd Zed
dd	ZdS )CamembertConverterc                 C   2   g d}|dd |j dd  D 7 }|dg7 }|S )N)z
<s>NOTUSED        <pad>r  z</s>NOTUSEDr  <unk>r  )<unk>NOTUSEDic                 S   r   rW   r   r   rW   rW   rX   rx   C  rl   z,CamembertConverter.vocab.<locals>.<listcomp>r   <mask>r  rZ  r   r   rj   rW   rW   rX   rj   :  s   
zCamembertConverter.vocabc                 C      dS r  rW   r[  rW   rW   rX   r   G     zCamembertConverter.unk_idc                 C   r  r  r  r   rW   rW   rX   r   K  r  z!CamembertConverter.post_processorNc                 K   s~   t |dd}t |dd}t |dd}d|dfd	|dfd
g}|d ur2|t|dd   ||df ||d< |S )N	pad_tokenr  r   r  
mask_tokenr  r  r  r  )r        Yr   rj   r   r   r   r   r~   )r  rj   r   r  r   r  
vocab_listrW   rW   rX   rP  U  s   z#CamembertConverter.convert_from_spmr   r   r   r   rj   r   r   r|  rP  rW   rW   rW   rX   r  9  s    
r  c                   @   r~  )DebertaV2Converterc                 C   sH   g }| j jr|tjdd t|| j }|tj||d t|S )Nr7  )r9  ru  )ra   split_by_punctr~   r
   Punctuationrc   rw  r>  )r   rv  rZ   list_pretokenizersrb   rW   rW   rX   r   j  s   
z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|r"|t| |t	t
dd t|S )Nrj  r
  )ra   r   r~   r	   r  rn  rl  rm  rp  ro  r   r>  r  rW   rW   rX   r   r  s   
zDebertaV2Converter.normalizerc                 C   r  r  r  r   rW   rW   rX   r     r  z!DebertaV2Converter.post_processorN)r   r   r   r   r   r   rW   rW   rW   rX   r  i  s    r  c                   @   r  )MBartConverterc                 C   >   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )Nr  r  r  r  r  r  c                 S   r   rW   r   r   rW   rW   rX   rx     rl   z(MBartConverter.vocab.<locals>.<listcomp>r   )r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r    r  r!   r  r"   r  r#   r  r$   r  r%   r  r&   r  r'   r  r(   r  r)   r  r*   r  r+   r  r,   r  r-   r  r.   r  r  rZ  r  rW   rW   rX   rj     s
   
zMBartConverter.vocabc                 C   r  r  rW   r[  rW   rW   rX   r     r{  zMBartConverter.unk_idc                 C   r  )Nz$A </s> en_XXz$A $B </s> en_XXr   r  r   r  r   rW   rW   rX   r     r  zMBartConverter.post_processorNc           	      K      t |dd}t |dd}t |dd}t |dd}t |d	d
}|df|df|df|dfg}|d urE|t|dd   |dd tD  ||df ||d< |S )Nr+  r  r  r  	eos_tokenr  r   r  r  r  r  r   c                 s       | ]}|d fV  qdS r  NrW   rw   	lang_coderW   rW   rX   	<genexpr>      z2MBartConverter.convert_from_spm.<locals>.<genexpr>rj   )r   r   r   r   MBART_LANGUAGESr~   	r  rj   r   r+  r  r  r   r  r  rW   rW   rX   rP        zMBartConverter.convert_from_spmr   r  rW   rW   rW   rX   r    s    &
r  c                   @   r  )MBart50Converterc                 C   r  )Nr  c                 S   r   rW   r   r   rW   rW   rX   rx     rl   z*MBart50Converter.vocab.<locals>.<listcomp>r   )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )r/   r  )r0   r  )r1   r  )r2   r  )r3   r  )r4   r  )r5   r  )r6   r  )r7   r  )r8   r  )r9   r  )r:   r  )r;   r  )r<   r  )r=   r  )r>   r  )r?   r  )r@   r  )rA   r  )rB   r  )rC   r  )rD   r  )rE   r  )rF   r  )rG   r  )rH   r  )rI   r  r  rZ  r  rW   rW   rX   rj     s
   
zMBart50Converter.vocabc                 C   r  r  rW   r[  rW   rW   rX   r     r{  zMBart50Converter.unk_idc                 C   r  )Nzen_XX $A </s>zen_XX $A $B </s>r   r  r   r  r   rW   rW   rX   r     r  zMBart50Converter.post_processorNc           	      K   r  )Nr   r  r  r  r  r  r   r  r  r  r  r   c                 s   r  r  rW   r  rW   rW   rX   r    r  z4MBart50Converter.convert_from_spm.<locals>.<genexpr>rj   )r   r   r   r   MBART50_LANGUAGESr~   )	r  rj   r   r   r  r  r   r  r  rW   rW   rX   rP    r  z!MBart50Converter.convert_from_spmr   r  rW   rW   rW   rX   r    s    
r  c                   @   r  )NllbConverterc                 C   (   g d}|dd |j dd  D 7 }|S )Nr  c                 S   r   rW   r   r   rW   rW   rX   rx     rl   z'NllbConverter.vocab.<locals>.<listcomp>r   rZ  r  rW   rW   rX   rj        zNllbConverter.vocabc                 C   r  r  rW   r[  rW   rW   rX   r     r{  zNllbConverter.unk_idc                 C   r  )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   r  r   rW   rW   rX   r     r  zNllbConverter.post_processorNc           
      K   s   t |dd}t |dd}t |dd}t |dd}|d	|d
|d|di}|d urNt|tr7| ndd |D }|D ]}	|	|v rGq@t|||	< q@||d< |S )Nr+  r  r  r  r  r  r   r  r   r   ro   r   c                 S   s   g | ]\}}|qS rW   rW   )rw   tokrg  rW   rW   rX   rx   *      z2NllbConverter.convert_from_spm.<locals>.<listcomp>rj   )r   r   
isinstancer{   r  rp   )
r  rj   r   r+  r  r  r   reordered_vocabtokensr   rW   rW   rX   rP    s"    zNllbConverter.convert_from_spmr   r  rW   rW   rW   rX   r    s    

r  c                   @   r~  )SeamlessM4TConverterc                 C   r  )N)r  r  r  r  c                 S   r   rW   r   r   rW   rW   rX   rx   ;  rl   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>r   rZ  r  rW   rW   rX   rj   4  r  zSeamlessM4TConverter.vocabc                 C   s   | j jS r   )ra   unk_token_idr[  rW   rW   rX   r   >  r]  zSeamlessM4TConverter.unk_idc                 C   r  )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   r  r   rW   rW   rX   r   A  r  z#SeamlessM4TConverter.post_processorNr   r   r   rj   r   r   rW   rW   rW   rX   r  3  s    
r  c                   @   r  )XLMRobertaConverterc                 C   r  )Nr  c                 S   r   rW   r   r   rW   rW   rX   rx   T  rl   z-XLMRobertaConverter.vocab.<locals>.<listcomp>r   r  rZ  r  rW   rW   rX   rj   M  s   
zXLMRobertaConverter.vocabc                 C   r  r  rW   r  rW   rW   rX   r   X  r  zXLMRobertaConverter.unk_idc                 C   r  r  r  r   rW   rW   rX   r   \  r  z"XLMRobertaConverter.post_processorNc           	      K   s   t |dd}t |dd}t |dd}t |dd}t |d	d
}|df|df|df|dfg}|d urE|t|dd   ||df ||d< |S )Nr+  r  r  r  r  r  r   r  r  r  r  r   rj   r  r  rW   rW   rX   rP  f  s   z$XLMRobertaConverter.convert_from_spmr   r  rW   rW   rW   rX   r  L  s    
r  c                   @   r~  )XLNetConverterc                 C   rY  )Nc                 S   r  r  r  r   rW   rW   rX   rx   }  r  z(XLNetConverter.vocab.<locals>.<listcomp>rZ  r[  rW   rW   rX   rj   |  r  zXLNetConverter.vocabc                 C   r  r  r  r  rW   rW   rX   r     r  zXLNetConverter.normalizerc                 C   r  )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r  r   rW   rW   rX   r     r  zXLNetConverter.post_processorNr  rW   rW   rW   rX   r  {  r  r  c                   @      e Zd ZdS )ReformerConverterNr   r   r   rW   rW   rW   rX   r        r  c                   @   r  )RemBertConverterc                 C   s   t ddt ddt tddg}| jjs%|t   |t   | jjr0|t 	  |j
j}|r>|t | t |S r  )r	   ro  r   ra   r  r~   r  r  r   r  rl  rm  rp  r>  r  rW   rW   rX   r     s   


zRemBertConverter.normalizerc                 C   r  r  r  r   rW   rW   rX   r     r  zRemBertConverter.post_processorN)r   r   r   r   r   rW   rW   rW   rX   r    s    r  c                   @   r  )BertGenerationConverterNr  rW   rW   rW   rX   r    r  r  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd ur|| j jdfg7 }| j jd ur2| j j| j jk r2|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )Nr  c                 S      g | ]
}d | ddfqS )z<unk_>r  rW   rw   r   rW   rW   rX   rx     ry   z*PegasusConverter.vocab.<locals>.<listcomp>ro   c                 S   r   rW   r   r   rW   rW   rX   rx     rl   )	ra   r  r  mask_token_sentr  mask_token_idoffsetr}   r   r  rW   rW   rX   rj     s   

zPegasusConverter.vocabc                 C   s   |j j| jj S r   )r   r   ra   r   r[  rW   rW   rX   r     r\  zPegasusConverter.unk_idc                 C   s(   t || j}tt tj||dgS rt  )rc   ra   r
   r>  WhitespaceSplitrw  rx  rW   rW   rX   r     s   zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Br   )ra   r  eos_token_idr   r   )r   eosr   rW   rW   rX   r     s   
zPegasusConverter.post_processorN)r   r   r   rj   r   r   r   rW   rW   rW   rX   r    s
    	r  c                   @   s*   e Zd Zdd Zdd ZedddZdS )	T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   r   rW   r   r   rW   rW   rX   rx     rl   z%T5Converter.vocab.<locals>.<listcomp>c                 S   r  )
<extra_id_r  r  rW   r  rW   rW   rX   rx     ry   r   r   )ra   
_extra_idsr   r}   )r   r   num_extra_idsrj   rW   rW   rX   rj     s   zT5Converter.vocabc                 C   &   t jddgg dd| jdfgdS Nr  r  )r  r  r  r  r   r  r   rW   rW   rX   r        zT5Converter.post_processorNc                 K   sf   | dd}dd t|d ddD }|d urt|ng }|dd |D  |d	| ||d
< |S )N	extra_idsr  c                 S   s   g | ]}d | dqS )r  r  rW   r  rW   rW   rX   rx     rl   z0T5Converter.convert_from_spm.<locals>.<listcomp>r   r   c                 s   r  r  rW   rw   r   rW   rW   rX   r    r  z/T5Converter.convert_from_spm.<locals>.<genexpr>r   rj   )r   r}   r   r   
setdefault)r  rj   r   r  extra_tokensr  rW   rW   rX   rP     s   zT5Converter.convert_from_spmr   )r   r   r   rj   r   r|  rP  rW   rW   rW   rX   r    s
    	r  c                   @      e Zd Zdd ZdS )UdopConverterc                 C   r
  r  r  r   rW   rW   rX   r     r  zUdopConverter.post_processorNr   r   r   r   rW   rW   rW   rX   r        r  c                   @   r   )WhisperConverterr[   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fgt||d|_|S )NrJ   Fr$  r&  r
  c                 S   s   g | ]}| d qS )r   rW   r  rW   rW   rX   rx   .      z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 r   z $A:0 $B:1 r   r   )ra   r  r   r  r  r   r   r
   r*  rZ   r   r   r   prefix_tokensconvert_ids_to_tokensr  r  joinr   r   zipr   )	r   rj   r   r  prefix_token_idsprefixesr  r  prefix_templaterW   rW   rX   r     s8   
	zWhisperConverter.convertedNr  rW   rW   rW   rX   r    r  r  c                   @   r  )BigBirdConverterc                 C   r  r  r  r   rW   rW   rX   r   <  r  zBigBirdConverter.post_processorNr  rW   rW   rW   rX   r  ;  r  r  c                   @   r   )CLIPConverterr[   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )NrJ   r  Frj   r   r  r%  r  r  r   z\s+r
  z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr8  r&  rA  )ra   r  r   r  r  r   r   r   r   r	   r>  r=  ro  r   r  r   r
   r?  r*  r   r   r   r   rB  r  r  r+  r,  r   r!  rW   rW   rX   r   H  sD   


zCLIPConverter.convertedNr  rW   rW   rW   rX   r  G  r  r  c                   @   r   )LayoutLMv2Converterr[   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nr   FTr   r   r   r   r   r   r   r   r   r   r   rW   rW   rX   r   s  r  zLayoutLMv2Converter.convertedNr  rW   rW   rW   rX   r"  r  r  r"  c                   @   r   )BlenderbotConverterr[   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )NrJ   Fr$  r&  z$A:0 r   )r   r   )ra   r  r   r  r  r   r   r
   r*  rZ   r   r   r   r   r   r  r  r   rC  rW   rW   rX   r     s*   

zBlenderbotConverter.convertedNr  rW   rW   rW   rX   r#    r  r#  c                   @   r~  )XGLMConverterc                 C   s4   g d}|dd |j dd  D 7 }|g d7 }|S )Nr  c                 S   r   rW   r   r   rW   rW   rX   rx     rl   z'XGLMConverter.vocab.<locals>.<listcomp>r   ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  rZ  r  rW   rW   rX   rj     s   zXGLMConverter.vocabc                 C   r  r  rW   r  rW   rW   rX   r     r  zXGLMConverter.unk_idc                 C   r  )Nz</s> $Az</s> $A </s> </s> $Br  r  r   r  r   rW   rW   rX   r     r  zXGLMConverter.post_processorNr  rW   rW   rW   rX   r$    s    r$  c                   @   sF   e Zd ZdZeZddhZ	 dd Zdd Zdd	 Z	d
d Z
dd ZdS )GemmaConverterTz<start_of_turn>z<end_of_turn>c                 C      t ddS Nr
  rk  )r	   ro  r[  rW   rW   rX   r        zGemmaConverter.normalizerc                 C   s|   | j jdf| j jdf| j jdfg}|dd |jdd  D 7 }tdd |D s<tdd t|D d }|d ur<d||< |S )	Nr  c                 S   r   rW   r   r   rW   rW   rX   rx     rl   z(GemmaConverter.vocab.<locals>.<listcomp>r   c                 s   s    | ]	}|d  dkV  qdS )r   r   NrW   )rw   rh   rW   rW   rX   r    s    z'GemmaConverter.vocab.<locals>.<genexpr>c                 s   s$    | ]\}}|d  dkr|V  qdS )r   r   NrW   )rw   r   rh   rW   rW   rX   r    s   " )r   r  )ra   r  r  r+  r   anynextr   )r   r   rj   override_indexrW   rW   rX   rj     s   


zGemmaConverter.vocabc                 C   r&  )Nr
  merged_with_previous)r
   r?  r   rv  rZ   rW   rW   rX   r     r(  zGemmaConverter.pre_tokenizerc                 C   r  r  rW   r  rW   rW   rX   r     r  zGemmaConverter.unk_idc                 C   s    t t ddt  t  gS )Nrk  r
  )r   r>  ro  ByteFallbackFuser-  rW   rW   rX   r     s   
zGemmaConverter.decoderN)r   r   r   rS  r   rc  r   r   rj   r   r   r   rW   rW   rW   rX   r%    s    
r%  c                   @   s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )LlamaConverterTc                 C   sN   | j ddf| j ddf| j ddfg}|dd |jdd  D 7 }|S )Nr   r  r   ro   c                 S   r   rW   r   r   rW   rW   rX   rx     rl   z(LlamaConverter.vocab.<locals>.<listcomp>r   )ra   r  r   r  rW   rW   rX   rj     s   zLlamaConverter.vocabc                 C   r  r   rW   r  rW   rW   rX   r     r  zLlamaConverter.unk_idc                 C   <   t ddt  t  g}|r|t jdddg7 }t |S Nrk  r
  r   )contentri  r   ro  r.  r/  rn  r>  r   rv  rZ   sequencerW   rW   rX   r        

zLlamaConverter.decoderc                 C   sT   t | jddr(g }t | jddr|tjddg7 }|tjdddg7 }t|S d S )Nr]   TrZ   rk  )prependr
  )patternr3  )r`   ra   r	   Prependro  r>  )r   r   r6  rW   rW   rX   r     s   
zLlamaConverter.normalizerc                 C   s.   t | jddst|| j}tj||ddS d S )Nr]   TFrv  rb   split)r`   ra   rc   r
   rw  rx  rW   rW   rX   r   (  s   zLlamaConverter.pre_tokenizerc                 C   rz  r   rW   r   rW   rW   rX   r   .  r  zLlamaConverter.post_processorN)
r   r   r   rS  rj   r   r   r   r   r   rW   rW   rW   rX   r0    s    	
	r0  c                   @   r   )MarkupLMConverterr[   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )NrJ   Fr   r&  z $A z $B r   )ra   r  r   r  r  r   r   r   r
   r*  rZ   r   r   r   r   r   r   r   r   r   r   r   )	r   rD  rj   r   r  r  r  r   r   rW   rW   rX   r   4  s8   
	zMarkupLMConverter.convertedNr  rW   rW   rW   rX   r=  3  r  r=  c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )MoshiConverterTc                 K   f   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S NrP   r   	r   r   r   rY   r   r   r   r   r   r   rR  r   r   r   r   rW   rW   rX   r   \  s   

zMoshiConverter.__init__c                 C   s:   |j j}tddg}|st|S tt|g| S r'  )rl  rm  r	   ro  r>  rp  rq  rW   rW   rX   r   i  s   

zMoshiConverter.normalizerc                 C   r1  r2  r4  r5  rW   rW   rX   r   s  r7  zMoshiConverter.decoderc                 C   s   d}t j||ddS )Nr^   Fr;  )r
   rw  rx  rW   rW   rX   r   }  s   zMoshiConverter.pre_tokenizerN)r   r   r   rS  r   r   r   r   rW   rW   rW   rX   r>  Y  s    

r>  c                   @   sR   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )HeliumConverterTNc                 K   r?  r@  rA  rB  rW   rW   rX   r     s   

zHeliumConverter.__init__c                    s     |}tt| | jd} fddt|jD }|dd t|dd dD  |t	dd	d	d
g |j
ddd |S )Nr^  c                    r`  r   ra  r   r   rW   rX   rx     rb  z-HeliumConverter.tokenizer.<locals>.<listcomp>c                 S   s"   g | ]\}}}t |d |ddqS )FT)r   r   single_wordr   r   rW   rW   rX   rx     s    c                 S   r   r   rW   rg   rW   rW   rX   rk     r   z+HeliumConverter.tokenizer.<locals>.<lambda>rm   
Fr   r  r   )r  pad_id)rj   r   r   r   rS  r   r   rf  r   r   enable_padding)r   r   r   r  r   rW   r   rX   r    s&   

zHeliumConverter.tokenizerc                 C   sB   g }|j D ]}|jdkr|d|jfg7 }q||j|jfg7 }q|S )Nz<0x0A>rE  )r   r   r   )r   r   rj   r   rW   rW   rX   rj     s   

zHeliumConverter.vocabc                 C   r  r   rW   r  rW   rW   rX   r     r  zHeliumConverter.unk_idc                 C   s8   t ddt  t  g}|t jdddg7 }t |S r2  r4  r5  rW   rW   rX   r     s   

zHeliumConverter.decoderc                 C   s   t t dt ddgS r'  )r	   r>  r:  ro  r[  rW   rW   rX   r     s   zHeliumConverter.normalizerc                 C   s   t t ddgS )NrE  
contiguous)r
   r>  r?  r-  rW   rW   rX   r     s   zHeliumConverter.pre_tokenizerc                 C   s   t jddgg ddgdS )Nr  r  )r  r  r  r  )r  r   r   )r   r   r   rW   rW   rX   r     s   zHeliumConverter.post_processorr   )r   r   r   rS  r   r  rj   r   r   r   r   r   rW   rW   rW   rX   rC    s    
		rC  c                   @   s"   e Zd ZdZdddZdd ZdS )ParakeetConverterTNc                 G   sl   || _ t| d t| | t }| }t|d}||  W d    n1 s,w   Y  || _	d S r@  )
rR  r   r   r   rY   r   r   r   r   r   )r   rR  rV  r   r   r   rW   rW   rX   r     s   

zParakeetConverter.__init__c              	      s     |}  j|\}}dd t|D }tt|||jjd j	d d} fddt|j
D }|dd t|dd	 d
D  |S )Nc                 S   r   rW   rW   r   rW   rW   rX   r     r   z/ParakeetConverter.tokenizer.<locals>.<dictcomp>Tr_  c                    r`  r   ra  r   r   rW   rX   rx     rb  z/ParakeetConverter.tokenizer.<locals>.<listcomp>c                 S   r   r   r   r   rW   rW   rX   rx     r   c                 S   r   r   rW   rg   rW   rW   rX   rk     r   z-ParakeetConverter.tokenizer.<locals>.<lambda>rm   )rj   rc  rR  r   r   r   r   r   rd  rS  r   rf  r   )r   r   r   rg  r   rh  r  r   rW   r   rX   r    s,   

zParakeetConverter.tokenizerr   )r   r   r   rS  r   r  rW   rW   rW   rX   rI    s    
rI  c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS rW   )chr)rw   nrW   rW   rX   rx     r  z$bytes_to_unicode.<locals>.<listcomp>)r   r}   ordr~   r{   r  )bscsrR  brW   rW   rX   bytes_to_unicode  s   L
rW  c                   @   sF   e Zd ZdZ				dddZdefdd	Zd
d ZdefddZ	dS )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                 K   4   || _ || _|| _t|tr| | _d S || _d S r   )rR  r9  rZ   r  r{   r  extra_special_tokens)r   rR  r9  rZ   r[  r   rW   rW   rX   r   (  s   zTikTokenConverter.__init__tiktoken_urlc                    s   zddl m} W n ty   tdw || t fddg }i }  D ]P\}}|||< t|dkr:q)g }tdt|D ]%}|d | ||d  }	}
|	 v rh|
 v rh|	|
  v rh||	|
|f qCt	| fddd	d
}|
| q)t	|dd d	d
}fdd|D }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                       d  fdd| dD S )NrJ   c                       g | ]} t | qS rW   rS  rw   charbyte_encoderrW   rX   rx   C  rl   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>latin-1r  decoderV  rc  rW   rX   token_bytes_to_stringB  s   zPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringr   c                    re   rf   rW   rg   )r  rW   rX   rk   P  rl   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Frs   c                 S   r   Nro   rW   rq   rW   rW   rX   rk   R  r   c                    $   g | ]} |d   |d fqS ru   rW   rv   ri  rW   rX   rx   S     $ zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)tiktoken.loadr]  re  
ValueErrorrW  r|   rp   r}   r~   r   r   )r   r\  r]  r   rj   r   rankr   r   r   r   rW   )r  rd  ri  rX   extract_vocab_merges_from_model7  s6   z1TikTokenConverter.extract_vocab_merges_from_modelc                 C   :   |  | j\}}tt||dd}t|jdrd|j_|S NF)r  ignore_mergesTrq  rR  r   r   r   r   rt  r   r   r   r  rW   rW   rX   r  V  
   zTikTokenConverter.tokenizerr[   c                 C   sr   |   }ttjt| jdddtj| jddg|_t	 |_
| jd ur0|dd | jD  tjdd|_|S )Nr7  Fr8  r;  c                 S   s   g | ]	}t |d ddqS )FTr   r   r  rW   rW   rX   rx   i  r   z/TikTokenConverter.converted.<locals>.<listcomp>r(  )r  r
   r>  r?  r   r9  r*  rZ   r   r   r   r[  r  r   r   r   r  rW   rW   rX   r   ]  s   

zTikTokenConverter.convertedNrY  FN)
r   r   r   r   r   r   rq  r  r   r   rW   rW   rW   rX   rX  #  s    
rX  c                   @   sB   e Zd Z				dddZdefddZd	d
 ZdefddZdS )MistralConverterNrY  Fc                 K   rZ  r   )rR  r9  rZ   r  r{   r  r   )r   rR  r9  rZ   r   r   rW   rW   rX   r   r  s   zMistralConverter.__init__r\  c                    s  dd l  dd l}t| jddd}||}W d    n1 s w   Y  |d d | _dd |d	 D | _|d
 }t tfddg }i }t	| jD ]	\}}	|||	j
< qN fdd|D }t|}
dd t	|D t	t|ddD ]P\}}	|||	< t|	dkrqvg }tdt|	D ]%}|	d | |	|d  }}||
v r||
v r|| |
v r||||f qt|fdddd}|| qvt|dd dd}fdd|D }||fS )Nr   rzutf-8)encodingconfigr9  c                 S   s    g | ]}t |d  |d dqS )	token_str
is_control)r   r   rw   krW   rW   rX   rx     s    zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>r   rj   c                    r^  )NrJ   c                    r_  rW   r`  ra  rc  rW   rX   rx     rl   zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>re  rf  rh  rc  rW   rX   ri    s   zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringc                    s   g | ]	}  |d  qS )token_bytes)	b64decoder  )base64rW   rX   rx     r   c                 S   s   i | ]\}}||qS rW   rW   )rw   rp  r   rW   rW   rX   r     r  zDMistralConverter.extract_vocab_merges_from_model.<locals>.<dictcomp>z(Converting tekken.json to tokenizer.json)descr   c                    re   rf   rW   rg   )token_to_rankrW   rX   rk     rl   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>Frs   c                 S   r   rj  rW   rq   rW   rW   rX   rk     r   c                    rk  ru   rW   rv   rl  rW   rX   rx     rm  )r  jsonr   rR  loadr9  r   rW  r   r   r3  rz   r   rp   r}   r~   r   r   )r   r\  r  r   untypedr  r   rj   idxr   rank_setrp  r   r   r   r   rW   )r  rd  ri  r  rX   rq    sF   z0MistralConverter.extract_vocab_merges_from_modelc                 C   rr  rs  ru  rv  rW   rW   rX   r    rw  zMistralConverter.tokenizerr[   c                 C   s^   |   }ttjt| jdddtj| jddg|_t	 |_
|| j tjdd|_|S )Nr7  Fr8  r;  r(  )r  r
   r>  r?  r   r9  r*  rZ   r   r   r   rf  r   r   r   rx  rW   rW   rX   r     s   
zMistralConverter.convertedry  )	r   r   r   r   r   rq  r  r   r   rW   rW   rW   rX   rz  q  s    
'rz  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerReformerTokenizerRemBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizerSplinterTokenizerXGLMTokenizer)LlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3TokenizerFc                 C   s   | j j}|tv r|st| }||  S | jdr)| | _td t	| j S ztd t
| j| jd W S  tyK   tdtt  w )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    ztekken.jsonz#Converting from Mistral tekken.jsonzConverting from Tiktoken)rR  r[  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )rX  r   SLOW_TO_FAST_CONVERTERSr   rR  endswithra   loggerinforz  rX  r[  re  ro  r   r  )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classrW   rW   rX   convert_slow_tokenizer  s.   


r  )rJ   r   )F)Wr   rT  collections.abcr   	functoolsr   	packagingr   
tokenizersr   r   r   r   r	   r
   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerr   r  r  r  rY   boolr   rc   r   r   r   r   r   r   r  r  r  r  r#  r1  r5  r@  rE  rJ  rO  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r#  r$  r%  r0  r=  r>  rC  rI  rW  rX  rz  r  r  rW   rW   rW   rX   <module>   sZ  $


4'2''&,' %0!J0//% ($+'4.&)Y0NP	
 !"#$%&'()*+,-./01234;