o
    ‚›i{x  ã                   @   s  d Z ddlmZ ddlZddlmZmZmZmZm	Z	 ddl
mZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ e e¡Zi ddddœ“ddddddddddddœ
“ddddddddddddœ
“ddddddddddddœ
“dddddddddddddd œ“d!ddddddddddd"d#œ“d$dddddddddddœ
“d%dddddddddddddd&œ“d'dddddddddddœ
“d(d)d*d+d,d-œ“d.dddddddddddœ
“d/d0dd1dd2d3œ“d4d5d6d7d8d9d:dd2d;d<dd=œ“d>dddddddd?dd@œ	“dAd0dBdCdDd1d2dEœ“dFdddddddGdHœ“dIdddd2ddJdKdLddMœ	“dddddddddNddœ
dddddddddddOddPœdddddddddddOddPœd5d6d7d8d9d:dd2d;d<dd=œdddddddddddœ
dQœ¥ZdRdSdTdUdVd)d*d+d,dWdXœ
dYdd)d*d+d,dZœd[œZd%d\d]iiZd^d_„ ZG d`da„ daƒZ G dbdc„ dceƒZ!G ddde„ deeƒZ"G dfdg„ dgeƒZ#G dhdi„ dieƒZ$G djdk„ dkeƒZ%G dldm„ dmeƒZ&i de!“de"“de"“d$e"“d%e"“d.e#“d/e$“d'e$“d>e$“dAe$“dFe$“d4e%“dIe$“dne$“doe&“dpe&“dqe%“e!e!drœ¥Z'dse(dte)ee*f fdudv„Z+dS )wz
Integration with GGML / The file is copied and adapted from https://github.com/99991/pygguf
with extra methods beings exposed
é    )ÚarrayN)Ú	TokenizerÚdecodersÚnormalizersÚpre_tokenizersÚ
processors)ÚBPEÚUnigramé   ©Ú
AddedToken)ÚGemmaConverterÚGPT2ConverterÚLlamaConverterÚQwen2ConverterÚT5Converter)Úlogging)ÚtqdmÚgeneralÚ
model_typeÚ_model_name_or_path)ÚarchitectureÚnameÚllamaÚmax_position_embeddingsÚnum_hidden_layersÚintermediate_sizeÚhidden_sizeÚhead_dimÚ
rope_thetaÚnum_attention_headsÚnum_key_value_headsÚrms_norm_epsÚ
vocab_size)
Úcontext_lengthÚblock_countÚfeed_forward_lengthÚembedding_lengthúrope.dimension_countúrope.freq_baseúattention.head_countúattention.head_count_kvú attention.layer_norm_rms_epsilonr#   ÚmistralÚqwen2Ú	qwen2_moeÚnum_expertsÚnum_experts_per_tok)r$   r%   r&   r'   r(   r)   r*   r+   r,   r#   Úexpert_countÚexpert_used_countÚlfm2Úconv_L_cache)r$   r%   r&   r'   r(   r)   r*   r+   r,   r#   zshortconv.l_cacheÚqwen3Ú	qwen3_moe)r$   r%   r&   r'   r(   r)   úattention.key_lengthr*   r+   r,   r#   r2   r3   ÚfalconÚ	tokenizerÚbos_token_idÚeos_token_idÚunk_token_idÚpad_token_id)úggml.bos_token_idúggml.eos_token_idúggml.unknown_token_idúggml.padding_token_idÚphi3ÚbloomÚn_layerÚn_headÚlayer_norm_epsilon)r%   r'   r*   r#   úattention.layer_norm_epsilonÚt5Ún_positionsÚ
num_layersÚd_ffÚd_modelÚd_kvÚ	num_headsÚrelative_attention_num_bucketsÚdecoder_start_token_id)r$   r%   r&   r'   r8   r*   r+   rH   z attention.relative_buckets_countrQ   r#   ÚstablelmÚlayer_norm_eps)	r$   r%   r&   r'   r(   r*   r+   rH   r#   Úgpt2Ún_ctxÚn_embdr&   )r%   r$   r'   r&   r*   rH   Ú
starcoder2Únorm_epsilon)r%   r$   r'   r&   r*   r+   rH   ÚmambaÚconv_kernelÚ
state_sizeÚtime_step_rank)	r#   r$   r'   r,   r%   zssm.conv_kernelzssm.state_sizezssm.time_step_rankzssm.inner_sizeÚnorm_epsÚsliding_window)r$   r%   r&   r'   r(   r)   r8   r*   r+   r,   zattention.sliding_windowr#   )ÚnemotronÚgemma2Úgemma3Úumt5ÚdeciÚtokenizer_typeÚtokensÚscoresÚ
token_typeÚmergesÚadd_prefix_space)
ú
ggml.modelzggml.tokenszggml.scoreszggml.token_typezggml.mergesr?   r@   rA   rB   zggml.add_space_prefixÚchat_template)rk   rj   r?   r@   rA   rB   )r:   Útokenizer_configÚnorm_topk_probTc                 C   sÆ   t |tƒs|g}t|ƒdkr|d }d }n|d dkrtdƒ‚|\}}|dv r/t| d ƒ} | S |dv r;t| d ƒ} | S |dkrGt| d ƒ} | S |dkrXtd	t| ƒƒ ¡  	¡ } | S |dkrat
| |ƒ} | S )
Né   r   é	   zPReceived multiple types, therefore expected the first type to indicate an array.)r   rn   r
   é   é   é   é
   é   )é   é   é   é   ÚB)Ú
isinstanceÚlistÚlenÚ
ValueErrorÚintÚfloatÚboolr   ÚtobytesÚdecodeÚ_gguf_parse_value)Ú_valueÚ	data_typeÚarray_data_type© r‡   ú`/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/ggml.pyrƒ   F  s.   
	øúüþ
rƒ   c                   @   s   e Zd Zdd„ ZdS )ÚGGUFTokenizerSkeletonc                    s¢  |  ¡ D ]
\}}t| ||ƒ qt| dƒs’t| dƒrt| dƒs"tdƒ‚| j}| j‰ ‡ fdd„t|ƒD ƒ‰t d¡ g }t	ˆ  ¡ ƒD ]=\}}g }t
dt|ƒƒD ]}	|d |	… ||	d … }
}|
|v rl||v rl| |
||f¡ qMt|‡fd	d
„dd}| |¡ q@t|dd
„ dd}dd„ |D ƒ}|| _ndd„ | jD ƒ| _t| dƒs­dd„ t
t| jƒƒD ƒ| _t| dƒsµg | _t| dƒs½d | _t| dƒrÍ| jd u rÏ| j| _d S d S d S )Nrh   re   rf   z\tokens and scores need to be passed for a LLaMa tokenizer without merges to be instantiated.c                    s   i | ]	\}}|ˆ | “qS r‡   r‡   )Ú.0ÚiÚt)rf   r‡   rˆ   Ú
<dictcomp>j  ó    z2GGUFTokenizerSkeleton.__init__.<locals>.<dictcomp>z:Merges were not in checkpoint, building merges on the fly.rn   c                    s   ˆ | d  ˆ | d  fS )Nr   rn   r‡   )Úx)Úvocabr‡   rˆ   Ú<lambda>t  s    z0GGUFTokenizerSkeleton.__init__.<locals>.<lambda>T)ÚkeyÚreversec                 S   s   | d S )Nr
   r‡   )Úvalr‡   r‡   rˆ   r‘   v  s    c                 S   s   g | ]
}|d  |d f‘qS )r   rn   r‡   )rŠ   r”   r‡   r‡   rˆ   Ú
<listcomp>w  s    z2GGUFTokenizerSkeleton.__init__.<locals>.<listcomp>c                 S   s   g | ]	}t | d ¡ƒ‘qS )ú )ÚtupleÚsplit)rŠ   Úmerger‡   r‡   rˆ   r•   z  rŽ   c                 S   s   g | ]}d ‘qS ©Nr‡   )rŠ   Ú_r‡   r‡   rˆ   r•   |  s    Úadded_tokensr=   Úunknown_token_id)ÚitemsÚsetattrÚhasattrr}   re   rf   Ú	enumerateÚloggerÚwarningr   Úranger|   ÚappendÚsortedÚextendrh   rœ   r=   r   )ÚselfÚdict_ÚkÚvre   rh   r™   Úpiece_scoreÚlocalÚindexÚpiece_lÚpiece_rr‡   )rf   r   rˆ   Ú__init___  sD   
ÿ
€


ÿzGGUFTokenizerSkeleton.__init__N)Ú__name__Ú
__module__Ú__qualname__r±   r‡   r‡   r‡   rˆ   r‰   ^  s    r‰   c                   @   s<   e Zd Zdd„ Zdd„ Zdd„ Zdd„ Zd	d
„ Zdd„ ZdS )ÚGGUFLlamaConverterc                 C   s0   t |ƒ| _| j| _i | _t| jddƒdk| _d S )Nrd   r   )r‰   ÚprotoÚoriginal_tokenizerÚadditional_kwargsÚgetattrÚis_llama_3_tokenizer©r¨   Útokenizer_dictr‡   r‡   rˆ   r±   Š  s   
zGGUFLlamaConverter.__init__c                 C   ó   t t|j|jƒƒS rš   ©r{   Úzipre   rf   ©r¨   r¶   r‡   r‡   rˆ   r     ó   zGGUFLlamaConverter.vocabc                 C   ó   |j S rš   ©rh   rÀ   r‡   r‡   rˆ   rh   “  ó   zGGUFLlamaConverter.mergesc                 C   sÎ  |   | j¡}|  | j¡}dd„ t|ƒD ƒ}|jd ur |j|j nd }t|dd ƒd ur0|j|j nd }t|dd ƒd ur@|j|j nd }tt	|||dddƒ}g }	t
| jdƒs€|d urc|	 t|ddd	¡ |d urq|	 t|ddd	¡ |d ur|	 t|ddd	¡ n!t t | jj¡d
k¡d }
|
D ]}|	 t| jj| ddd	¡ qt|	ƒdkr¬| |	¡ t| jjƒdkrÀ| dd„ | jjD ƒ¡ || jd< || jd< || jd< | jråd | jd< d| jd< d| jd< d| j_|S )Nc                 S   ó   i | ]	\}\}}||“qS r‡   r‡   ©rŠ   r‹   ÚwordÚ_scorer‡   r‡   rˆ   r   ™  rŽ   z0GGUFLlamaConverter.tokenizer.<locals>.<dictcomp>r;   r<   T)Ú	unk_tokenÚfuse_unkÚbyte_fallbackrg   F©Ú
normalizedÚspecialrp   r   c                 S   s   g | ]	}t |d d d‘qS )FrÌ   r   )rŠ   Úadded_tokenr‡   r‡   rˆ   r•   À  rŽ   z0GGUFLlamaConverter.tokenizer.<locals>.<listcomp>rÉ   Ú	eos_tokenÚ	bos_tokenri   Úclean_up_tokenization_spacesÚlegacy)r   r¶   rh   r¡   r=   re   r¹   r;   r   r   r    r¥   r   ÚnpÚwherer   rg   r|   Úadd_special_tokensrœ   Ú
add_tokensr¸   rº   r·   rÓ   )r¨   r¶   Úvocab_scoresrh   Ú	bpe_vocabrÉ   rÑ   rÐ   r:   Úspecial_tokensÚspecial_tokens_idxÚidxr‡   r‡   rˆ   r:   –  sT     ûÿ
€
ÿ





zGGUFLlamaConverter.tokenizerc                 C   sX   t  ¡ t  ¡ t  dd¡g}| jr|t jddddg7 }|r'|t jdddg7 }t  |¡S )Nõ   â–r–   FT©ri   Útrim_offsetsÚ	use_regexrn   ©ÚcontentÚleft)r   ÚByteFallbackÚFuseÚReplacerº   Ú	ByteLevelÚStripÚSequence©r¨   Úreplacementri   Úsequencer‡   r‡   rˆ   ÚdecoderÐ  s   
ý
zGGUFLlamaConverter.decoderc                 C   s¤   |   | j¡}|  | j¡}|d ur||_d}d}t| jdƒr!| jj}|  ||¡}|d ur.||_|  ||¡|_|  ¡ }|r>||_| j	rPt
jdddd|_t g ¡|_|S )NrÝ   Tri   FrÞ   )r:   r¶   Ú
normalizerr    r·   ri   Úpre_tokenizerrí   Úpost_processorrº   r   rç   r   ré   )r¨   r:   rî   rë   ri   rï   rð   r‡   r‡   rˆ   Ú	convertedÞ  s*   ÿzGGUFLlamaConverter.convertedN)	r²   r³   r´   r±   r   rh   r:   rí   rñ   r‡   r‡   r‡   rˆ   rµ   ‰  s    :rµ   c                       ó*   e Zd Zdd„ Zdef‡ fdd„Z‡  ZS )ÚGGUFQwen2Converterc                 C   ó   t |ƒ| _i | _d S rš   ©r‰   r·   r¸   r»   r‡   r‡   rˆ   r±     ó   

zGGUFQwen2Converter.__init__Úreturnc              	      s^   dd„ t | jjƒD ƒ}| jj}tƒ  ||¡}| tddddtddddtddddg¡ |S )	Nc                 S   ó   i | ]\}}||“qS r‡   r‡   ©rŠ   r‹   rÇ   r‡   r‡   rˆ   r     ó    z0GGUFQwen2Converter.converted.<locals>.<dictcomp>ú<|endoftext|>FTrÌ   z<|im_start|>z
<|im_end|>)r¡   r·   re   rh   Úsuperrñ   rÖ   r   ©r¨   r   rh   r:   ©Ú	__class__r‡   rˆ   rñ     s   ýÿzGGUFQwen2Converter.converted©r²   r³   r´   r±   r   rñ   Ú__classcell__r‡   r‡   rþ   rˆ   ró     ó    ró   c                   @   sB   e Zd Zdd„ Zdd„ Zdd„ Zdd„ Zd	d
„ Zdefdd„Z	dS )ÚGGUFPhi3Converterc                 C   s   t |ƒ| _| j| _i | _d S rš   ©r‰   r¶   r·   r¸   r»   r‡   r‡   rˆ   r±     s   

zGGUFPhi3Converter.__init__c                 C   r½   rš   r¾   rÀ   r‡   r‡   rˆ   r     rÁ   zGGUFPhi3Converter.vocabc                 C   rÂ   rš   rÃ   rÀ   r‡   r‡   rˆ   rh     rÄ   zGGUFPhi3Converter.mergesc                 C   sn  |   | j¡}|  | j¡}dd„ t|ƒD ƒ}tt||ƒƒ}| tddddddtddddtd	dddd
tddddd
tddddd
tddddd
tddddd
tddddd
tddddd
tddddd
tddddd
tddddd
g¡ |jd ur€|j	|j nd | j
d< |jd ur|j	|j nd | j
d< |jd ur |j	|j nd | j
d< |jd ur°|j	|j nd | j
d< |S )Nc                 S   rÅ   r‡   r‡   rÆ   r‡   r‡   rˆ   r   %  rŽ   z/GGUFPhi3Converter.tokenizer.<locals>.<dictcomp>ú</s>TF)ÚrstripÚlstriprÍ   rÎ   rû   rÌ   z<|assistant|>)r  rÍ   rÎ   z<|placeholder1|>z<|placeholder2|>z<|placeholder3|>z<|placeholder4|>z
<|system|>z<|end|>z<|placeholder5|>z<|placeholder6|>z<|user|>rÉ   rÐ   rÑ   Ú	pad_token)r   r¶   rh   r¡   r   r   rÖ   r   r=   re   r¸   r<   r;   r>   )r¨   r¶   rØ   rh   rÙ   r:   r‡   r‡   rˆ   r:   "  s8   ôÿÿÿÿÿzGGUFPhi3Converter.tokenizerc                 C   s<   t  ¡ t  ¡ t  |d¡g}|r|t jdddg7 }t  |¡S )Nr–   rn   rá   )r   rä   rå   ræ   rè   ré   rê   r‡   r‡   rˆ   rí   I  s   
ý
zGGUFPhi3Converter.decoderr÷   c                 C   s:   |   | j¡}d}d}t| jdƒr| jj}|  ||¡|_|S )NrÝ   Tri   )r:   r¶   r    r·   ri   rí   )r¨   r:   rë   ri   r‡   r‡   rˆ   rñ   T  s   zGGUFPhi3Converter.convertedN)
r²   r³   r´   r±   r   rh   r:   rí   r   rñ   r‡   r‡   r‡   rˆ   r    s    'r  c                       rò   )ÚGGUFGPTConverterc                 C   rô   rš   rõ   r»   r‡   r‡   rˆ   r±   b  rö   zGGUFGPTConverter.__init__r÷   c                    s0   dd„ t | jjƒD ƒ}| jj}tƒ  ||¡}|S )Nc                 S   rø   r‡   r‡   rù   r‡   r‡   rˆ   r   g  rú   z.GGUFGPTConverter.converted.<locals>.<dictcomp>)r¡   r·   re   rh   rü   rñ   rý   rþ   r‡   rˆ   rñ   f  s   zGGUFGPTConverter.convertedr   r‡   r‡   rþ   rˆ   r	  a  r  r	  c                   @   ó:   e Zd Zdd„ Zdd„ Zdd„ Zdd„ Zd	efd
d„ZdS )ÚGGUFT5Converterc                 C   s>   dg|d< t |ƒ| _dd„ t| jjƒD ƒ| _| j| _i | _d S )Nú
dummy textrh   c                 S   rø   r‡   r‡   )rŠ   r«   rª   r‡   r‡   rˆ   r   s  rú   z,GGUFT5Converter.__init__.<locals>.<dictcomp>)r‰   r¶   r¡   re   Útoken2idr·   r¸   r»   r‡   r‡   rˆ   r±   n  s
   


zGGUFT5Converter.__init__c                 C   r½   rš   r¾   rÀ   r‡   r‡   rˆ   r   w  rÁ   zGGUFT5Converter.vocabc                 C   sT   t | jddƒr(g }t | jddƒr|tjddg7 }|tjdddg7 }t |¡S d S )NrÓ   Tri   rÝ   )Úprependr–   )Úpatternrâ   )r¹   r·   r   ÚPrependræ   ré   )r¨   r¶   rì   r‡   r‡   rˆ   rî   z  s   
zGGUFT5Converter.normalizerc                 C   s$   t jddgg d¢d| jd fgdS )Nú$Ar  )r  r  z$Br  )ÚsingleÚpairrÚ   )r   ÚTemplateProcessingr  )r¨   r‡   r‡   rˆ   rð   ƒ  s   ÿýzGGUFT5Converter.post_processorr÷   c                 C   s–   |   | j¡}tt|| jjddƒ}|  | j¡}|d ur||_d}d}t| jdƒr,| jj}|  	||¡}|d ur9||_	|  
||¡|_
|  ¡ }|rI||_|S )NF©Úunk_idrË   rÝ   Tri   )r   r¶   r   r	   r=   rî   r    r·   ri   rï   rí   rð   )r¨   rØ   r:   rî   rë   ri   rï   rð   r‡   r‡   rˆ   rñ   Œ  s.   ýÿ	zGGUFT5Converter.convertedN)	r²   r³   r´   r±   r   rî   rð   r   rñ   r‡   r‡   r‡   rˆ   r  m  s    			r  c                   @   r
  )ÚGGUFGemmaConverterc                 C   s&   dg|d< t |ƒ| _| j| _i | _d S )Nr  rh   r  r»   r‡   r‡   rˆ   r±   ­  s   


zGGUFGemmaConverter.__init__c                 C   s‚   t t|j|jƒƒ}g }|D ]1\}}|dkr| d|f¡ qd|v r7t| ¡ ƒdkr7dt|ƒ }| ||f¡ q| ||f¡ q|S )Nz<0x09>ú	r–   r   rÝ   )r{   r¿   re   rf   r¥   r|   Ústrip)r¨   r¶   Úoriginal_vocabÚupdated_vocabÚtokenÚscoreÚunderscoresr‡   r‡   rˆ   r   µ  s   zGGUFGemmaConverter.vocabc                 C   s   t  dd¡S )Nr–   rÝ   )r   ræ   rÀ   r‡   r‡   rˆ   rî   Ä  s   zGGUFGemmaConverter.normalizerc                 C   s<   t  dd¡t  ¡ t  ¡ g}|r|t jdddg7 }t  |¡S )NrÝ   r–   rn   rá   )r   ræ   rä   rå   rè   ré   rê   r‡   r‡   rˆ   rí   Ç  s   
ý
zGGUFGemmaConverter.decoderr÷   c                 C   s†   |   | j¡}tt|| jj| jdƒ}|  | j¡}|d ur||_d}d}t| jdƒr-| jj	}|  
||¡|_
|  ||¡}|d urA||_|S )Nr  rÝ   Tri   )r   r¶   r   r	   r=   Úhandle_byte_fallbackrî   r    r·   ri   rí   rï   )r¨   rØ   r:   rî   rë   ri   rï   r‡   r‡   rˆ   rñ   Ò  s(   ýÿzGGUFGemmaConverter.convertedN)	r²   r³   r´   r±   r   rî   rí   r   rñ   r‡   r‡   r‡   rˆ   r  ¬  s    r  r_   r`   Úgemma3_textrb   )rc   Údecilmr   r÷   c                 C   s"   | }t | |ƒ}| ¡ }||jfS )a6  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        architecture (`str`): The model architecture derived from gguf file.
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    )ÚGGUF_TO_FAST_CONVERTERSrñ   r¸   )r   r¼   Útokenizer_class_nameÚ	converterÚfast_tokenizerr‡   r‡   rˆ   Úconvert_gguf_tokenizer  s   
r&  ),Ú__doc__r   ÚnumpyrÔ   Ú
tokenizersr   r   r   r   r   Útokenizers.modelsr   r	   Ú r   Úconvert_slow_tokenizerr   r   r   r   r   Úutilsr   Úutils.loggingr   Ú
get_loggerr²   r¢   ÚGGUF_CONFIG_MAPPINGÚGGUF_TOKENIZER_MAPPINGÚGGUF_CONFIG_DEFAULTS_MAPPINGrƒ   r‰   rµ   ró   r  r	  r  r  r"  Ústrr—   Údictr&  r‡   r‡   r‡   rˆ   Ú<module>   sÈ  
þÿõûõîöá+ôÕ9õÇFöºRó®aöŸmü“söû õ ù ÷ ì ú á 'ù Ù 0÷ Ð <öòòõö Œ  öúóüÿ
+yK?Aÿþýüûúùø	÷
öõôóòñðïí