o
    i
%                     @  s   d dl mZ d dlmZ ddlmZmZ ddlmZ ddl	m
Z
 ddlmZ e r-d d	lZer5dd
lmZ eeZG dd de
Zd	S )    )annotations)TYPE_CHECKING   )is_torch_availablelogging)
SinqConfig   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                      s   e Zd ZU dZdZded< d, fddZd-d
dZed-ddZ	dd Z
d.ddZd/ddZd0ddZd1dd Zd!d" Zd#d$ Z	%d2d3d(d)Zd4d*d+Z  ZS )5SinqHfQuantizera  
    HF v5 quantizer for SINQ.

    Modes:
      - method="sinq" (default):
          * weight-only SINQ
          * param-level ConversionOps (`SinqQuantize`) during load for pure language models
            (each Linear.weight is turned into a SINQLinear module)
          * module-level quantization after load for multimodal models
      - method="asinq":
          * A-SINQ (activation-aware) SINQ quantization
    Tbool requires_parameters_quantizationquantization_configr   c                   s$   t  j|fi | d | _d| _d S )NF)super__init___normalized_device_str_do_param_level_sinq)selfr   kwargs	__class__ h/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/quantizers/quantizer_sinq.pyr   1   s   
zSinqHfQuantizer.__init__returnc                 C     dS NTr   r   r   r   r   is_serializable7   s   zSinqHfQuantizer.is_serializablec                 C  r   r   r   r   r   r   r   is_trainable:   s   zSinqHfQuantizer.is_trainablec                 C  s@   |d u rt j rdt j i}nddi}td| d |S )N cpuz:The device_map was not initialized. Setting device_map to zJ. If you want to use the model for inference, please set device_map='auto')torchcudais_availablecurrent_deviceloggerinfo)r   
device_mapr   r   r   update_device_map>   s   
z!SinqHfQuantizer.update_device_mapdtypetorch.dtypec                 C  s   |d u rt j}|| _|S N)r"   bfloat16r*   )r   r*   r   r   r   update_dtypeK   s   zSinqHfQuantizer.update_dtypeNonec                 O  s   ddl m} | stdtj std |d}t	|t
r7t| }t|dkr7tdt| d| jjd	krD| jsFtd
d S d S )Nr   )is_sinq_availablezMThe 'sinq' package is not installed. Please install it with: pip install sinqzNo CUDA device is available. Quantization and inference will run on the CPU. Please note that this will significantly slow down inference speed and increase quantization time.r(   r   zkSinqHfQuantizer: multi-GPU device_map detected, but SINQ currently supports only a single CUDA device. Got z. Please use device_map=None.asinqzYou are using `method='asinq'` in the quantization config. Right now the calibrated version of SINQ is not supported in Hugging Face, please refer and use the official SINQ repository `to quantize a model with this method. )utilsr0   ImportErrorr"   r#   r$   r&   warningget
isinstancedictsetvalueslenRuntimeErrorsortedr   methodpre_quantized
ValueError)r   argsr   r0   r(   device_map_valuesr   r   r   validate_environmentQ   s*   


z$SinqHfQuantizer.validate_environmentcfgr7   c              
   C  sJ   ddl m} |j}|t|j|jdurt|jndddddt|j|dS )zI
        Build the dict that SINQLinear expects as quant_config.
        r   )sinq_base_quant_configNFr   )nbits
group_size
quant_zeroquant_scaleview_as_floataxistiling_moder=   )sinq.sinqlinear_hfrD   r=   intrE   rF   strrK   )r   rC   sinq_base_quant_config_fnr=   r   r   r   _build_sinq_quant_dictm   s   z&SinqHfQuantizer._build_sinq_quant_dictmodelr   
param_namerN   c           
      K  sn   ddl m} | jrdS | jjdkrdS | jsdS t||\}}|dkr%dS t||}t|dd}|o4| }	|	S )a-  
        Called per-parameter to decide whether to run `SinqQuantize` on it.

        - If `self.pre_quantized`, we do *not* quantize again (handled by SinqDeserialize instead).
        - For method="asinq": return False (ASINQ is not supported in Hugging Face).
        - For method="sinq": True only for SINQLinear.weight not in modules_to_not_convert.

        Note: After _process_model_before_weight_loading(), the modules are already SINQLinear,
        not nn.Linear. We check for SINQLinear modules that are not yet quantized (ready=False).
        r   )
SINQLinearFr1   weightreadyT)	rL   rS   r>   r   r=   r   r
   r6   getattr)
r   rQ   rR   r   rS   moduletensor_nameis_sinqis_readyresultr   r   r   param_needs_quantization   s   

z(SinqHfQuantizer.param_needs_quantizationc                 C  s   ddl m} || S )z
        Return the ConversionOps used for param-level quantization (Sinq).
        The actual SINQLinear construction is in integrations/sinq.py.
        r   )SinqQuantize)integrations.sinqr]   )r   r]   r   r   r   get_quantize_ops   s   z SinqHfQuantizer.get_quantize_opsc                 C  s>   ddl m} | jrddlm} |g ddg|| gdgS g S )a4  
        If `pre_quantized=True`, interpret a checkpoint produced by SINQLinear.state_dict:

            <prefix>.W_q
            <prefix>.bias
            <prefix>.meta

        via a WeightConverter + SinqDeserialize so that we reconstruct a SINQLinear
        module instead of a plain nn.Linear.
        r   )WeightConverter)SinqDeserialize)z.W_qz.metaz.biasz.weight)source_patternstarget_patterns
operations)core_model_loadingr`   r>   r^   ra   )r   r`   ra   r   r   r   get_weight_conversions   s   z&SinqHfQuantizer.get_weight_conversionsNkeep_in_fp32_moduleslist[str] | Nonec           	      K  s   ddl m} | || jjpg || _| jjdko| j | _| jr"dn| | j}t	|t
rFtt| d}t	|trAd| }nt|}n	tj rMdnd}||| j|| j|| jd	}dS )
a  
        Called on meta-initialized model, before loading any weights.

        For SINQ, we replace nn.Linear modules with empty SINQLinear modules here.
        The actual quantization happens later in SinqQuantize.convert() when weights are loaded.
        r   )replace_with_sinq_linearsinqNr   zcuda:zcuda:0r!   )modules_to_not_convertquant_configcompute_dtypedevicer>   )r^   ri   get_modules_to_not_convertr   rk   r=   r>   r   rP   r6   r7   nextiterr9   rM   rN   r"   r#   r$   r*   )	r   rQ   r(   rg   r   ri   sinq_quant_dictfirst_device
device_strr   r   r   $_process_model_before_weight_loading   s(   



z4SinqHfQuantizer._process_model_before_weight_loadingc                 K  s   ddl m} |  |S )aq  
        Called after *all* weights have been loaded.

        For SINQ:
        1. Move non-SINQLinear modules to GPU (embeddings, norms, lm_head, etc.)
           - SINQLinear modules already have GemLite buffers on GPU
           - We skip moving SINQLinear's W_q/meta to avoid memory duplication
        2. Patch HF save/load methods for SINQ serialization
        r   )patch_hf_pretrained_io)
sinq.hf_iorv   )r   rQ   r   rv   r   r   r   #_process_model_after_weight_loading   s   z3SinqHfQuantizer._process_model_after_weight_loading)r   r   )r   r   )r*   r+   r   r+   )r   r/   )rC   r   r   r7   )rQ   r   rR   rN   r   r   r,   )rQ   r   rg   rh   )rQ   r   )__name__
__module____qualname____doc__r   __annotations__r   r   propertyr   r)   r.   rB   rP   r\   r_   rf   ru   rx   __classcell__r   r   r   r   r   !   s"   
 




"	!+r   )
__future__r   typingr   r2   r   r   utils.quantization_configr   baser	   quantizers_utilsr
   r"   modeling_utilsr   
get_loggerry   r&   r   r   r   r   r   <module>   s   
