"""
Optimizer utilities for the Trainer class.
"""

from __future__ import annotations

import importlib.metadata
import logging
from collections.abc import Callable
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import torch
from packaging import version
from torch import nn

from .optimization import Adafactor
from .trainer_pt_utils import LayerWiseDummyOptimizer
from .trainer_utils import check_target_module_exists
from .training_args import OptimizerNames, ParallelMode
from .utils import (
    is_apollo_torch_available,
    is_bitsandbytes_available,
    is_galore_torch_available,
    is_grokadamw_available,
    is_lomo_available,
    is_schedulefree_available,
    is_torch_optimi_available,
    is_torchao_available,
    strtobool,
)


if TYPE_CHECKING:
    from .modeling_utils import PreTrainedModel
    from .training_args import TrainingArguments


logger = logging.getLogger(__name__)


@dataclass
class OptimizerContext:
    """Context object passed to all optimizer handlers."""

    args: TrainingArguments
    model: PreTrainedModel | None
    optimizer_kwargs: dict[str, Any]
    adam_kwargs: dict[str, Any]
    optim_args: dict[str, str]


def _parse_optim_args(optim_args_str: str | None) -> dict[str, str]:
    """Parse optimizer arguments from a comma-separated string."""
    if not optim_args_str:
        return {}
    optim_args = {}
    for mapping in optim_args_str.replace(" ", "").split(","):
        key, value = mapping.split("=")
        optim_args[key] = value
    return optim_args


# A handler takes an `OptimizerContext` and returns an optimizer class (or factory) plus its kwargs.
OptimizerHandler = Callable[[OptimizerContext], tuple[Any, dict[str, Any]]]


def is_optimizer_factory(optimizer_cls_or_factory: Any) -> bool:
    """
    Check if the returned value from a handler is a factory rather than an Optimizer class.

    Factory callables are used for complex optimizers like Muon or Dion that need to:
    - Split parameters between multiple internal optimizers
    - Handle complex sharding logic
    - Access the full model structure for parameter grouping

    Args:
        optimizer_cls_or_factory: The first element returned by an optimizer handler.

    Returns:
        `bool`: True if it's not an Optimizer class (i.e., likely a factory), False if it's an Optimizer class.
    """
    if isinstance(optimizer_cls_or_factory, type) and issubclass(optimizer_cls_or_factory, torch.optim.Optimizer):
        return False
    return True


def _setup_low_rank_optimizer(
    args: TrainingArguments,
    model: PreTrainedModel,
    optimizer_name: str,
    optimizer_mapping: dict[str, Any],
    optim_kwargs: dict[str, Any],
    optimizer_kwargs: dict[str, Any],
    is_layerwise_supported: bool = True,
) -> tuple[Any, dict[str, Any]]:
    """
    Helper function to set up low-rank optimizers like GaLore and Apollo.

    These optimizers apply low-rank projections to specific target modules (typically linear layers).
    """
    is_layerwise = optimizer_name.lower().endswith("layerwise")
    if is_layerwise and args.parallel_mode == ParallelMode.DISTRIBUTED and is_layerwise_supported:
        raise NotImplementedError(f"Layer-wise {optimizer_name} does not support DDP at this time")

    optimizer_cls = optimizer_mapping[optimizer_name]

    if args.optim_target_modules is None:
        raise ValueError(f"You need to define `optim_target_modules` to use {optimizer_name} optimizers")
    if not isinstance(args.optim_target_modules, (list, str)):
        raise ValueError(
            f"`optim_target_modules` must be a list of strings, a regex string, or 'all-linear'. Got: {args.optim_target_modules}"
        )
    if model is None:
        raise ValueError(f"You need to pass a model to initialize {optimizer_name} optimizer.")

    all_linear = (
        isinstance(args.optim_target_modules, str) and args.optim_target_modules.replace("_", "-") == "all-linear"
    )

    target_params_names = []
    for module_name, module in model.named_modules():
        target_module_exists, is_regex = check_target_module_exists(
            args.optim_target_modules, module_name, return_is_regex=True
        )
        if not isinstance(module, nn.Linear):
            if target_module_exists and not is_regex:
                logger.warning(f"{module_name} matched but ignored. {optimizer_name} only supports linear layers.")
            continue
        if not (target_module_exists or all_linear):
            continue
        target_params_names.append(module_name + ".weight")

    if len(target_params_names) == 0:
        raise ValueError(f"No target modules found for {optimizer_name} ({args.optim_target_modules}).")

    target_params = [p for n, p in model.named_parameters() if n in target_params_names]
    non_target_params = [p for n, p in model.named_parameters() if n not in target_params_names]
    param_groups = [{"params": non_target_params}, {"params": target_params, **optim_kwargs}]

    if is_layerwise:
        if args.gradient_accumulation_steps != 1:
            raise ValueError(f"Layerwise {optimizer_name} does not support gradient accumulation!")

        # One lightweight optimizer per parameter; updates are applied from gradient hooks.
        optimizer_dict = {}
        for param in non_target_params:
            optimizer_dict[param] = optimizer_cls([{"params": [param]}], **optimizer_kwargs)
        for param in target_params:
            optimizer_dict[param] = optimizer_cls([{"params": [param], **optim_kwargs}], **optimizer_kwargs)

        def optimizer_hook(param):
            if param.grad is not None:
                optimizer_dict[param].step()
                optimizer_dict[param].zero_grad()

        for param in model.parameters():
            if param.requires_grad:
                param.register_post_accumulate_grad_hook(optimizer_hook)

        optimizer_cls = LayerWiseDummyOptimizer
        optimizer_kwargs.update({"optimizer_dict": optimizer_dict})

    optimizer_kwargs.update({"params": param_groups})
    return optimizer_cls, optimizer_kwargs


def _get_adafactor(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get Adafactor optimizer."""
    ctx.optimizer_kwargs.update({"scale_parameter": False, "relative_step": False})
    return Adafactor, ctx.optimizer_kwargs


def _get_adamw_torch(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get PyTorch AdamW optimizer (regular or fused)."""
    from torch.optim import AdamW

    ctx.optimizer_kwargs.update(ctx.adam_kwargs)
    if ctx.args.optim == OptimizerNames.ADAMW_TORCH_FUSED:
        ctx.optimizer_kwargs.update({"fused": True})
    return AdamW, ctx.optimizer_kwargs


def _get_adamw_torch_xla(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get Torch XLA syncfree AdamW optimizer."""
    try:
        from torch_xla.amp.syncfree import AdamW

        ctx.optimizer_kwargs.update(ctx.adam_kwargs)
        return AdamW, ctx.optimizer_kwargs
    except ImportError:
        raise ValueError("Trainer failed to import syncfree AdamW from torch_xla.")


def _get_adamw_torch_npu_fused(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get NPU Fused AdamW optimizer."""
    try:
        from torch_npu.optim import NpuFusedAdamW

        ctx.optimizer_kwargs.update(ctx.adam_kwargs)
        return NpuFusedAdamW, ctx.optimizer_kwargs
    except ImportError:
        raise ValueError("Trainer failed to import FusedAdamW from torch_npu.")


def _get_adamw_apex_fused(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get Apex Fused Adam optimizer."""
    try:
        from apex.optimizers import FusedAdam

        ctx.optimizer_kwargs.update(ctx.adam_kwargs)
        return FusedAdam, ctx.optimizer_kwargs
    except ImportError:
        raise ValueError("Trainer tried to instantiate apex FusedAdam but apex is not installed!")


def _get_bitsandbytes_optimizer(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get bitsandbytes optimizer (AdamW, Lion, RMSprop variants)."""
    if not is_bitsandbytes_available():
        raise ImportError(
            "You need to install `bitsandbytes` in order to use bitsandbytes optimizers: `pip install -U bitsandbytes`"
        )
    from bitsandbytes.optim import AdamW, Lion, RMSprop

    optim_name = ctx.args.optim
    is_paged = "paged" in optim_name
    optim_bits = 8 if "8bit" in optim_name else 32
    optimizer_cls = None
    additional_optim_kwargs = ctx.adam_kwargs
    if "adam" in optim_name:
        optimizer_cls = AdamW
    elif "lion" in optim_name:
        optimizer_cls = Lion
        additional_optim_kwargs = {"betas": (ctx.args.adam_beta1, ctx.args.adam_beta2)}
    elif "rmsprop" in optim_name:
        optimizer_cls = RMSprop
        additional_optim_kwargs = ctx.optim_args
    elif "ademamix" in optim_name:
        from bitsandbytes.optim import AdEMAMix

        optimizer_cls = AdEMAMix
        additional_optim_kwargs = {
            "betas": (
                float(ctx.optim_args.get("beta1", ctx.args.adam_beta1)),
                float(ctx.optim_args.get("beta2", ctx.args.adam_beta2)),
                float(ctx.optim_args.get("beta3", 0.9999)),
            ),
            "alpha": float(ctx.optim_args.get("alpha", 5.0)),
            "eps": float(ctx.optim_args.get("eps", ctx.args.adam_epsilon)),
        }
        if "t_alpha" in ctx.optim_args:
            additional_optim_kwargs["t_alpha"] = int(ctx.optim_args["t_alpha"])
        if "t_beta3" in ctx.optim_args:
            additional_optim_kwargs["t_beta3"] = int(ctx.optim_args["t_beta3"])

    bnb_kwargs = {"optim_bits": optim_bits}
    if "rmsprop" not in optim_name:
        bnb_kwargs["is_paged"] = is_paged

    ctx.optimizer_kwargs.update(additional_optim_kwargs)
    ctx.optimizer_kwargs.update(bnb_kwargs)
    return optimizer_cls, ctx.optimizer_kwargs


def _get_adamw_anyprecision(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get AnyPrecision AdamW optimizer."""
    try:
        from torchdistx.optimizers import AnyPrecisionAdamW

        ctx.optimizer_kwargs.update(ctx.adam_kwargs)
        ctx.optimizer_kwargs.update(
            {
                "use_kahan_summation": strtobool(ctx.optim_args.get("use_kahan_summation", "False")),
                "momentum_dtype": getattr(torch, ctx.optim_args.get("momentum_dtype", "float32")),
                "variance_dtype": getattr(torch, ctx.optim_args.get("variance_dtype", "float32")),
                "compensation_buffer_dtype": getattr(torch, ctx.optim_args.get("compensation_buffer_dtype", "bfloat16")),
            }
        )
        return AnyPrecisionAdamW, ctx.optimizer_kwargs
    except ImportError:
        raise ValueError("Please install https://github.com/pytorch/torchdistx")


def _get_sgd(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get SGD optimizer."""
    return torch.optim.SGD, ctx.optimizer_kwargs


def _get_adagrad(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get Adagrad optimizer."""
    return torch.optim.Adagrad, ctx.optimizer_kwargs


def _get_rmsprop(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get RMSprop optimizer."""
    return torch.optim.RMSprop, ctx.optimizer_kwargs


def _get_galore_optimizer(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get GaLore optimizer."""
    if not is_galore_torch_available():
        raise ImportError(
            "You need to install `galore_torch` in order to use GaLore optimizers. Install it with "
            "`pip install git+https://github.com/jiaweizzhao/GaLore`"
        )
    from galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit

    optimizer_mapping = {
        OptimizerNames.GALORE_ADAMW: GaLoreAdamW,
        OptimizerNames.GALORE_ADAMW_8BIT: GaLoreAdamW8bit,
        OptimizerNames.GALORE_ADAFACTOR: GaLoreAdafactor,
        OptimizerNames.GALORE_ADAMW_LAYERWISE: GaLoreAdamW,
        OptimizerNames.GALORE_ADAMW_8BIT_LAYERWISE: GaLoreAdamW8bit,
        OptimizerNames.GALORE_ADAFACTOR_LAYERWISE: GaLoreAdafactor,
    }
    galore_optim_kwargs = {
        "rank": int(ctx.optim_args.pop("rank", 128)),
        "update_proj_gap": int(ctx.optim_args.pop("update_proj_gap", 200)),
        "scale": float(ctx.optim_args.pop("scale", 0.25)),
        "proj_type": ctx.optim_args.pop("proj_type", "std"),
    }
    optimizer_cls, optimizer_kwargs = _setup_low_rank_optimizer(
        ctx.args, ctx.model, ctx.args.optim, optimizer_mapping, galore_optim_kwargs, ctx.optimizer_kwargs
    )
    if ctx.args.optim == OptimizerNames.GALORE_ADAFACTOR:
        optimizer_kwargs.update({"scale_parameter": False, "relative_step": False})
    return optimizer_cls, optimizer_kwargs


def _get_apollo_optimizer(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get Apollo optimizer."""
    if not is_apollo_torch_available():
        raise ImportError(
            "You need to install `apollo_torch` in order to use APOLLO optimizers. Install it with "
            "`pip install git+https://github.com/zhuhanqing/APOLLO`"
        )
    from apollo_torch import APOLLOAdamW

    optimizer_mapping = {
        OptimizerNames.APOLLO_ADAMW: APOLLOAdamW,
        OptimizerNames.APOLLO_ADAMW_LAYERWISE: APOLLOAdamW,
    }
    apollo_optim_kwargs = {
        "rank": int(ctx.optim_args.pop("rank", 256)),
        "proj": ctx.optim_args.pop("proj", "random"),
        "scale_type": ctx.optim_args.pop("scale_type", "channel"),
        "update_proj_gap": int(ctx.optim_args.pop("update_proj_gap", 200)),
        "scale": float(ctx.optim_args.pop("scale", 1.0)),
        "proj_type": ctx.optim_args.pop("proj_type", "std"),
    }
    return _setup_low_rank_optimizer(
        ctx.args, ctx.model, ctx.args.optim, optimizer_mapping, apollo_optim_kwargs, ctx.optimizer_kwargs
    )


def _get_lomo_optimizer(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get LOMO optimizer."""
    if not is_lomo_available():
        raise ImportError(
            "You need to install `lomo_optim` in order to use LOMO optimizers. Install it with `pip install lomo-optim`"
        )
    if ctx.model is None:
        raise ValueError("You need to pass a `model` in order to correctly initialize a LOMO optimizer.")
    from lomo_optim import AdaLomo, Lomo

    optimizer_cls = AdaLomo if "ada" in ctx.args.optim else Lomo
    ctx.optimizer_kwargs.update({"model": ctx.model})
    return optimizer_cls, ctx.optimizer_kwargs


def _get_grokadamw(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get GrokAdamW optimizer."""
    if not is_grokadamw_available():
        raise ValueError("Please install grokadamw with `pip install grokadamw`")
    from grokadamw import GrokAdamW

    ctx.optimizer_kwargs.update(
        {
            "alpha_init": float(ctx.optim_args.get("alpha_init", 0.98)),
            "lamb": float(ctx.optim_args.get("lamb", 2.0)),
            "gamma": float(ctx.optim_args.get("gamma", 0.1)),
            "grokking_signal_decay_rate": float(ctx.optim_args.get("grokking_signal_decay_rate", 0.1)),
            "gradient_clipping": float(ctx.optim_args.get("gradient_clipping", 1.0)),
        }
    )
    return GrokAdamW, ctx.optimizer_kwargs


def _get_torchao_optimizer(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get TorchAO 4-bit or 8-bit optimizer."""
    if not is_torchao_available() or version.parse(importlib.metadata.version("torchao")) < version.parse("0.4.0"):
        raise ImportError(
            "You need to have `torchao>=0.4.0` in order to use torch 4-bit optimizers. "
            "Install it with `pip install torchao` or follow the instructions here: https://github.com/pytorch/ao"
        )
    if version.parse(importlib.metadata.version("torch")) <= version.parse("2.4"):
        raise ImportError(
            "You need to have `torch>2.4` in order to use torch 4-bit optimizers. "
            "Install it with `pip install --upgrade torch` it is available on pipy. Otherwise, you need to install torch nightly."
        )
    if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.11.0"):
        from torchao.optim import AdamW4bit, AdamW8bit
    else:
        from torchao.prototype.low_bit_optim import AdamW4bit, AdamW8bit

    optimizer_cls = AdamW4bit if ctx.args.optim == OptimizerNames.ADAMW_TORCH_4BIT else AdamW8bit
    ctx.optimizer_kwargs.update(
        {
            "block_size": int(ctx.optim_args.get("block_size", 256)),
            "bf16_stochastic_round": strtobool(ctx.optim_args.get("bf16_stochastic_round", "False")),
        }
    )
    ctx.optimizer_kwargs.update(ctx.adam_kwargs)
    return optimizer_cls, ctx.optimizer_kwargs


def _get_schedule_free_optimizer(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get ScheduleFree optimizer."""
    if not is_schedulefree_available():
        raise ImportError(
            "You need to install `schedulefree` in order to use schedulefree optimizers. "
            "Install it with `pip install schedulefree.`"
        )
    from schedulefree import AdamWScheduleFree, SGDScheduleFree

    additional_optim_kwargs = {}
    require_warmup = True
    if ctx.args.optim == OptimizerNames.SCHEDULE_FREE_RADAM:
        if not is_schedulefree_available("1.4.0"):
            raise ImportError(
                "You need to install `schedulefree>=1.4.0` in order to use RAdamScheduleFree optimizer. "
                "Install it with `pip install schedulefree.`"
            )
        from schedulefree import RAdamScheduleFree

        optimizer_cls = RAdamScheduleFree
        additional_optim_kwargs = ctx.adam_kwargs
        require_warmup = False
    elif ctx.args.optim == OptimizerNames.SCHEDULE_FREE_ADAMW:
        optimizer_cls = AdamWScheduleFree
        additional_optim_kwargs = ctx.adam_kwargs
    elif ctx.args.optim == OptimizerNames.SCHEDULE_FREE_SGD:
        optimizer_cls = SGDScheduleFree
    else:
        raise ValueError("Invalid schedulefree optimizer")

    additional_optim_kwargs["weight_decay"] = ctx.args.weight_decay
    if require_warmup:
        additional_optim_kwargs["warmup_steps"] = ctx.args.warmup_steps
    additional_optim_kwargs.update(
        {
            "weight_lr_power": float(ctx.optim_args.get("weight_lr_power", 2.0)),
            "r": float(ctx.optim_args.get("r", 0.0)),
        }
    )
    ctx.optimizer_kwargs.update(additional_optim_kwargs)
    return optimizer_cls, ctx.optimizer_kwargs


def _get_stable_adamw(ctx: OptimizerContext) -> tuple[Any, dict[str, Any]]:
    """Get StableAdamW optimizer from torch-optimi."""
    if not is_torch_optimi_available():
        raise ImportError(
            "You need to install `torch-optimi` in order to use stable_adamw optimizers. "
            "Install it with `pip install torch-optimi`."
        )
    from optimi import StableAdamW

    max_lr = ctx.optim_args.get("max_lr", None)
    if max_lr is not None:
        max_lr = float(max_lr)
    kahan_sum = ctx.optim_args.get("kahan_sum", None)
    if kahan_sum is not None:
        kahan_sum = strtobool(kahan_sum)
    ctx.optimizer_kwargs["eps"] = ctx.args.adam_epsilon
    stable_adamw_kwargs = {
        "decouple_lr": strtobool(ctx.optim_args.get("decouple_lr", "False")),
        "max_lr": max_lr,
        "kahan_sum": kahan_sum,
    }
    ctx.optimizer_kwargs.update(ctx.adam_kwargs)
    ctx.optimizer_kwargs.update(stable_adamw_kwargs)
    return StableAdamW, ctx.optimizer_kwargs


_BITSANDBYTES_OPTIMIZERS = [
    OptimizerNames.ADAMW_BNB,
    OptimizerNames.ADAMW_8BIT,
    OptimizerNames.PAGED_ADAMW,
    OptimizerNames.PAGED_ADAMW_8BIT,
    OptimizerNames.ADEMAMIX,
    OptimizerNames.ADEMAMIX_8BIT,
    OptimizerNames.PAGED_ADEMAMIX,
    OptimizerNames.PAGED_ADEMAMIX_8BIT,
    OptimizerNames.LION,
    OptimizerNames.LION_8BIT,
    OptimizerNames.PAGED_LION,
    OptimizerNames.PAGED_LION_8BIT,
    OptimizerNames.RMSPROP_BNB,
    OptimizerNames.RMSPROP_8BIT,
    OptimizerNames.RMSPROP_32BIT,
]

_GALORE_OPTIMIZERS = [
    OptimizerNames.GALORE_ADAMW,
    OptimizerNames.GALORE_ADAMW_8BIT,
    OptimizerNames.GALORE_ADAFACTOR,
    OptimizerNames.GALORE_ADAMW_LAYERWISE,
    OptimizerNames.GALORE_ADAMW_8BIT_LAYERWISE,
    OptimizerNames.GALORE_ADAFACTOR_LAYERWISE,
]

_APOLLO_OPTIMIZERS = [OptimizerNames.APOLLO_ADAMW, OptimizerNames.APOLLO_ADAMW_LAYERWISE]

_TORCHAO_OPTIMIZERS = [OptimizerNames.ADAMW_TORCH_4BIT, OptimizerNames.ADAMW_TORCH_8BIT]

_SCHEDULE_FREE_OPTIMIZERS = [
    OptimizerNames.SCHEDULE_FREE_RADAM,
    OptimizerNames.SCHEDULE_FREE_ADAMW,
    OptimizerNames.SCHEDULE_FREE_SGD,
]

_OPTIMIZER_HANDLERS: dict[str, OptimizerHandler] = {
    OptimizerNames.ADAFACTOR: _get_adafactor,
    OptimizerNames.ADAMW_TORCH: _get_adamw_torch,
    OptimizerNames.ADAMW_TORCH_FUSED: _get_adamw_torch,
    OptimizerNames.ADAMW_TORCH_XLA: _get_adamw_torch_xla,
    OptimizerNames.ADAMW_TORCH_NPU_FUSED: _get_adamw_torch_npu_fused,
    OptimizerNames.ADAMW_APEX_FUSED: _get_adamw_apex_fused,
    OptimizerNames.ADAMW_ANYPRECISION: _get_adamw_anyprecision,
    OptimizerNames.SGD: _get_sgd,
    OptimizerNames.ADAGRAD: _get_adagrad,
    OptimizerNames.RMSPROP: _get_rmsprop,
    OptimizerNames.GROKADAMW: _get_grokadamw,
    OptimizerNames.STABLE_ADAMW: _get_stable_adamw,
    OptimizerNames.LOMO: _get_lomo_optimizer,
    OptimizerNames.ADALOMO: _get_lomo_optimizer,
    **dict.fromkeys(_BITSANDBYTES_OPTIMIZERS, _get_bitsandbytes_optimizer),
    **dict.fromkeys(_GALORE_OPTIMIZERS, _get_galore_optimizer),
    **dict.fromkeys(_APOLLO_OPTIMIZERS, _get_apollo_optimizer),
    **dict.fromkeys(_TORCHAO_OPTIMIZERS, _get_torchao_optimizer),
    **dict.fromkeys(_SCHEDULE_FREE_OPTIMIZERS, _get_schedule_free_optimizer),
}
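

if __name__ == "__main__":
    # Illustrative usage sketch (hypothetical, not part of the library's public API): exercise the
    # two small self-contained helpers above. The remaining handlers require a real
    # `TrainingArguments`/model and the corresponding optional dependencies, so they are not run here.
    print(_parse_optim_args("rank=128, update_proj_gap=200"))  # -> {'rank': '128', 'update_proj_gap': '200'}
    print(is_optimizer_factory(torch.optim.AdamW))  # -> False: a torch.optim.Optimizer subclass
    print(is_optimizer_factory(lambda model, **kwargs: None))  # -> True: treated as a factory callable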