o
    0i9                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ dd Zdd ZG dd dZG dd dZG dd deZG dd de
ZG dd dZG dd dZdS )    N)deepcopy)optim   )AcceleratedOptimizer)AcceleratedScheduler   )DistributedType)is_bnb_available)compare_versionsc                 C   s  dd | j  D }ddlm} |}tdddrUd|d	< t| tj}t rO|sOdd
l	m} t| |j|j
frMz| jdk}W n tyL   | jjdk}Y nw d}|rUd|d	< tdddrt| tj}t r|sdd
l	m} t| |j|jfrz| jdk}W n ty   | jjdk}Y nw |rddlm} |}tddrtdddrddl	m}m}	 t| ||	frz| jdk}
W n ty   | jjdk}
Y nw |
rddlm} |}|| jfi |S )z
    Args:
        optimizer: torch.optim.Optimizer

    Returns the DeepSeedCPUOptimizer (deepspeed.ops) version of the optimizer.
    c                 S   s   i | ]\}}|d v r||qS ))lrweight_decay ).0kvr   r   \/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/accelerate/utils/deepspeed.py
<dictcomp>%   s    z2map_pytorch_optim_to_deepspeed.<locals>.<dictcomp>r   )DeepSpeedCPUAdam	deepspeedz>=z0.3.1F
adamw_modeN    Tz0.5.5)DeepSpeedCPUAdagradz0.38.0)min_versionz0.11.0)Lion	Lion32bit)DeepSpeedCPULion)defaultsitemsdeepspeed.ops.adamr   r
   
isinstancer   AdamWr	   bitsandbytes.optim
AdamW32bit
optim_bitsAttributeErrorargsAdagradAdagrad32bitdeepspeed.ops.adagradr   r   r   deepspeed.ops.lionr   param_groups)	optimizerr   r   optimizer_classis_adawbnb_optis_adar   r   r   is_bnb_32bitsr   r   r   r   map_pytorch_optim_to_deepspeed   sT   

r1   c                 C   s>   | j tjkr
tdt| jts| jS tdd | j D S )z
    Returns the currently active DeepSpeedPlugin.

    Raises:
        ValueError: If DeepSpeed was not enabled and this function is called.
    a!  Couldn't retrieve the active `DeepSpeedPlugin` as none were enabled. Please make sure that either `Accelerator` is configured for `deepspeed` or make sure that the desired `DeepSpeedPlugin` has been enabled (`AcceleratorState().select_deepspeed_plugin(name)`) before calling this function.c                 s   s    | ]}|j r|V  qd S N)selected)r   pluginr   r   r   	<genexpr>t   s    z.get_active_deepspeed_plugin.<locals>.<genexpr>)	distributed_typer   	DEEPSPEED
ValueErrorr   deepspeed_pluginsdictnextvalues)stater   r   r   get_active_deepspeed_plugind   s   r>   c                   @   sd   e Zd ZdZdd Zdd Zdd Zdd	d
ZdddZdd Z	dd Z
dd Zdd Zdd ZdS )HfDeepSpeedConfigaJ  
    This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.

    A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
    things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
    it's important that this object remains alive while the program is still running.

    [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
    with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
    the DeepSpeed configuration is not modified in any way.

    Args:
        config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.

    c              
   C   s   t |tr
t|}nUtj|r,t|dd}t|}W d    n1 s&w   Y  n3z!zt	|}W n tj
yK   t|d}t	|}Y nw W n tttfy^   td| w || _|   d S )Nzutf-8)encodingzoExpected a string path to an existing deepspeed config, or a dictionary, or a base64 encoded string. Received: )r   r:   r   ospathexistsopenjsonloadloadsJSONDecodeErrorbase64urlsafe_b64decodedecodeUnicodeDecodeErrorr$   r8   configset_stage_and_offload)selfconfig_file_or_dictrM   fconfig_decodedr   r   r   __init__   s,   

zHfDeepSpeedConfig.__init__c                 C   sj   |  dd| _d| _|  s|  r3tddg}t|  d|  dg}t||@ dkr1d	| _d S d S d S )
Nzzero_optimization.stageFcpunvmez*zero_optimization.offload_optimizer.devicez&zero_optimization.offload_param.devicer   T)	get_value_stage_offloadis_zero2is_zero3setlen)rO   offload_devices_validoffload_devicesr   r   r   rN      s   
z'HfDeepSpeedConfig.set_stage_and_offloadc                 C   sH   | j }|d}| }|D ]}||}|d u rd |f  S q||fS )N.)rM   splitpopget)rO   ds_key_longrM   nodesds_keynoder   r   r   find_config_node   s   

z"HfDeepSpeedConfig.find_config_nodeNc                 C   s&   |  |\}}|du r|S |||S )zG
        Returns the set value or `default` if no value is set
        N)rh   rc   )rO   rd   defaultrM   rf   r   r   r   rW      s   zHfDeepSpeedConfig.get_valueFc                 C   sj   | j }|d}|D ]}|}||}|du r'|r$td| d| j   dS q
|dur3|| dS dS )z
        Deletes a sub-section of the config file if it's found.

        Unless `must_exist` is `True` the section doesn't have to exist.
        r`   NzCan't find z entry in the config: )rM   ra   rc   r8   rb   )rO   rd   
must_existrM   re   rg   parent_configr   r   r   del_config_sub_tree   s   

z%HfDeepSpeedConfig.del_config_sub_treec                 C   s   |  |}|du rdS t|S )z
        Returns `True`/``False` only if the value is set, always `False` otherwise. So use this method to ask the very
        specific question of whether the value is set to `True` (and it's not set to `False`` or isn't set).

        NFrW   boolrO   rd   valuer   r   r   is_true   s   
zHfDeepSpeedConfig.is_truec                 C   s    |  |}|du rdS t| S )z
        Returns `True`/``False` only if the value is set, always `False` otherwise. So use this method to ask the very
        specific question of whether the value is set to `False` (and it's not set to `True`` or isn't set).
        NFrm   ro   r   r   r   is_false   s   
zHfDeepSpeedConfig.is_falsec                 C   
   | j dkS )Nr   rX   rO   r   r   r   rZ         
zHfDeepSpeedConfig.is_zero2c                 C   rs   )N   rt   ru   r   r   r   r[      rv   zHfDeepSpeedConfig.is_zero3c                 C   s   | j S r2   )rY   ru   r   r   r   
is_offload   s   zHfDeepSpeedConfig.is_offloadr2   )F)__name__
__module____qualname____doc__rS   rN   rh   rW   rl   rq   rr   rZ   r[   rx   r   r   r   r   r?   w   s    

		r?   c                   @   s*   e Zd ZdZdd Zd
ddZdd Zd	S )DeepSpeedEngineWrapperz
    Internal wrapper for deepspeed.runtime.engine.DeepSpeedEngine. This is used to follow conventional training loop.

    Args:
        engine (deepspeed.runtime.engine.DeepSpeedEngine): deepspeed engine to wrap
    c                 C   s
   || _ d S r2   )engine)rO   r~   r   r   r   rS     rv   zDeepSpeedEngineWrapper.__init__Tc                 K   s8   | j j|d | j j|fi | |r| j   d S d S )N)is_boundary)r~   "set_gradient_accumulation_boundarybackwardstep)rO   losssync_gradientskwargsr   r   r   r     s
   zDeepSpeedEngineWrapper.backwardc                 C   s    | j  }t|dr| S |S )z3Get the global gradient norm from DeepSpeed engine.item)r~   get_global_grad_normhasattrr   )rO   	grad_normr   r   r   r     s   

z+DeepSpeedEngineWrapper.get_global_grad_normN)T)ry   rz   r{   r|   rS   r   r   r   r   r   r   r}      s
    
r}   c                       s>   e Zd ZdZ fddZdddZdd Zed	d
 Z  Z	S )DeepSpeedOptimizerWrapperz
    Internal wrapper around a deepspeed optimizer.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
    c                    s$   t  j|dd d t| jd| _d S )NF)device_placementscaleroverflow)superrS   r   r+   __has_overflow__)rO   r+   	__class__r   r   rS   0  s   z"DeepSpeedOptimizerWrapper.__init__Nc                 C      d S r2   r   )rO   set_to_noner   r   r   	zero_grad4     z#DeepSpeedOptimizerWrapper.zero_gradc                 C   r   r2   r   ru   r   r   r   r   7  r   zDeepSpeedOptimizerWrapper.stepc                 C   s   | j r| jjS dS )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.F)r   r+   r   ru   r   r   r   step_was_skipped:  s   z*DeepSpeedOptimizerWrapper.step_was_skippedr2   )
ry   rz   r{   r|   rS   r   r   propertyr   __classcell__r   r   r   r   r   '  s    
r   c                       s(   e Zd ZdZ fddZdd Z  ZS )DeepSpeedSchedulerWrapperz
    Internal wrapper around a deepspeed scheduler.

    Args:
        scheduler (`torch.optim.lr_scheduler.LambdaLR`):
            The scheduler to wrap.
        optimizers (one or a list of `torch.optim.Optimizer`):
    c                    s   t  || d S r2   )r   rS   )rO   	scheduler
optimizersr   r   r   rS   L  s   z"DeepSpeedSchedulerWrapper.__init__c                 C   r   r2   r   ru   r   r   r   r   O  r   zDeepSpeedSchedulerWrapper.step)ry   rz   r{   r|   rS   r   r   r   r   r   r   r   B  s    	r   c                   @   s   e Zd ZdZdddZdS )
DummyOptima  
    Dummy optimizer presents model parameters or param groups, this is primarily used to follow conventional training
    loop when optimizer config is specified in the deepspeed config file.

    Args:
        lr (float):
            Learning rate.
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        weight_decay (float):
            Weight decay.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    MbP?r   c                 K   s   || _ || _|| _|| _d S r2   )paramsr   r   r   )rO   r   r   r   r   r   r   r   rS   c  s   
zDummyOptim.__init__N)r   r   ry   rz   r{   r|   rS   r   r   r   r   r   S  s    r   c                   @   s   e Zd ZdZdddZdS )DummySchedulera  
    Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
    loop when scheduler config is specified in the deepspeed config file.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int, *optional*):
            Total number of steps.
        warmup_num_steps (int, *optional*):
            Number of steps for warmup.
        lr_scheduler_callable (callable, *optional*):
            A callable function that creates an LR Scheduler. It accepts only one argument `optimizer`.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    Nr   c                 K   s"   || _ || _|| _|| _|| _d S r2   )r+   total_num_stepswarmup_num_stepslr_scheduler_callabler   )rO   r+   r   r   r   r   r   r   r   rS   |  s
   
zDummyScheduler.__init__)Nr   Nr   r   r   r   r   r   j  s    r   )rI   rE   rA   copyr   torchr   r+   r   r   r   dataclassesr   importsr	   versionsr
   r1   r>   r?   r}   r   r   r   r   r   r   r   r   <module>   s&   G *