o
    0iaO                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dl mZ d dlmZm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZ d dlmZ d	d
lmZ d	dlmZ d	dlmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ee,Z-e$ rd dl.m/  m0Z1 dej2j3de4fddZ5dej2j3de4fddZ6dej2j3de4fddZ7dej2j3de4fddZ8dej2j3dej2j3fddZ9dej2j3fd d!Z:d"ej2j3de4fd#d$Z;	&dLd'e4d(e4d)e4fd*d+Z<d,d- Z=d.e>fd/d0Z?dMd1e4d2e4fd3d4Z@e"d5r	ejAnej/ZBeBjCjDejEeejFgZGe"d6r"eGHejIjJ dNd7d8ZKd9d: ZLd;d< ZMdNd=eeN de4fd>d?ZOdeNfd@dAZPdBdC ZQdDdE ZRdFeSfdGdHZTdOd"ej2j3dIe4deUej2j3 fdJdKZVdS )P    N)encode)OrderedDict)partialreduce)
MethodType)Optional)Version)	save_file   )write_basic_config)
get_logger)PartialState   )FSDP_PYTORCH_VERSION)DistributedType)is_deepspeed_availableis_numpy_availableis_torch_distributed_availableis_torch_xla_availableis_weights_only_available)id_tensor_storage)convert_model)is_torch_versionmodulereturnc                 C   s   t tdsdS t| tjjjS )zD
    Check whether the module was compiled with torch.compile()
    _dynamoF)hasattrtorch
isinstancer   
eval_frameOptimizedModuler    r"   X/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/accelerate/utils/other.pyis_compiled_module6   s   
r$   c                 C   s<   t tdsdS | jr|  D ]}t|tjjjr dS qdS )z\
    Check whether the module has submodules that were compiled with `torch.compile()`.
    r   FT)r   r   _modulesmodulesr   r   r   r    r   	submoduler"   r"   r#   has_compiled_regions@   s   
r)   c                    s$   t  tjjot fdd D S )z
    Check whether the module is a repeated block, i.e. `torch.nn.ModuleList` with all children of the same class. This
    is useful to determine whether we should apply regional compilation to the module.
    c                 3   s     | ]}t | d  jV  qdS )r   N)r   	__class__).0mr!   r"   r#   	<genexpr>U   s    z%is_repeated_blocks.<locals>.<genexpr>)r   r   nn
ModuleListallr!   r"   r!   r#   is_repeated_blocksO   s   $r1   c                 C   s&   | j r|  D ]	}t|r dS qdS )z
    Check whether the module has repeated blocks, i.e. `torch.nn.ModuleList` with all children of the same class, at
    any level of the module hierarchy. This is useful to determine whether we should apply regional compilation to the
    module.
    TF)r%   r&   r1   r'   r"   r"   r#   has_repeated_blocksX   s   r2   c                    sF   dt jjdt jjf fdd  | fi |}d|jvr!| |jd< |S )a_  
    Performs regional compilation where we target repeated blocks of the same class and compile them sequentially to
    hit the compiler's cache. For example, in `GPT2LMHeadModel`, the repeated block/class is `GPT2Block`, and can be
    accessed as `model.transformer.h[0]`. The rest of the model (e.g. model.lm_head) is compiled separately.

    This allows us to speed up the compilation overhead / cold start of models like LLMs and Transformers in general.
    See https://pytorch.org/tutorials/recipes/regional_compilation.html for more details.

    Args:
        module (`torch.nn.Module`):
            The model to compile.
        **compile_kwargs:
            Additional keyword arguments to pass to `torch.compile()`.

    Returns:
        `torch.nn.Module`: A new instance of the model with some compiled regions.

    Example:
    ```python
    >>> from accelerate.utils import compile_regions
    >>> from transformers import AutoModelForCausalLM

    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
    >>> compiled_model = compile_regions(model, mode="reduce-overhead")
    >>> compiled_model.transformer.h[0]
    OptimizedModule(
        (_orig_mod): GPT2Block(
                (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (attn): GPT2Attention(
                (c_attn): Conv1D(nf=2304, nx=768)
                (c_proj): Conv1D(nf=768, nx=768)
                (attn_dropout): Dropout(p=0.1, inplace=False)
                (resid_dropout): Dropout(p=0.1, inplace=False)
            )
            (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): GPT2MLP(
                (c_fc): Conv1D(nf=3072, nx=768)
                (c_proj): Conv1D(nf=768, nx=3072)
                (act): NewGELUActivation()
                (dropout): Dropout(p=0.1, inplace=False)
            )
        )
    )
    ```
    r   r   c                    s   t | rtj }| D ]}|tj|fi | q|S t| rH| j| j}|j	
| j	 i |_|  D ]\}}|| |fi | q5|S tj| fi |}|S N)r1   r   r.   r/   appendcompiler2   r*   __new____dict__updater%   named_children
add_module)r   compile_kwargs
new_moduler(   name_compile_regionsr"   r#   r?      s   

z)compile_regions.<locals>._compile_regions	_orig_mod)r   r.   Moduler7   )r   r;   r<   r"   r>   r#   compile_regionsf   s
   /

rB   c                 K   sd   t | r| D ]
}|jdi | qdS t| r(|  D ]
}t|fi | qdS | jdi | dS )a  
    Performs regional compilation the same way as `compile_regions`, but specifically for `DeepSpeedEngine.module`.
    Since the model is wrapped in a `DeepSpeedEngine` and has many added hooks, offloaded parameters, etc that
    `torch.compile(...)` interferes with, version of trgional compilation uses the inplace `module.compile()` method
    instead.

    Args:
        module (`torch.nn.Module`):
            The model to compile.
        **compile_kwargs:
            Additional keyword arguments to pass to `module.compile()`.
    Nr"   )r1   r5   r2   childrencompile_regions_deepspeed)r   r;   r(   childr"   r"   r#   rD      s   rD   modelc                    s>   t ddrddlm  nddlm  t fdd|  D S )z
    Check if the model has DTensor parameters.

    Args:
        model (`torch.nn.Module`):
            The model to check.

    Returns:
        `bool`: Whether the model has DTensor parameters.
    >=z2.5.0r   DTensorc                 3   s    | ]}t | V  qd S r3   )r   )r+   prH   r"   r#   r-      s    z$model_has_dtensor.<locals>.<genexpr>)r   torch.distributed.tensorrI   torch.distributed._tensorany
parameters)rF   r"   rH   r#   model_has_dtensor   s   
rO   TFkeep_fp32_wrapperkeep_torch_compile	recursivec                    sL  t jjjt jjf}t| }t| }|r| }| j} n	|r"| }| jd } t	 r0ddl
m} ||f7 }tdtrCt rCddlm}	 ||	f7 }t| |rP| j} t| |sH|r\ fdd  | } |s| j}
| jdd	}|d	urt|
d
r~|
j}
|
|krynt|
d
sqt|
| | _t| ddrt| dd |r|r| |_|} | S |r| |jd< |} | S )a  
    Extract a model from its distributed containers.

    Args:
        model (`torch.nn.Module`):
            The model to extract.
        keep_fp32_wrapper (`bool`, *optional*):
            Whether to remove mixed precision hooks from the model.
        keep_torch_compile (`bool`, *optional*):
            Whether to unwrap compiled model.
        recursive (`bool`, *optional*, defaults to `False`):
            Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
            recursively, not just the top-level distributed containers.

    Returns:
        `torch.nn.Module`: The extracted model.
    r@   r   )DeepSpeedEnginerG   )FullyShardedDataParallelc                    s@   t | dr | j}n| }| D ]\}}t|| | q|S )Nr   )r   r   r9   setattr)r   unwrapped_moduler=   rE   _recursive_unwrapr"   r#   rX   	  s   
z6extract_model_from_parallel.<locals>._recursive_unwrap_original_forwardN__wrapped__ _converted_to_transformer_engineF)to_transformer_engine)r   r.   parallelDistributedDataParallelDataParallelr$   r)   r@   r7   r   	deepspeedrS   r   r   r   2torch.distributed.fsdp.fully_sharded_data_parallelrT   r   r   forwardpopr   rZ   r   getattrr   )rF   rP   rQ   rR   optionsis_compiledhas_compiledcompiled_modelrS   FSDPrb   original_forwardr"   rW   r#   extract_model_from_parallel   sT   







rk   c                   C   s   t    dS )a  
    Introduces a blocking point in the script, making sure all processes have reached this point before continuing.

    <Tip warning={true}>

    Make sure all processes will reach this instruction otherwise one of your processes will hang forever.

    </Tip>
    N)r   wait_for_everyoner"   r"   r"   r#   rl   /  s   
rl   
state_dictc                    s   t t}  D ]\}}t|ts|t| | q	dd | D }t }|	 D ] } fdd|D }|
|dd  |dd D ]} |= qFq,t|dkr\td| d	 d
d   D   S )z
    Cleans the state dictionary from a model and removes tensor aliasing if present.

    Args:
        state_dict (`dict`):
            The state dictionary from a model
    c                 S   s"   i | ]\}}t |d kr||qS )r   )len)r+   ptrnamesr"   r"   r#   
<dictcomp>K  s   " z4clean_state_dict_for_safetensors.<locals>.<dictcomp>c                    s   g | ]}| v r|qS r"   r"   )r+   r=   rm   r"   r#   
<listcomp>S  s    z4clean_state_dict_for_safetensors.<locals>.<listcomp>r   Nr   zRemoved shared tensor zk while saving. This should be OK, but check by verifying that you don't receive any warning while reloadingc                 S   s*   i | ]\}}|t |tjr| n|qS r"   )r   r   Tensor
contiguous)r+   kvr"   r"   r#   rq   [  s   * )collectionsdefaultdictlistitemsr   strr   r4   setvaluesr8   rn   loggerwarning)rm   ptrsr=   tensorshared_ptrs
warn_namesrp   found_namesr"   rr   r#    clean_state_dict_for_safetensors<  s&   


r   save_on_each_nodesafe_serializationc                 C   s   t  jtjkrt| } |r ttddid}t| t	rt
| } ntj}t  jr0|s0|| | dS t  jr=|r?|| | dS dS dS )a  
    Save the data to disk. Use in place of `torch.save()`.

    Args:
        obj:
            The data to save
        f:
            The file (or file-like object) to use to save the data
        save_on_each_node (`bool`, *optional*, defaults to `False`):
            Whether to only save on the global main process
        safe_serialization (`bool`, *optional*, defaults to `False`):
            Whether to save `obj` using `safetensors` or the traditional PyTorch way (that uses `pickle`).
    formatpt)metadataN)r   distributed_typer   XLAxm_maybe_convert_to_cpur   safe_save_filer   r   r   r   saveis_main_processis_local_main_process)objfr   r   	save_funcr"   r"   r#   r   _  s   

r   z2.0.0z1.25.0c              	   K   s   z;t  rtj }d|vrd|d< tjt n|dd tj| fd|i|}W t  r:tj  |r:tj| |S t  rMtj  |rNtj| w w w )a  
    Compatible drop-in replacement of `torch.load()` which allows for `weights_only` to be used if `torch` version is
    2.4.0 or higher. Otherwise will ignore the kwarg.

    Will also add (and then remove) an exception for numpy arrays

    Args:
        f:
            The file (or file-like object) to use to load the data
        map_location:
            a function, `torch.device`, string or a dict specifying how to remap storage locations
        **kwargs:
            Additional keyword arguments to pass to `torch.load()`.
    weights_onlyTNmap_location)	r   r   serializationget_safe_globalsadd_safe_globalsTORCH_SAFE_GLOBALSrc   loadclear_safe_globals)r   r   kwargsold_safe_globals
loaded_objr"   r"   r#   r     s&   


r   c                 C   sH   t | dst | dst| d| } t | dr| jS t | dr | jS t| S )z(
    Gets a pretty name from `obj`.
    __qualname____name__r*   )r   rd   r   r   r|   )r   r"   r"   r#   get_pretty_name  s   

r   c                 C   s@   |   D ]\}}t|tr||i }t|| q|||< q|S )z
    Recursively merges two dictionaries.

    Args:
        source (`dict`): The dictionary to merge into `destination`.
        destination (`dict`): The dictionary to merge `source` into.
    )r{   r   dict
setdefaultmerge_dicts)sourcedestinationkeyvaluenoder"   r"   r#   r     s   

r   portc                 C   sR   | du rd} t  t jt j}|d| fdkW  d   S 1 s"w   Y  dS )z
    Checks if a port is in use on `localhost`. Useful for checking if multiple `accelerate launch` commands have been
    run and need to see if the port is already in use.
    Ni<s  	localhostr   )socketAF_INETSOCK_STREAM
connect_ex)r   sr"   r"   r#   is_port_in_use  s
   $r   c                  C   sJ   t  t jt j} | d |  d W  d   S 1 sw   Y  dS )z
    Gets a free port on `localhost`. Useful for automatic port selection when port 0 is specified in distributed
    training scenarios.

    Returns:
        int: An available port number
    ) r   r   N)r   r   r   bindgetsockname)r   r"   r"   r#   get_free_port  s   

$r   c                 C   sB   dD ]}| dk rt | d d|   S | d } qt | d dS )z7Converts `size` from bytes to the largest possible unit)bytesKBMBGBTBg      @r
    z PB)round)sizexr"   r"   r#   convert_bytes  s
   
r   c                  C   sj   t  } | j}|dkrdS td| j^}}}d}t|t|k r3d| d| d}tj|dd	 dS dS )
zFWarns if the kernel version is below the recommended minimum on Linux.LinuxNz(\d+\.\d+\.\d+)z5.5.0zDetected kernel version z,, which is below the recommended minimum of zo; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.T)main_process_only)	platformunamesystemresplitreleaser   r   r   )infor   _versionmin_versionmsgr"   r"   r#   check_os_kernel  s   r   attrc                 C   s   dd }t || g|d S )z
    Recursive `getattr`.

    Args:
        obj:
            A class instance holding the attribute.
        attr (`str`):
            The attribute that is to be retrieved, e.g. 'attribute1.attribute2'.
    c                 S   s
   t | |S r3   )rd   )r   r   r"   r"   r#   _getattr  s   
z#recursive_getattr.<locals>._getattr.)r   r   )r   r   r   r"   r"   r#   recursive_getattr  s   r   return_fqnsc           
      C   s   |s| nd| f}|g}g }|rS|  }|r|\}}| D ]$\}}t|tjjrA|r<|r2|d | n|}	||	|f q|| q|rL|||f n|| |s|ddd S )aA  Traverse the model in bottom-up order and return the children modules in that order.

    Args:
        model (`torch.nn.Module`): the model to get the children of

    Returns:
        `list[torch.nn.Module]`: a list of children modules of `model` in bottom-up order. The last element is the
        `model` itself.
    r   r   N)rc   r9   r   r   r.   rA   r4   )
rF   r   topstackordered_modulescurrent_modulecurrent_module_namer=   r   
child_namer"   r"   r#   get_module_children_bottom_up  s&   


r   )TTF)FFr3   )F)Wrx   r   r   r   codecsr   r   	functoolsr   r   typesr   typingr   numpynpr   packaging.versionr   safetensors.torchr	   r   commands.config.defaultr   loggingr   stater   	constantsr   dataclassesr   importsr   r   r   r   r   modelingr   transformer_enginer   versionsr   r   r   torch_xla.core.xla_modelcore	xla_modelr   r.   rA   boolr$   r)   r1   r2   rB   rD   rO   rk   rl   r   r   r   _corenp_core
multiarray_reconstructndarraydtyper   r4   dtypesUInt32DTyper   r   r   intr   r   r   r   r|   r   rz   r   r"   r"   r"   r#   <module>   sx   
	H
U#$
	
 
(