o
    0iEz                     @   s  d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	m
Z
 ddlmZ ddlZddlmZmZ d	d
lmZ d	dlmZmZ d	dlmZmZmZ d	dlmZ e r\ddlm  mZ e reddl m!Z! dd Z"dd Z#dd Z$dd Z%dd Z&e"ddddZ'dnddZ(dd  Z)d!d" Z*d#d$ Z+d%d& Z,d'd( Z-d)d* Z.d+d, Z/d-d. Z0G d/d0 d0e1Z2d1d2 Z3d3d4 Z4e3d5d6 Z5d7efd8d9Z6d7efd:d;Z7dod<d=Z8dpd?d@Z9ej:d	ej;dej<dAej=dBej>dCej?dDej@dEejAdFejBdGejCdHi
ZDdIdJ eDE D ZFdKdL ZGdqdMejHfdNdOZIe3dodPeJfdQdRZKdodPeJfdSdTZLdrdUdVZMdodWdXZNG dYdZ dZeOZPe4dsd[d\ZQdod]d^ZRe3dtdadbZSdcdd ZTG dedf dfZUdgdh ZVdidj ZWedudldmZXdS )vzB
A set of basic tensor ops compatible with tpu, gpu, and multigpu
    N)Mapping)contextmanagernullcontext)update_wrapperwraps)Any   )AcceleratorStatePartialState   )!TORCH_DISTRIBUTED_OPERATION_TYPES)DistributedTypeTensorInformation)is_npu_availableis_torch_distributed_availableis_torch_xla_available)is_torch_version)ReduceOpc                 C   s   t | tjS N)
isinstancetorchTensortensor r   ]/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/accelerate/utils/operations.pyis_torch_tensor-   s   r   c              	   C   s2   t | tjjtjjtjjtjjtjjtjjtjj	S r   )
r   r   xpuFloatTensor
ByteTensor	IntTensor
LongTensor
HalfTensorDoubleTensorBFloat16Tensorr   r   r   r   is_torch_xpu_tensor1   s   r%   c                 C   s
   t | tS r   )r   r   tensor_infor   r   r   is_tensor_information>      
r(   c                 C   s   t | tot| dot| dS )z
    Checks if `data` is a `namedtuple` or not. Can have false positives, but only if a user is trying to mimic a
    `namedtuple` perfectly.
    _asdict_fields)r   tuplehasattrdatar   r   r   is_namedtupleB   s   r0   c                 C   s$   t | rt| t| S t| |S )zO
    Cast a generator to the same type as obj (list, tuple, or namedtuple)
    )r0   typelist)obj	generatorr   r   r   
honor_typeJ   s   r5   F	test_typeerror_on_other_typec                   s   t |ttfrt| fdd|D S t |tr/t| fdd| D S |r>|g R i S rRtdt| dj dj d|S )	ad  
    Recursively apply a function on a data structure that is a nested list/tuple/dictionary of a given base type.

    Args:
        func (`callable`):
            The function to recursively apply.
        data (nested list/tuple/dictionary of `main_type`):
            The data on which to apply `func`
        *args:
            Positional arguments that will be passed to `func` when applied on the unpacked data.
        main_type (`type`, *optional*, defaults to `torch.Tensor`):
            The base type of the objects to which apply `func`.
        error_on_other_type (`bool`, *optional*, defaults to `False`):
            Whether to return an error or not if after unpacking `data`, we get on an object that is not of type
            `main_type`. If `False`, the function will leave objects of types different than `main_type` unchanged.
        **kwargs (additional keyword arguments, *optional*):
            Keyword arguments that will be passed to `func` when applied on the unpacked data.

    Returns:
        The same data structure as `data` with `func` applied to every object of type `main_type`.
    c                 3   s.    | ]}t |g R d V  qdS )r6   Nrecursively_apply).0oargsr8   funckwargsr7   r   r   	<genexpr>n   s    
z$recursively_apply.<locals>.<genexpr>c                    s0   i | ]\}}|t |g R d qS )r6   r9   r;   kvr=   r   r   
<dictcomp>w   s    z%recursively_apply.<locals>.<dictcomp>zUnsupported types (z) passed to `z?`. Only nested list/tuple/dicts of objects that are valid for `z` should be passed.)	r   r,   r2   r5   r   r1   items	TypeError__name__)r?   r/   r7   r8   r>   r@   r   r=   r   r:   U   s,   
	r:   c              
      s4  t | s	t| dr^ dkrd z| j dW S  ty%   |   Y S  tyF } zt r:t tr9d   n|W Y d}~nd}~ww z| j dW S  ty]   |   Y S w t| tt	frst
|  fdd| D S t| trttrgndu rg t|  fd	d
|  D S | S )a  
    Recursively sends the elements in a nested list/tuple/dictionary of tensors to a given device.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to a given device.
        device (`torch.device`):
            The device to send the data to.

    Returns:
        The same data structure as `tensor` with all tensors sent to the proper device.
    tonpuznpu:0)non_blockingznpu:Nc                 3   s     | ]}t | d V  qdS )rK   	skip_keysNsend_to_device)r;   tdevicerK   rM   r   r   rA      s    z!send_to_device.<locals>.<genexpr>c              	      s.   i | ]\}}||v r|nt | d qS )rL   rN   )r;   rC   rP   rQ   r   r   rE      s    z"send_to_device.<locals>.<dictcomp>)r   r-   rI   rG   AssertionErrorr   r   intr,   r2   r5   r   strr1   rF   )r   rR   rK   rM   errorr   rQ   r   rO      sF   



rO   c                 C      dd }t || S )aK  
    Recursively gathers the information needed to rebuild a nested list/tuple/dictionary of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to analyze.

    Returns:
        The same data structure as `data` with [`~utils.TensorInformation`] instead of tensors.
    c                 S   s   t | j| jdS )N)shapedtype)r   rX   rY   r   r   r   r   _get_data_structure   s   z/get_data_structure.<locals>._get_data_structurer9   )r/   rZ   r   r   r   get_data_structure      
r[   c                 C   rW   )a:  
    Recursively gathers the shape of a nested list/tuple/dictionary of tensors as a list.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to analyze.

    Returns:
        The same data structure as `data` with lists of tensor shapes instead of tensors.
    c                 S   s
   t | jS r   )r2   rX   r   r   r   r   
_get_shape   r)   zget_shape.<locals>._get_shaper9   )r/   r]   r   r   r   	get_shape   r\   r^   c                 C   s   dd }t || tdS )z
    Recursively initializes tensors from a nested list/tuple/dictionary of [`~utils.TensorInformation`].

    Returns:
        The same data structure as `data` with tensors instead of [`~utils.TensorInformation`].
    c                 S   s   t j| jd| jiS NrY   )r   emptyrX   rY   r&   r   r   r   _initialize_tensor      z.initialize_tensors.<locals>._initialize_tensorr7   )r:   r(   )data_structurera   r   r   r   initialize_tensors   s   re   c                 C   s   t | tttfrt| dkrtdt|  dt | ttfr%t| d S t | tr9|  D ]	}t| |   S nt | t	j
sItdt|  d| jd S )a  
    Recursively finds the batch size in a nested list/tuple/dictionary of lists of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

    Returns:
        `int`: The batch size.
    r   z&Cannot find the batch size from empty .z0Can only find the batch size of tensors but got )r   r,   r2   r   len
ValueErrorr1   find_batch_sizekeysr   r   rG   rX   )r/   rC   r   r   r   ri      s   


ri   c              	   C   s$   zt | W S  ttfy   Y dS w )a  
    Same as [`utils.operations.find_batch_size`] except will ignore if `ValueError` and `TypeErrors` are raised

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

    Returns:
        `int`: The batch size.
    N)ri   rh   rG   r.   r   r   r   ignorant_find_batch_size  s   

rk   c                 C   rW   )aS  
    Recursively finds tensors in a nested list/tuple/dictionary and converts them to a list of numbers.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to convert to regular numbers.

    Returns:
        The same data structure as `data` with lists of numbers instead of `torch.Tensor`.
    c                 S   s,   |    } | jtjkr| tj} |  S r   )detachcpurY   r   bfloat16rI   float32tolistr   r   r   r   _convert_to_list!  s   z!listify.<locals>._convert_to_listr9   )r/   rq   r   r   r   listify  s   
	rr   c                 C   s"   dd }t || dd}t  |S )Nc                 S   s0   | j dkr|  d  } |  s|  } t| S )Nr   )ndimcloneis_contiguous
contiguousxm
all_gatherr   r   r   r   _tpu_gather_one.  s
   

z$_tpu_gather.<locals>._tpu_gather_oneTr8   )r:   rw   	mark_step)r   ry   resr   r   r   _tpu_gather-  s   	r}   c                    sJ   t  tjj jjdkrtddrtj   fdd}t	|| ddS )Nr   z<=z2.8c                    s    j dkr  d     s   jd ur@jdkr@tjj    j	j
d}|  |jdg  dd  R  S  fddtjD }tj|  tj|ddS )	Nr   gloorY   rR   r   c                    s   g | ]}t  qS r   )r   
empty_liker;   _r   r   r   
<listcomp>\      z8_gpu_gather.<locals>._gpu_gather_one.<locals>.<listcomp>dim)rs   rt   ru   rv   backendr   r`   num_processesnumelrY   rR   viewsizerangedistributedrx   cat)r   output_tensors	gather_opstater   r   _gpu_gather_oneD  s   

z$_gpu_gather.<locals>._gpu_gather_oneTrz   )
r
   r   r   all_gather_into_tensorrR   r1   r   r   synchronizer:   )r   r   r   r   r   _gpu_gather<  s   
r   c                   @   s   e Zd ZdZdS )DistributedOperationExceptionz
    An exception class for distributed operations. Raised if the operation cannot be performed due to the shape of the
    tensors.
    N)rH   
__module____qualname____doc__r   r   r   r   r   c  s    r   c                       t   fdd}|S )zv
    Verifies that `tensor` is the same shape across all processes. Only ran if `PartialState().debug` is `True`.
    c                     s  t  jtjkst  js | i |S  j d j }d|v r$|d }n| d }t  jjt	|jkrNt
d| d|jj dt  jj dt  jj d| dt|}t|g}|d d ur||d t|k}|sd	d
d t|D }t
d| d|  | i |S )Nrf   r   r   z%One or more of the tensors passed to z were not on the z+ while the `Accelerator` is configured for z. Please move it to the z before calling z
  - c                 S   s    g | ]\}}d | d| qS )zProcess z: r   )r;   irX   r   r   r   r     s     z5verify_operation.<locals>.wrapper.<locals>.<listcomp>znCannot apply desired operation due to shape mismatches. All shapes across devices must be valid.

Operation: `z`
Input shapes:
  - )r
   distributed_typer   NOdebugr   rH   rR   r1   find_devicer   r^   gather_objectcountrg   join	enumerate)r>   r@   	operationr   shapesoutputare_sameprocess_shape_strfunctionr   r   wrapperq  s8   

z!verify_operation.<locals>.wrapperr   r   r   r   r   r   verify_operationl  s   r   c                    r   )z
    Checks that `verify_operation` failed and if so reports a more helpful error chaining the existing
    `DistributedOperationException`.
    c               
      sN   z | i |W S  t y& } z j d j }t d| d|d }~ww )Nrf   zError found while calling `z1`. Please see the earlier error for more details.)r   r   rH   )r>   r@   er   r   r   r   r     s   
z"chained_operation.<locals>.wrapperr   r   r   r   r   chained_operation  s   	r   c                 C   s.   t  jtjkrt| S t  jtv rt| S | S )a4  
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from all devices.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.

    Returns:
        The same data structure as `tensor` with all tensors sent to the proper device.
    )r
   r   r   XLAr}   r   r   r   r   r   r   gather  s
   r   objectc                 C   s2   dd t t jD }tj||  dd |D S )Nc                 S   s   g | ]}d qS r   r   r   r   r   r   r     s    z&_gpu_gather_object.<locals>.<listcomp>c                 S   s   g | ]	}|D ]}|qqS r   r   )r;   yxr   r   r   r     s    )r   r
   r   r   r   all_gather_object)r   output_objectsr   r   r   _gpu_gather_object  s   r   c                 C   s.   t  jtjkrtdt  jtv rt| S | S )a5  
    Recursively gather object in a nested list/tuple/dictionary of objects from all devices.

    Args:
        object (nested list/tuple/dictionary of picklable object):
            The data to gather.

    Returns:
        The same data structure as `object` with all the objects sent to every device.
    z&gather objects in TPU is not supported)r
   r   r   r   NotImplementedErrorr   r   )r   r   r   r   r     s
   r   c                 C   s   ddd}t || d|dS )Nr   c                 S   s   t jj| |d | S )Nsrc)r   r   	broadcast)r   r   r   r   r   _gpu_broadcast_one  s   z*_gpu_broadcast.<locals>._gpu_broadcast_oneT)r8   r   r   r9   )r/   r   r   r   r   r   _gpu_broadcast  s   
r   broadcast tensorc                    sh   t | ttfrt|  fddt| D S t | tr)t|  fdd|  D S t	 | fddS )Nc                 3   s*    | ]\}}t |  d | dV  qdS )r   nameN_tpu_broadcast)r;   r   rP   r   r   r   rA        ( z!_tpu_broadcast.<locals>.<genexpr>c                    s(   i | ]\}}|t |  d | dqS )r   r   r   rB   r   r   r   rE        ( z"_tpu_broadcast.<locals>.<dictcomp>c                       |   S r   r   r   r   r   r   <lambda>      z _tpu_broadcast.<locals>.<lambda>)
r   r2   r,   r5   r   r   r1   rF   rw   mesh_reduce)r   r   r   r   )r   r   r   r     s
   
r                     	   
   c                 C   s   i | ]\}}||qS r   r   rB   r   r   r   rE     r   rE   c                 C   s   d}t  }tj|tj|jd}| dur/| j}t| j }tjt	||g td|dt
|d < t|dd}||  }t|dd d	 }|dd }||fS )
ze
    Grabs the shape of `tensor` only available on one process and returns a tensor of its shape
    i   r   NrY   r   sum	reductionr   r   )r
   r   r`   rT   rR   rX   TENSOR_TYPE_TO_INTrY   r   r2   rg   reducenonzero)r   max_tensor_dimensionr   base_tensorrX   tensor_dtyperY   r   r   r   gather_tensor_shape  s   
(r   returnc                 C   s@   t  }t| \}}| du rtj|t| d|j} t| ddS )a  
    Copies a tensor that only exists on a single device and broadcasts it to other devices. Differs from `broadcast` as
    each worker doesn't need to know its shape when used (and tensor can be `None`)

    Args:
        tensor (`torch.tensor`):
            The tensor that should be sent to all devices. Must only have it be defined on a single device, the rest
            should be `None`.
    Nr   r   r   )r
   r   r   zerosTENSOR_INT_TO_DTYPErI   rR   r   )r   r   rX   rY   r   r   r   copy_tensor_to_devices	  s
   
r   from_processc                 C   s8   t  jtjkrt| |ddS t  jtv rt| |dS | S )a  
    Recursively broadcast tensor in a nested list/tuple/dictionary of tensors to all devices.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.
        from_process (`int`, *optional*, defaults to 0):
            The process from which to send the data

    Returns:
        The same data structure as `tensor` with all tensors broadcasted to the proper device.
    zaccelerate.utils.broadcast)r   r   r   )r
   r   r   r   r   r   r   )r   r   r   r   r   r     s
   r   c                    s^   t  jtjkrt| D ]\}}td| fdd| |< q| S t  jtv r-tj	j
|  d | S )a  
    Broadcast a list of picklable objects from one process to the others.

    Args:
        object_list (list of picklable objects):
            The list of objects to broadcast. This list will be modified inplace.
        from_process (`int`, *optional*, defaults to 0):
            The process from which to send the data.

    Returns:
        The same list containing the objects from process 0.
    z&accelerate.utils.broadcast_object_listc                    r   r   r   r   r   r   r   r   ?  r   z'broadcast_object_list.<locals>.<lambda>r   )r
   r   r   r   r   rw   r   r   r   r   broadcast_object_list)object_listr   r   r3   r   r   r   r   0  s   r   c                 C   s   dd }t || |S )aN  
    Recursively takes a slice in a nested list/tuple/dictionary of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to slice.
        tensor_slice (`slice`):
            The slice to take.

    Returns:
        The same data structure as `data` with all the tensors slices.
    c                 S   s   | | S r   r   )r   tensor_slicer   r   r   _slice_tensorS     z$slice_tensors.<locals>._slice_tensorr9   )r/   r   process_indexr   r   r   r   r   slice_tensorsE  s   r   c                    s   t  d ttfrt d  fddtt d D S t  d tr9t d  fdd d  D S t  d t	j
sLtdt d  t	j dS )a  
    Recursively concatenate the tensors in a nested list/tuple/dictionary of lists of tensors with the same shape.

    Args:
        data (nested list/tuple/dictionary of lists of tensors `torch.Tensor`):
            The data to concatenate.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to concatenate.

    Returns:
        The same data structure as `data` with all the tensors concatenated.
    r   c                 3   s*    | ] t  fd dD dV  qdS )c                       g | ]}|  qS r   r   r;   dr   r   r   r   g      z)concatenate.<locals>.<genexpr>.<listcomp>r   Nconcatenater;   r/   r   r   r   rA   g  r   zconcatenate.<locals>.<genexpr>c                    s(   i | ]  t  fd dD dqS )c                    r   r   r   r   rC   r   r   r   i  r   z*concatenate.<locals>.<dictcomp>.<listcomp>r   r   r   r   r   r   rE   i  r   zconcatenate.<locals>.<dictcomp>z%Can only concatenate tensors but got r   )r   r,   r2   r5   r   rg   r   r1   rj   r   r   rG   r   r   r   r   r   r   Y  s   *(r   c                   @   s   e Zd ZdS )CannotPadNestedTensorWarningN)rH   r   r   r   r   r   r   r   o  s    r   c                 C   s   ddd}t || d|||dS )	a3  
    Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so they
    can safely be gathered.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to pad.
        pad_index (`int`, *optional*, defaults to 0):
            The value with which to pad.
        pad_first (`bool`, *optional*, defaults to `False`):
            Whether to pad at the beginning or the end.
    r   Fc           	         s  t | ddrtdt | S  t| jks t| j k r| S  dk r* t| j7  tj| j| jdd  }t	|
 }t fdd|D | j  krO| S | jt}| < | t|| }|rwt fddtt|D }nt fd	dtt|D }| ||< |S )
N	is_nestedFzHCannot pad nested tensors without more information. Leaving unprocessed.r   )rR   c                 3   s    | ]}|  V  qd S r   r   )r;   sr   r   r   rA     s    zFpad_across_processes.<locals>._pad_across_processes.<locals>.<genexpr>c                 3   s2    | ]}| krt    nt d V  qd S r   slicer;   r   r   max_sizeold_sizer   r   rA     s    "
c                 3   .    | ]}| krt d   nt dV  qdS r   Nr   r   r   r   r   r   rA        , )getattrwarningswarnr   rg   rX   r   r   rR   r   rm   maxr2   	new_zerosr,   r   )	r   r   	pad_index	pad_firstr   sizesnew_size
new_tensorindicesr   r   r   _pad_across_processes  s4   

 z3pad_across_processes.<locals>._pad_across_processesT)r8   r   r
  r  Nr   r   Fr9   )r   r   r
  r  r  r   r   r   pad_across_processess  s   
"r  c                 C   s   ddd}t || d|||dS )z
    Takes a `tensor` of arbitrary size and pads it so that it can work given `num_processes` needed dimensions.

    New tensors are just the last input repeated.

    E.g.:
      Tensor: ([3,4,4]) Num processes: 4 Expected result shape: ([4,4,4])

    r   c           
         s   || }|||  }|| dkr|| }n|||  }|||@   kr'dk r-n n|| }| j t}|| |d< | t|}t fddtt|D }	| ||	< |S )Nr   r   c                 3   r  r  r   r   r  r   r   rA     r  z@pad_input_tensors.<locals>._pad_input_tensors.<locals>.<genexpr>)rX   r2   r	  r,   r   rg   )
r   
batch_sizer   r   	remainderlast_inputsto_padr  r  r  r   r  r   _pad_input_tensors  s   
 z-pad_input_tensors.<locals>._pad_input_tensorsT)r8   r  r   r   Nr   r9   )r   r  r   r   r  r   r   r   pad_input_tensors  s   
r  mean      ?c                 C   s   ddd}t || d||dS )	aW  
    Recursively reduce the tensors in a nested list/tuple/dictionary of lists of tensors across all processes by the
    mean of a given operation.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to reduce.
        reduction (`str`, *optional*, defaults to `"mean"`):
            A reduction method. Can be of "mean", "sum", or "none"
        scale (`float`, *optional*):
            A default scaling value to be applied after the reduce, only valid on XLA.

    Returns:
        The same data structure as `data` with all the tensors reduced.
    r  r  c                 S   s   t  }|  }|jtjkr|S |jtjkr't  ttj	|g| t  n|jj
tv r5tj|tj |dkr>||j }|S )Nr  )r
   rt   r   r   r   r   rw   r{   
all_reduce
REDUCE_SUMvaluer   r   r   r   SUMr   )r   r   scaler   cloned_tensorr   r   r   _reduce_across_processes  s   

z(reduce.<locals>._reduce_across_processesT)r8   r   r  Nr  r  r9   )r   r   r  r!  r   r   r   r     s   

r   c                 C   s   dd }dd }t || |dS )av  
    Recursively converts the elements nested list/tuple/dictionary of tensors in FP16/BF16 precision to FP32.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to convert from FP16/BF16 to FP32.

    Returns:
        The same data structure as `tensor` with all tensors that were in FP16/BF16 precision converted to FP32.
    c                 S   s   |   S r   )floatr   r   r   r   _convert_to_fp32	  r   z)convert_to_fp32.<locals>._convert_to_fp32c                 S   s$   t | s	t| do| jtjtjfv S r_   )r   r-   rY   r   float16rn   r   r   r   r   _is_fp16_bf16_tensor  s   z-convert_to_fp32.<locals>._is_fp16_bf16_tensorrc   r9   )r   r$  r&  r   r   r   convert_to_fp32  s   r'  c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	ConvertOutputsToFp32ae  
    Decorator to apply to a function outputting tensors (like a model forward pass) that ensures the outputs in FP16
    precision will be convert back to FP32.

    Args:
        model_forward (`Callable`):
            The function which outputs we want to treat.

    Returns:
        The same function as `model_forward` but with converted outputs.
    c                 C   s   || _ t| | d S r   )model_forwardr   )selfr)  r   r   r   __init__"  s   zConvertOutputsToFp32.__init__c                 O   s   t | j|i |S r   )r'  r)  )r*  r>   r@   r   r   r   __call__&  rb   zConvertOutputsToFp32.__call__c                 C   s
   t d)NzCannot pickle a prepared model with automatic mixed precision, please unwrap the model with `Accelerator.unwrap_model(model)` before pickling it.)picklePicklingError)r*  r   r   r   __getstate__)  s   z!ConvertOutputsToFp32.__getstate__N)rH   r   r   r   r+  r,  r/  r   r   r   r   r(    s
    r(  c                    s   t    fdd} |_|S )Nc                     s    | i |S r   r   )r>   r@   r)  r   r   forward2  s   z(convert_outputs_to_fp32.<locals>.forward)r(  __wrapped__)r)  r1  r   r0  r   convert_outputs_to_fp32/  s   r3  c                 C   s~   t | tr|  D ]}t|}|dur|  S q	dS t | ttfr4| D ]}t|}|dur1|  S q#dS t | tjr=| jS dS )z
    Finds the device on which a nested dict/list/tuple of tensors lies (assuming they are all on the same device).

    Args:
        (nested list/tuple/dictionary of `torch.Tensor`): The data we want to know the device of.
    N)	r   r   valuesr   r,   r2   r   r   rR   )r/   r3   rR   r   r   r   r   ;  s"   
r   Tc                 c   s|    t  jtjkst  jdurt  j st }nddl}|jj	| |||d}| dV  W d   dS 1 s7w   Y  dS )z
    Wrapper around `deepspeed.runtime.zero.GatheredParameters`, but if Zero-3 is not enabled, will be a no-op context
    manager.
    Nr   )modifier_rank
fwd_moduleenabled)
r	   r   r   	DEEPSPEEDdeepspeed_pluginis_zero3_init_enabledr   	deepspeedzeroGatheredParameters)paramsr5  r6  r7  gather_param_contextr;  r   r   r   r=  P  s   
"r=  )FNr   )r   r   r   )NNr  r"  )NNT)Yr   r-  r  collections.abcr   
contextlibr   r   	functoolsr   r   typingr   r   r   r	   r
   	constantsr   dataclassesr   r   importsr   r   r   versionsr   torch_xla.core.xla_modelcore	xla_modelrw   torch.distributedr   r   r%   r(   r0   r5   r:   rO   r[   r^   re   ri   rk   rr   r}   r   	Exceptionr   r   r   r   r   r   r   r   r#  doublehalfrn   uint8int8int16int32int64boolr   rF   r   r   r   r   rT   r   r   r   r   UserWarningr   r  r  r   r'  r(  r3  r   r=  r   r   r   r   <module>   s   
34'	#


	


7()