o
    i&                     @   sj   d dl Z d dlZd dlZd dlmZ ddlmZmZmZm	Z	 e	
eZdefddZdd	 Zdd
dZdS )    N)
DataLoader   )WEIGHTS_NAMEPushToHubMixinis_torch_xla_availablelogging
dataloaderc                 C   s`   t  r.dd lm  m} t| |jsJ ddd lm  m} ||	 d}|| j
d< | S | S )Nr   zPThe dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`.)fsdpNinput_sharding)r   %torch_xla.distributed.parallel_loaderdistributedparallel_loader
isinstanceMpDeviceLoadertorch_xla.distributed.spmdspmdShardingSpecget_global_mesh_parallel_loader_kwargs)r   plxssharding_spec r   _/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/integrations/tpu.pytpu_spmd_dataloader   s   
r   c                    s  ddl m  m ddlm  m ddlm} zddlm	  ddlm
 ddlm}m} r5ddlm W n tyA   td	w d}d}t| d
d}|jd|}	|jd dkretj||jd d}n%|	durt }
|	D ]}|| |}|du r}td|
| qntj||
d}|j}|jd r| jjrtd d| j_ fdd}rfdd}| |||d} n | f||d|} di ffdd	}|_| S )a.  
    Wraps a model with XLA Fully Sharded Data Parallelism (FSDP).

    Handles both FSDP v1 (`XlaFullyShardedDataParallel`) and v2 (`SpmdFullyShardedDataParallel`),
    including auto-wrap policies, gradient checkpointing, and patching `xm.optimizer_step`.

    Args:
        model (`torch.nn.Module`): The model to wrap.
        args (`TrainingArguments`): The training arguments containing FSDP configuration.
        is_fsdp_xla_v2_enabled (`bool`): Whether FSDP v2 (SPMD) is enabled.

    Returns:
        `torch.nn.Module`: The FSDP-wrapped model.
    r   Nr   )get_module_class_from_name)XlaFullyShardedDataParallel)checkpoint_module)size_based_auto_wrap_policytransformer_auto_wrap_policy)SpmdFullyShardedDataParallelzJMissing XLA FSDP related module; please make sure to use torch-xla >= 2.0._no_split_modulestransformer_layer_cls_to_wrapmin_num_params)r#   z@Could not find the transformer layer class to wrap in the model.)transformer_layer_clsxla_fsdp_grad_ckptzX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fc                    s&   s n}|| g|R i |S Nr   )margskwargs
target_cls)FSDPFSDPv2r   is_fsdp_xla_v2_enabledr   r   auto_wrapper_callablet   s   z2wrap_model_xla_fsdp.<locals>.auto_wrapper_callablec                    sh   ddl m} d }t| tjr| }nt| tr| d }nt| |r#| j}|d u r+td ||d d S )Nr   )CausalLMOutputWithPastr   zASomething went wrong, the output of the model shouldn't be `None`)r	   NN)	modeling_outputsr/   r   torchTensortuplelogits
ValueErrormark_sharding)outputmeshr/   real_output)r   r   r   shard_output{   s   


z)wrap_model_xla_fsdp.<locals>.shard_output)r:   auto_wrap_policyr.   )r;   r.   c                    s    | j di |}|r   |S )Nr   )step	mark_step)	optimizerbarrieroptimizer_argsloss)xmr   r   patched_optimizer_step   s   z3wrap_model_xla_fsdp.<locals>.patched_optimizer_step)torch_xla.core.xla_modelcore	xla_modelr   r   r   trainer_pt_utilsr   torch_xla.distributed.fsdpr   r   torch_xla.distributed.fsdp.wrapr   r   7torch_xla.experimental.spmd_fully_sharded_data_parallelr    ImportErrorgetattrfsdp_configget	functoolspartialset	Exceptionaddxla_fsdp_configconfig	use_cacheloggerwarning_onceoptimizer_step)modelr(   r-   r   r   r   r;   r.   %default_transformer_cls_names_to_wrap"fsdp_transformer_layer_cls_to_wraptransformer_cls_to_wraplayer_classtransformer_clsfsdp_kwargsr:   rC   r   )r+   r,   r   r-   rB   r   r   wrap_model_xla_fsdp.   sz   


	ra   c              	   C   s   ddl m  m} |dur|n|j}td|  |  |jddr6tj	|dd t
|tj|d tf}|d	 |r|  |  d
}tj|d|j d|j dt }	|j||	dd |d |jrddlm}
 |
tj|ddt dd\}}| jj} || }t||r|j||d nVtd ||tj|t nEt| |st|| |r|| j||j||  d n&td ||  }||tj|t n| j||j||  d |dur|jr|| dS dS dS )a  
    Saves a model checkpoint on TPU/XLA devices.

    Handles FSDP v1 sharded checkpoints (with consolidation on master), as well as
    standard XLA model saving via `save_pretrained` or `xm.save`.

    Args:
        model (`torch.nn.Module`): The model to save.
        args (`TrainingArguments`): The training arguments.
        accelerator (`Accelerator`): The accelerator instance.
        processing_class: The processing class (tokenizer/processor) to save alongside the model.
        is_fsdp_xla_v1_enabled (`bool`): Whether FSDP XLA v1 is enabled.
        output_dir (`str`, *optional*): The directory to save to. Defaults to `args.output_dir`.
    r   NzSaving model checkpoint to F)localT)exist_okztraining_args.binsaving_checkpoint)rZ   shard_metadatarankz-of--)master_onlysave_full_checkpoints)%consolidate_sharded_model_checkpoints zrank*-of-*-)ckpt_prefixckpt_suffix
save_model)
state_dictzETrainer.model is not a `PreTrainedModel`, only saving its state dict.)is_main_processro   )rD   rE   rF   
output_dirrW   infor=   is_master_ordinalosmakedirsr1   savepathjoinr   
rendezvousro   get_shard_metadataprocess_index
world_sizer   should_saverH   rj   moduleunwrap_modelr   save_pretrained_maybe_convert_to_cpu)rZ   r(   acceleratorprocessing_classis_fsdp_xla_v1_enabledrq   rB   supported_classesckpt	ckpt_pathrj   full_state_dict_unwrapped_modelro   r   r   r   save_tpu_checkpoint   s`   
$







r   r&   )rO   rt   r1   torch.utils.datar   utilsr   r   r   r   
get_logger__name__rW   r   ra   r   r   r   r   r   <module>   s   
w