o
    0i                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlm	  m
Z d dlmZmZmZ d dlmZ ddlmZ ddlmZ d	d
lmZ d	dlmZmZ e rd dlmZmZ d dlmZ  d dlm!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZB d dlCmDZDmEZEmFZFmGZGmHZH d dlImJZJmKZKmLZL d dlMmNZN d dlOmPZPmQZQmRZRmSZSmTZTmUZU d dlVmWZW d dlXmYZYmZZZm[Z[m\Z\m]Z]m^Z^ d d l_m`Z`maZambZbmcZc dHd"d#Zdd$d% ZeG d&d' d'Zfd(d) Zgd*d+ ZhG d,d- d-eZid.d/ ZjG d0d1 d1ZkG d2d3 d3eZld4d5 ZmG d6d7 d7eZnG d8d9 d9enZoG d:d; d;enZpG d<d= d=enZqd>d? Zrdi fd@dAZsG dBdC dCej	jtZudDdE ZvdFdG ZwdS )I    N)ABC)partial)BCEWithLogitsLossCrossEntropyLossMSELoss)DistributedDataParallel   )AcceleratedOptimizer)AcceleratedScheduler   )is_megatron_lm_available)recursively_applysend_to_device)mputensor_parallel)finalize_model_grads)	ModelType)get_num_microbatches)get_megatron_optimizer)get_tensor_model_parallel_group"get_tensor_model_parallel_src_rank)get_forward_backward_func)get_model_config)broadcast_int_listbroadcast_tensor)%beam_search_and_return_on_first_stage/generate_tokens_probs_and_return_on_first_stage)build_train_valid_test_datasets)	BertModelFloat16ModuleGPTModelT5Model)Classification)get_argsget_tensorboard_writerget_tokenizerprint_rank_last)_add_data_args_add_validation_args!core_transformer_config_from_args
parse_argsvalidate_args)load_args_from_checkpointload_checkpointsave_checkpoint)set_global_variables)_compile_dependencies_init_autoresume_initialize_distributed_set_random_seedset_jit_fusion_optionswrite_args_to_tensorboard)_vocab_size_with_padding)%build_train_valid_test_data_iteratorsget_optimizer_param_schedulernum_floating_point_operationssetup_model_and_optimizer
train_steptraining_log))average_losses_across_data_parallel_groupcalc_params_l2_normget_ltor_masks_and_position_idsunwrap_modelTc           	   	   C   s   t  }|jrdnd}|jdkrtd|j d| d td t|}|jdkrK|jr?|jr0d	nd}t|||jd
| |d}|S t||j	d	| |d}|S |jdkr[t
|dd
| |d}|S |jdkrmt|dd
| |||d}|S td|j )zBuild the model.zpre-trainingzfine-tuningr   z	Building z model in the z mode.zThe Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.bertr   T)confignum_tokentypesadd_binary_headparallel_outputpre_processpost_process)rB   num_classesrC   rF   rG   gpt)rB   rC   rE   rF   rG   t5)rB   rC   rE   rF   rG   add_encoderadd_decoderUnsupported model type: )r#   pretraining_flagrankprintmodel_type_namer)   bert_binary_headr   r"   
num_labelsr    r!   
ValueError)	rF   rG   rK   rL   argsmoderB   rC   model rX   ^/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/accelerate/utils/megatron_lm.pymodel_provider_func[   sb   

$

rZ   c                 C   s   |  d t }| jjjd ur3| jjjd u rtd| jjj}| jj|}t| |}t| |d d}n(t	j
}|jdkr>t	j}t}| jjjd urL| jjj}t|||j|j|jd\}}}t||_|||fS )Nz#Preparing model optimizer schedulerzaYou must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`.)	schedulerrJ   )no_wd_decay_condscale_lr_condlr_mult)rP   r#   statemegatron_lm_plugincustom_prepare_model_functioncustom_model_provider_functionrT   prepare_optimizerprepare_schedulerr   encoder_or_decoderrQ   encoder_and_decoderrZ   r:   r\   r]   r^   len	model_len)acceleratorrU   custom_model_provider_funcrW   	optimizerr[   
model_typemodel_provider_func_rX   rX   rY   !prepare_model_optimizer_scheduler   s4   






rn   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )MegatronLMDummyDataLoaderz
    Dummy dataloader presents model parameters or param groups, this is primarily used to follow conventional training

    Args:
        **dataset_kwargs: Megatron data arguments.
    c                 K   sH   t  }t|}t|}| }t|d | _| j| d| jd< d S )Nr   Tmegatron_dataset_flag)argparseArgumentParserr'   r(   parse_known_argsvarsdataset_argsupdate)selfdataset_kwargsparser	data_argsrX   rX   rY   __init__   s   z"MegatronLMDummyDataLoader.__init__c              
   C   s^   t  }| j D ]$\}}t||d}||kr&td| d| d| d|  t||| qd S )N z<WARNING: MegatronLMDummyDataLoader overriding arguments for : with )r#   ru   itemsgetattrrP   setattr)rw   rU   keyvalue	old_valuerX   rX   rY   set_megatron_data_args   s   z0MegatronLMDummyDataLoader.set_megatron_data_argsc                 C   s   dd }|j jjd ur|j jjS z9t }|jdkr%ddlm} d|_|W S |jdkr6ddlm} d|_|W S |jdkrGddl	m} d|_|W S W |S  t
yS   Y |S w )	Nc                 S   s   t  }t|jttfr|jn|jg|j| |jd}|jdkr)||j	|j
d n'|jdkr7|d|j	i n|jdkrH||j|jdd ntd|j td
i |\}}}|||fS )z&Build train, valid, and test datasets.)data_prefixsplits_stringtrain_valid_test_num_samplesseedrA   )max_seq_lengthbinary_headrI   r   rJ   )r   max_seq_length_decdataset_typerM   NrX   )r#   
isinstance	data_pathlisttuplesplitr   rQ   rv   
seq_lengthrR   encoder_seq_lengthdecoder_seq_lengthrT   r   )train_val_test_num_samplesrU   ru   train_dsvalid_dstest_dsrX   rX   rY   "train_valid_test_datasets_provider   s6   



zlMegatronLMDummyDataLoader.get_train_valid_test_datasets_provider.<locals>.train_valid_test_datasets_providerrA   r   )r   TrI   rJ   )r_   r`   *custom_megatron_datasets_provider_functionr#   rQ   pretrain_bertr   is_distributedpretrain_gptpretrain_t5ImportError)rw   ri   r   rU   rX   rX   rY   &get_train_valid_test_datasets_provider   s.   #



z@MegatronLMDummyDataLoader.get_train_valid_test_datasets_providerc           	      C   s   t  }| |}|jd ur=g }g }g }tt|ddD ] }t| t|}||d  ||d  ||d  qnt|\}}}|||fS )Nrh   r   r   r   )	r#   r   $virtual_pipeline_model_parallel_sizeranger   r   (set_virtual_pipeline_model_parallel_rankr7   append)	rw   ri   rU   !train_valid_test_dataset_providertrain_data_iteratorvalid_data_iteratortest_data_iteratori	iteratorsrX   rX   rY   r7     s"   




z?MegatronLMDummyDataLoader.build_train_valid_test_data_iteratorsN)__name__
__module____qualname____doc__r{   r   r   r7   rX   rX   rX   rY   ro      s    	
<ro   c                 C   sR   G dd d}|d u }t j|t j| jd}t jj|t t d |s'|r'| S |S )Nc                   @   s   e Zd Zdd Zdd ZdS )z?_handle_megatron_data_iterator.<locals>.DummyMegatronDataloaderc                 S   s   | S NrX   rw   rX   rX   rY   __iter__     zH_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__iter__c                 S   s   i S r   rX   r   rX   rX   rY   __next__!  r   zH_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__next__N)r   r   r   r   r   rX   rX   rX   rY   DummyMegatronDataloader  s    r   dtypedevicegroup)torchtensorboolr   distributed	broadcastr   r   )ri   data_iteratorr   is_data_iterator_emptyis_src_data_iterator_emptyrX   rX   rY   _handle_megatron_data_iterator  s   
r   c           	   
      sh  |  d t }|jspddlm m} |j|j } fdd D }|d d u rHt|d t	j
jjr9||d _n|d= |d= |d= ||d	 _n|d	= ||d< t	j
jjjfi ||| jt t d
d| j | jdS |jd ur|j\|_|_|_nd\|_|_|_|j|j |_| \}}}|j|j |_t| |d}t| |d}t| |d}|||fS )NzPreparing dataloaderr   )_PYTORCH_DATALOADER_KWARGSprepare_data_loaderc                    s   i | ]}|t | | qS rX   )r   ).0kr   
dataloaderrX   rY   
<dictcomp>5  s    z'prepare_data_loader.<locals>.<dictcomp>
batch_sizesamplershufflebatch_samplerFT)num_processesprocess_indexsplit_batchesput_on_device	rng_typesdispatch_batches)r   r   r   )ri   r   )rP   r#   rp   data_loaderr   r   micro_batch_sizenum_micro_batchesr   r   utilsdataBatchSamplerr   
DataLoaderdatasetr   r   get_data_parallel_world_sizeget_data_parallel_rankr   copyr   consumed_samplesconsumed_train_samplesconsumed_valid_samplesconsumed_test_samplesr7   r   )	ri   r   rU   r   r   kwargsr   r   r   rX   r   rY   r   .  s`   


r   c                       s:   e Zd Z fddZd
ddZdd Zedd	 Z  ZS )MegatronLMOptimizerWrapperc                    s   t  j|dd d d S )NF)device_placementscalersuperr{   )rw   rk   	__class__rX   rY   r{   q  s   z#MegatronLMOptimizerWrapper.__init__Nc                 C      d S r   rX   )rw   set_to_nonerX   rX   rY   	zero_gradt  r   z$MegatronLMOptimizerWrapper.zero_gradc                 C   r   r   rX   r   rX   rX   rY   stepw  r   zMegatronLMOptimizerWrapper.stepc                 C   s   | j jS )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.)rk   skipped_iterr   rX   rX   rY   step_was_skippedz  s   z+MegatronLMOptimizerWrapper.step_was_skippedr   )	r   r   r   r{   r   r   propertyr   __classcell__rX   rX   r   rY   r   p  s    
r   c                 C   s$   |  d t }t||j|j|jS )NzPreparing optimizer)rP   r#   r   r\   r]   r^   )ri   rW   rU   rX   rX   rY   rc     s   
rc   c                   @   s   e Zd ZdZdddZdS )MegatronLMDummySchedulera  
    Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
    loop when scheduler config is specified in the deepspeed config file.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int):
            Total number of steps.
        warmup_num_steps (int):
            Number of steps for warmup.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    Nr   c                 K   s   || _ || _|| _|| _d S r   )rk   total_num_stepswarmup_num_stepsr   )rw   rk   r   r   r   rX   rX   rY   r{     s   
z!MegatronLMDummyScheduler.__init__Nr   )r   r   r   r   r{   rX   rX   rX   rY   r     s    r   c                       s$   e Zd Z fddZdd Z  ZS )MegatronLMSchedulerWrapperc                    s   t  || d S r   r   )rw   r[   
optimizersr   rX   rY   r{     s   z#MegatronLMSchedulerWrapper.__init__c                 O   r   r   rX   )rw   rU   r   rX   rX   rY   r     r   zMegatronLMSchedulerWrapper.step)r   r   r   r{   r   r   rX   rX   r   rY   r     s    r   c                 C   s   |  d t|}|S )NzPreparing scheduler)rP   r8   )ri   rk   r[   rX   rX   rY   rd     s   
rd   c                       8   e Zd ZdZ fddZdd Zdd Zdd	 Z  ZS )
AbstractTrainStepz;Abstract class for batching, forward pass and loss handler.c                    s   t    || _d S r   )r   r{   name)rw   r   r   rX   rY   r{     s   

zAbstractTrainStep.__init__c                 C   r   r   rX   )rw   ri   rp   rX   rX   rY   get_batch_func  r   z AbstractTrainStep.get_batch_funcc                 C   r   r   rX   r   rX   rX   rY   get_forward_step_func  r   z'AbstractTrainStep.get_forward_step_funcc                 C   r   r   rX   )rw   ri   rX   rX   rY   get_loss_func  r   zAbstractTrainStep.get_loss_func)	r   r   r   r   r{   r   r   r   r   rX   rX   r   rY   r     s    r   c                       r   )
BertTrainStepzg
    Bert train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    sh   t  d | ||j| _| ||j|j| _| 	|j|j
| _|js)d | _d S ddlm} || _d S )Nr   r   )SequenceClassifierOutput)r   r{   r   rp   	get_batchr   rN   rS   	loss_funcr   rR   forward_stepmodel_return_dictmodel_output_classtransformers.modeling_outputsr  )rw   ri   rU   r  r   rX   rY   r{     s   

zBertTrainStep.__init__c                 C   X   dd }dd }|j jjd ur|j jjS |r*z	ddlm} |W S  ty)   Y |S w |S )Nc                 S   s   g d}t j}| durt| }nd}t|||}|d  }|d  }|d  }|d  }|d  }	|d  }
|||||	|
fS )	Build the batch.)texttypeslabels	is_random	loss_maskpadding_maskNr
  r  r  r  r  r  r   int64nextr   broadcast_datalongfloat)r   keysdatatyper   data_btokensr  sentence_orderr  	lm_labelsr  rX   rX   rY   get_batch_megatron  s   
z8BertTrainStep.get_batch_func.<locals>.get_batch_megatronc                 S   s   t | }t|tj }|d  }|d  }d|v r#|d  }nd}d|v r:|d  }|d dktj}nd}d}d|v rI|d  }nd}||||||fS )r	  	input_idsattention_masktoken_type_idsNr  next_sentence_label)r  r   r   cudacurrent_devicer  tor  )r   r   r  r  r  r  r  r  rX   rX   rY   get_batch_transformer  s    z;BertTrainStep.get_batch_func.<locals>.get_batch_transformerr   r  )r_   r`   custom_get_batch_functionr   r  r   rw   ri   rp   r  r%  r  rX   rX   rY   r     s   
zBertTrainStep.get_batch_funcc                    s:   dd } fdd}|j jjd ur|j jjS |r|S |S )Nc           	      S   s   |\}}|  }|   } t|d| d |   }|d urKtj|dd  |ddd}|  }|| }t||g}||d |d dfS |}t|g}|d|d ifS )Nr   )ignore_indexr   r   )lm losszsop lossr+  )r  r   sumviewreshapeFcross_entropyr=   )	r  r  output_tensorlm_loss_
sop_logitslm_losssop_losslossaveraged_lossesrX   rX   rY   loss_func_pretrain  s   ""
z7BertTrainStep.get_loss_func.<locals>.loss_func_pretrainc                    s    dkrt  }||d| d}n&jdkr1| jtjtjfv r1t }||d | d}nt }||| }t	|g}|d|d ifS )Nr   r)  r6  r   )
r   r-  rS   r   r   r  intr   r   r=   )r  logitsloss_fctr6  r7  rS   rw   rX   rY   loss_func_finetune%  s   

z7BertTrainStep.get_loss_func.<locals>.loss_func_finetuner_   r`   custom_loss_function)rw   ri   rN   rS   r8  r=  rX   r<  rY   r     s   
zBertTrainStep.get_loss_funcc                    s    fdd}|S )Nc           
         sb    | \}}}}}} sd}r"|||||d}|tj||fS ||||d}	|	tj|fS )Forward step.Ntokentype_idsr  )rB  r  r   r  )
r   rW   r  r  r  r  r  r  r1  r:  rR   rN   rw   rX   rY   r  ;  s   z9BertTrainStep.get_forward_step_func.<locals>.forward_steprX   )rw   rN   rR   r  rX   rD  rY   r   :  s   z#BertTrainStep.get_forward_step_func	r   r   r   r   r{   r   r   r   r   rX   rX   r   rY   r     s    @)r   c                       r   )
GPTTrainStepzf
    GPT train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    s   t  d | ||j| _| || _|  | _|j	d | _
|jd ur+t }|j| _
|j| _|j| _|j| _|js?d | _d S ddlm} || _d S )NrF  r   r   )!CausalLMOutputWithCrossAttentions)r   r{   r   rp   r  r   r  r   r  padded_vocab_size	eod_token
vocab_filer%   eodreset_position_idsreset_attention_maskeod_mask_lossr  r  r  rG  )rw   ri   rU   	tokenizerrG  r   rX   rY   r{   S  s   



zGPTTrainStep.__init__c                    s`    fdd} fdd}|j jjd ur|j jjS |r.z	ddlm} |W S  ty-   Y |S w |S )Nc                    s   dg}t j}| durt| }nd}t|||}|d  }|ddddf  }|ddddf  }t| j j	 j
 j\}}	}
|||	||
fS )zGenerate a batchr
  Nr   r)  )r   r  r  r   r  r  
contiguousr?   rI  rL  rM  rN  )r   r  r  r   r  tokens_r  r  r  r  position_idsr   rX   rY   r  g  s   

z7GPTTrainStep.get_batch_func.<locals>.get_batch_megatronc           	         s   t | }d|d i}t|tj }|d  }tj|jd df|j|j	d j
 }tj||gdd}|d d dd f  }|d d d df  }t| j
 j jd\}}}|||||fS )Nr  r   r   r   dimr)  T)r  r   r   r"  r#  r  zerosshaper   r   rI  concatrP  r?   rL  rM  )	r   r   rQ  paddingr  r  r  r  rR  r   rX   rY   r%    s   $
z:GPTTrainStep.get_batch_func.<locals>.get_batch_transformerr   r&  )r_   r`   r'  r   r  r   r(  rX   r   rY   r   f  s   
zGPTTrainStep.get_batch_funcc                    s.   t    fdd}|jjjd ur|jjjS |S )Nc                    s   j r|\}}n|}| }| d }  jdkrDtt|d|  d|  dg}tjj|t	
 d |d |d  }nt|d|  |   } jrrtj }| rrJ d| dtj  dt d  t|g}d|d i} j r|d	|i ||fS )
Nr)  r   r   r   zRank z7: found NaN in local forward loss calculation. Device: z, node: r+  r:  )return_logitsr  r-  context_parallel_sizer   catr,  r   
all_reducer   get_context_parallel_groupcheck_for_nan_in_loss_and_gradget_rankisnanr"  r#  osunamer=   rv   )r  r1  lossesr:  r6  global_rankaveraged_lossoutput_dictrU   rX   rY   r    s0   

.



z-GPTTrainStep.get_loss_func.<locals>.loss_func)r#   r_   r`   r?  rw   ri   r  rX   rg  rY   r     s
   
zGPTTrainStep.get_loss_funcc                        fdd}|S )Nc                    s4     | \}}}}}|||||d}|t j|fS )r@  )r  rC  )r   rW   r  r  r  r  rR  r1  r   rX   rY   r    s   z8GPTTrainStep.get_forward_step_func.<locals>.forward_steprX   rw   r  rX   r   rY   r     s   z"GPTTrainStep.get_forward_step_funcrE  rX   rX   r   rY   rF  K  s    8%rF  c                       s\   e Zd ZdZ fddZedd Zedd Zedd	 Zd
d Z	dd Z
dd Z  ZS )T5TrainStepze
    T5 train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    sX   t  d | ||j| _| || _|  | _|j	s!d | _
d S ddlm} || _
d S )Nrk  r   )Seq2SeqLMOutput)r   r{   r   rp   r  r   r  r   r  r  r  r  rl  )rw   ri   rU   rl  r   rX   rY   r{     s   


zT5TrainStep.__init__c                 C   s(   |  d}|  d}|| }|dk }|S )Nr   r         ?)	unsqueeze)r  attention_mask_b1sattention_mask_bs1attention_mask_bssextended_attention_maskrX   rX   rY   attn_mask_postprocess  s
   

z!T5TrainStep.attn_mask_postprocessc                 C   s&   t t jd| | f|d}|dk }|S Nr   r   rm  )r   trilones)r   r   r  rX   rX   rY   get_decoder_mask  s   zT5TrainStep.get_decoder_maskc           	      C   s<   | j \}}| d}tj||df|d}|| }|dk }|S rt  )rV  rn  r   rw  )	r  dec_seq_lengthr   r   _ro  rp  rq  rr  rX   rX   rY   get_enc_dec_mask  s   

zT5TrainStep.get_enc_dec_maskc                 C   r  )Nc                 S   s   g d}t j}| durt| }nd}t|||}|d  }|d  }|d  }|d  }|d dk }	|d	 dk }
|d
 dk }|||||	|
|fS )r	  )text_enctext_decr  r  enc_maskdec_maskenc_dec_maskNr|  r}  r  r  r~  rm  r  r  r  )r   r  r  r   r  
tokens_enc
tokens_decr  r  r~  r  r  rX   rX   rY   r    s   
z6T5TrainStep.get_batch_func.<locals>.get_batch_megatronc           	      S   s   t | }t|tj }|d  }|d  }|dktj}d|v r+|d  }n'|j|j	|j
tjd}|dddf  |dd	df< d
|d< ||dkd
 t|d  }t|j	d	 |j
}t|d  |j	d	 |j
}|||||||fS )r	  r  r  r   decoder_input_ids)r   r   .Nr)  r   r   ).r   r  )r  r   r   r"  r#  r  r$  r  	new_zerosrV  r   clonemasked_fill_rk  rs  rx  r{  )	r   r   r  r  r  r  r~  r  r  rX   rX   rY   r%    s"    z9T5TrainStep.get_batch_func.<locals>.get_batch_transformerr   r&  )r_   r`   r'  r   r  r   r(  rX   rX   rY   r     s   
zT5TrainStep.get_batch_funcc                 C   s$   dd }|j jjd ur|j jjS |S )Nc                 S   sH   |  }t|d| d |   }|}t|g}|d|d ifS )Nr)  r+  r   )r  r   r,  r-  r.  r=   )r  r1  r2  r4  r6  r7  rX   rX   rY   r  B  s
   "
z,T5TrainStep.get_loss_func.<locals>.loss_funcr>  rh  rX   rX   rY   r   A  s   	
zT5TrainStep.get_loss_funcc                    ri  )Nc           
   	      s>     | \}}}}}}}||||||d|d}	|	t j|fS )r@  NrA  rC  )
r   rW   r  r  r  r  r~  r  r  r1  r   rX   rY   r  P  s   z7T5TrainStep.get_forward_step_func.<locals>.forward_steprX   rj  rX   r   rY   r   O  s   z!T5TrainStep.get_forward_step_func)r   r   r   r   r{   staticmethodrs  rx  r{  r   r   r   r   rX   rX   r   rY   rk    s    


?rk  c                  C   s:   t  } t  | jdkrtd| j d t| j| j d S )Nr   z> setting random seeds to z ...)r#   r2   rO   rP   r   r3   data_parallel_random_initrg  rX   rX   rY   finish_mpu_init`  s
   
r  c              
   C   s4  |  d tj sJ dt|dd}| D ],\}}t||d d ur>|jdkr>t d| dt|| d| d| dd	 t||| q|j	sN|
d
dr[|jd usWJ dt| t| t| t  t  t  t  t }t|dd d u rt|j||_|jdkr|jr|jdkrd|_nd|_d|_d S )NzInitializing Megatron-LMzMegatron requires CUDA.T)ignore_unknown_argsr   z*WARNING: overriding default arguments for r}   r~   )flushuse_checkpoint_argsFz/--use-checkpoints-args requires --load argumentrH  rA   r   )rP   r   r"  is_availabler*   r   r   rO   r   r  getloadr,   r+   r/   r  r1   r0   r4   r#   r6   orig_vocab_sizerH  rQ   rN   rS   rR   	iteration)ri   extra_args_providerargs_defaultsrU   r   r   rX   rX   rY   
initializem  s6   

 
r  c                       s   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Zdd Z								dddZ  ZS )MegatronEnginez
    Megatron-LM model wrapper

    Args:
        accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
        model: Megatron-LM model
        optimizer: Megatron-LM optimizer
        lr_scheduler: Megatron-LM lr scheduler
    c                    s   t    || _|d | _|| _|| _t }|jjj	d ur-|jjj	|fi |jjj
| _n,|jdkr9t||| _n |jdkrEt||| _n|jdkrQt||| _ntd|j d| j_i | _i | _d| _d| _d| _d | _|jd uryt  d S d S )Nr   rA   rI   rJ   rM   FT)r   r{   module
base_modelrk   r[   r#   r_   r`   custom_train_step_classcustom_train_step_kwargstrain_step_handlerrQ   r   rF  rk  rT   r   total_loss_dicteval_total_loss_dictr  report_memory_flag$num_floating_point_operations_so_farmodule_configtensorboard_dirr5   )rw   ri   rW   rk   r[   rU   r   rX   rY   r{     s:   







zMegatronEngine.__init__c                    s   t  }t jd } jj|_t jd trR|jrR|j	d u s#J ddd  jD |_	t
 jdkr9|j	d |_	|jrRdd  jD |_t
 jdkrR|jd |_|jrt|jrt fddtt
 jD |_t
 jdkrt|jd |_t|_|S )Nr   zWhen overlap_grad_reduce is True, config.no_sync_func must be None; a custom no_sync_func is not supported when overlapping grad-reducec                 S      g | ]}|j qS rX   )no_syncr   model_chunkrX   rX   rY   
<listcomp>      z4MegatronEngine.get_module_config.<locals>.<listcomp>r   c                 S   r  rX   )start_grad_syncr  rX   rX   rY   r    r  c                    s   g | ]	  fd dqS )c                    s   j  | S r   )rk   finish_param_sync)x)model_indexrw   rX   rY   <lambda>  s    z=MegatronEngine.get_module_config.<locals>.<listcomp>.<lambda>rX   )r   r   )r  rY   r    s    )r#   r   r  rk   
scale_lossgrad_scale_funcr   LocalDDPoverlap_grad_reduceno_sync_funcrg   delay_grad_reducegrad_sync_funcoverlap_param_gatherdelay_param_gatherr   param_sync_funcr   finalize_model_grads_func)rw   rU   rB   rX   r   rY   get_module_config  s,   

z MegatronEngine.get_module_configc                 C   s4   | j D ]}|  q| jd u r|  | _|   d S r   )r  trainr  r  log_eval_resultsrw   model_modulerX   rX   rY   r    s
   



zMegatronEngine.trainc                 C   s0   | j D ]}|  q| jd u r|  | _d S d S r   )r  evalr  r  r  rX   rX   rY   r    s
   


zMegatronEngine.evalc                    s   t   g t|dkr, jdkr)td jD ] fdd| D  qn|gt| jdkrSt|dkrIfddtt| jD }|S d gt| j }|S t|dkr]tnd }|S )Nr   r   c                    s.   i | ]\}}|| j  d   j   qS )r   )r   )r   r   v)rU   r   rX   rY   r     s    z:MegatronEngine.get_batch_data_iterator.<locals>.<dictcomp>c                    s   g | ]}t  qS rX   )iterr   rz  )data_chunksrX   rY   r        z:MegatronEngine.get_batch_data_iterator.<locals>.<listcomp>)r#   rg   r   r   r   r   r  r  )rw   
batch_databatch_data_iteratorrX   )rU   r  r   rY   get_batch_data_iterator  s,   
z&MegatronEngine.get_batch_data_iteratorc                 K   sJ   |  |}t| jj|| j| j| j| jd\}}}}|dk| j_||||fS )z
        Training step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to train on.
        )forward_step_funcr   rW   rk   opt_param_schedulerrB   r   )	r  r;   r  r  r  rk   r[   r  r   )rw   r  r  loss_reducedr   	grad_normnum_zeros_in_gradrX   rX   rY   r;   
  s   
	zMegatronEngine.train_stepc              	      s   t  }| |}t }|| jj|| jt |j|jdd}|j	dkr&t
j  | jt |j t  7  _tjddrji }|d D ]&  fdd|D }t|d jdkr`t|t| | < qAt
|| < qA|S i S )z
        Evaluation step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to evaluate on.
        T)r  r   rW   num_microbatchesr   r   forward_onlyr   )ignore_virtualr   c                    s   g | ]}|  qS rX   rX   )r   r  r   rX   rY   r  A  r  z,MegatronEngine.eval_step.<locals>.<listcomp>)r#   r  r   r  r  r  r   r   r   empty_unused_memory_levelr   r"  empty_cacher   r   r   is_pipeline_last_stagerg   rV  r,  rW  )rw   r  rU   r  forward_backward_func
loss_dictsr  losses_reduced_for_keyrX   r  rY   	eval_step!  s4   



zMegatronEngine.eval_stepc                 K   s  t  }| jd jre| jd
i |\}}}}|  jd7  _t |j t  }| j	|7  _	|  j
t||7  _
|jd urd| j  }d }	|jrMt| j}	t|| j| jjd d | j|| j|||	|
| _n?| jd
i |}|jd ur|D ]/}
| j|
tjdg||
  | j|
< | j|
d tjdgtjdg | j|
d < qttjdtj d}|D ]}
t||
 j dkr|||
 7 }qd }d|v r|d }| j!j"d ur| j!j"||d	S |S )Nr   r   lr        
_num_iters      ?ru  r:  )r6  r:  rX   )#r#   r  trainingr;   r  r   r   r   r   r   r  r9   r  rk   get_loss_scaleitemlog_params_normr>   rW   r<   r  param_groupsr  r  r  r  r   r"  FloatTensorr   r#  rg   rV  r  r  )rw   r  rU   	loss_dictr   r  r  r   
loss_scaleparams_normr   r6  r:  rX   rX   rY   forwardI  s\   


zMegatronEngine.forwardc                 C   s  t  }|jd u s| jdkrd S t  }t }d| j d}| jD ]R}|dr'q| j| | j|d   }|| d| d7 }ttd|	 }|j
rT|| d| d7 }|rq|| d|	 | j |j
rq|| d	|| j qt|d
 }td|  t| td|  i | _d S )Nr   zvalidation loss at iteration z | r  z value:    z PPL: z validationz validation pplr   -)r#   r  r  r$   r  endswithmathexpminr  rN   
add_scalarrg   r&   )rw   rU   writerstringr   r   ppllengthrX   rX   rY   r    s0   


zMegatronEngine.log_eval_resultsc                 C   sH   |    t }||_tj  t| j| j| j	| j
| jd tj  d S )N)r  )r  r#   saver   r   barrierr.   r  r  rk   r[   r  )rw   
output_dirrU   rX   rX   rY   r.     s   
zMegatronEngine.save_checkpointc                 C   st   t  }||_d|_d|_tj  t| j| j	| j
\}}tj  || _|| _|jr6| jdkr8| j	  d S d S d S r   )r#   r  r   r   r   r   r  r-   r  rk   r[   r  r  fp16reload_model_params)rw   	input_dirrU   r  r  rX   rX   rY   r-     s   

zMegatronEngine.load_checkpointNc
                 K   sZ  t  }|jdkrtd|jdkrtd|jrtd|jdur%td|jdu r.td|du r:|du r:td	|du rAd
}nd|  k rNdksStd td|du rZd}nd|  krgdksltd td|du rsd}n|dkr|dkrtdd|  krd
kstd td|
dd}d|  krd
kstd td|
dd}d|  krd
kstd td|
dd}t	|t
std|}|durt	|tstd|dk rtd|jd dkrdS t }|
d|j}|dur	t	|ts	td|	du rd
}	d}d}d}tj dkr|du r3tj|jd g|jd  }n|jdd  }|du rG||jd  }|dkrPtd!|r||jd  d }d"t|d"  }||jd d  }tj|jg| g|jd  }tjtj|dddf dd | |gdd }n2||jd  }d"t|d"  }||jd  }tj|jg| g|jd  }tj| |gdd }|d|dg}td#|dd$}| }t|tj|dd%}t|d tj|dd%}|
d&d}tj| t | j!t"t#t$f}|durt%|||||d|	d'\}}|S t&|||d|||||d(d)
\}}}|S )*a  
        Generate method for GPT2 model. This method is used for inference. Supports both greedy and beam search along
        with sampling. Refer the Megatron-LM repo for more details

        Args:
            inputs (torch.Tensor): input ids
            attention_mask (torch.Tensor, optional): attention mask. Defaults to None.
            max_length (int, optional): max length of the generated sequence. Defaults to None.
            Either this or max_new_tokens should be provided.
            max_new_tokens (int, optional): max number of tokens to be generated. Defaults to None.
            Either this or max_length should be provided.
            num_beams (int, optional): number of beams to use for beam search. Defaults to None.
            temperature (float, optional): temperature for sampling. Defaults to 1.0.
            top_k (int, optional): top k tokens to consider for sampling. Defaults to 0.0.
            top_p (float, optional): tokens in top p probability are considered for sampling. Defaults to 0.0.
            length_penalty (float, optional): length penalty for beam search. Defaults to None.
            kwargs: additional key-value arguments
        rI   z1Generate method is not implemented for this modelr   z1Generate method requires data parallelism to be 1z9Generate method requires sequence parallelism to be FalseNz2Checkpoint activations cannot be set for inferencez$Vocab file is required for inferencez;`max_length` or `max_new_tokens` are required for inferencer  r  g      Y@zAtemperature must be a positive number less than or equal to 100.0r   i  z:top_k must be a positive number less than or equal to 1000z/top_p and top_k sampling cannot be set togetherz'top_p must be less than or equal to 1.0top_p_decayz-top_p_decay must be less than or equal to 1.0top_p_boundz-top_p_bound must be less than or equal to 1.0add_BOSFzadd_BOS must be a booleanzbeam_width must be an integerz!beam_width must be greater than 0z,When doing beam_search, batch size must be 1
stop_tokenzstop_token must be an integerr)  )axisz%max_new_tokens must be greater than 0   r   )int_listrO   )r   rO   random_seed)r  num_return_genlength_penaltyT)return_output_log_probstop_ktop_pr  r  temperature#use_eod_token_for_early_termination)'r#   rQ   NotImplementedErrordata_parallel_sizerT   sequence_parallelrecompute_granularityrJ  r  r   r   r9  rV  r%   rK  r   r   r_  r"  
LongTensorr,  r  ceilrW  rn  sizer   tolistr   r  randommanual_seedr@   r  torchDDPr  r   r   r   )rw   inputsr  
max_lengthmax_new_tokens	num_beamsr  r  r   r  r   rU   r  r  r  
beam_widthrO  r  
sizes_listprompts_tokens_tensorprompts_length_tensorrX  sizes_tensorsizescontext_tokens_tensorcontext_length_tensorr  unwrapped_modelr  rz  rX   rX   rY   megatron_generate  s   !








 

 $ 

z MegatronEngine.megatron_generate)NNNNNNNN)r   r   r   r   r{   r  r  r  r  r;   r  r  r  r.   r-   r  r   rX   rX   r   rY   r    s,    
	(?r  c                 C   s   t | S )z
    Average losses across data parallel group.

    Args:
        losses (List[Tensor]): List of losses to average across data parallel group.
    )r=   )rc  rX   rX   rY   %avg_losses_across_data_parallel_groupq  s   r  c                 C   s   dd }t || ddS )z
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from data parallel ranks.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather across data parallel ranks.

    c                    s^    j dkr  d    fddttjjt dD }tjj| t d tj	|ddS )Nr   c                    s   g | ]}t  qS rX   )r   
empty_liker  r   rX   rY   r    s    zOgather_across_data_parallel_groups.<locals>._gpu_gather_one.<locals>.<listcomp>r   rS  )
ndimr  r   r   r   get_world_sizer   get_data_parallel_group
all_gatherr[  )r   output_tensorsrX   r  rY   _gpu_gather_one  s   

z;gather_across_data_parallel_groups.<locals>._gpu_gather_oneT)error_on_other_type)r   )r   r$  rX   rX   rY   "gather_across_data_parallel_groups|  s   

r&  )TTTT)xrq   r  ra  abcr   	functoolsr   r   torch.nn.functionalnn
functionalr/  torch.nnr   r   r   torch.nn.parallel.distributedr   r  rk   r	   r[   r
   importsr   
operationsr   r   megatron.corer   r   megatron.core.distributedr  r   megatron.core.enumsr   )megatron.core.num_microbatches_calculatorr   megatron.core.optimizerr   megatron.core.parallel_stater   r   megatron.core.pipeline_parallelr   megatron.core.utilsr   0megatron.inference.text_generation.communicationr   r   -megatron.inference.text_generation.generationr   r   "megatron.legacy.data.dataset_utilsr   megatron.legacy.modelr   r   r    r!   $megatron.legacy.model.classificationr"   megatron.trainingr#   r$   r%   r&   megatron.training.argumentsr'   r(   r)   r*   r+   megatron.training.checkpointingr,   r-   r.   megatron.training.global_varsr/   megatron.training.initializer0   r1   r2   r3   r4   r5   %megatron.training.tokenizer.tokenizerr6   megatron.training.trainingr7   r8   r9   r:   r;   r<   megatron.training.utilsr=   r>   r?   r@   rZ   rn   ro   r   r   r   rc   r   r   rd   r   r   rF  rk  r  r  Moduler  r  r&  rX   rX   rX   rY   <module>   sz     
	5mB   0   W