o
    i                     @   sh   d dl Z d dlZd dlZddlmZ ddlmZ ddlmZ e	e
ZG dd dZG dd	 d	eZdS )
    N   )TrainerCallback)PREFIX_CHECKPOINT_DIR)loggingc                   @   s<   e Zd ZddefddZdd Zdd Zd	d
 Zdd ZdS )CheckpointManager   	kill_waitc                 C   s   || _ d| _d| _|| _dS )aD  
        Initialize the CheckpointManager for Just-In-Time checkpoint handling.

        Args:
            trainer: The Trainer instance that will be used to save checkpoints when SIGTERM is received.
            kill_wait (`int`, *optional*, defaults to 3): Grace period to distinguish between SIGTERM and SIGKILL.
        FN)traineris_checkpoint_requested_original_sigterm_handlerr   )selfr	   r    r   e/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/trainer_jit_checkpoint.py__init__   s   
zCheckpointManager.__init__c                 C   s    t  t j| j| _td d S )Nz4JIT checkpoint signal handler registered for SIGTERM)signalSIGTERM_sigterm_handlerr   loggerinfor   r   r   r   setup_signal_handler   s   z&CheckpointManager.setup_signal_handlerc                 C   s6   | j rd S td| j d t| j| j  d S )Nz4SIGTERM received, will request JIT checkpoint after s)r
   r   r   r   	threadingTimer_enable_checkpointstart)r   signumframer   r   r   r      s   z"CheckpointManager._sigterm_handlerc                 C   s   t d d| _d S )Nz/Kill wait period elapsed, requesting checkpointT)r   r   r
   r   r   r   r   r   &   s   

z$CheckpointManager._enable_checkpointc              
   C   s.  zd| _ td | jjj}td|  | jjd d}t d| }tj	
||}tj|dd tj	
||d}t|d	}|d
| d W d    n1 sSw   Y  td|  | jj| jjd d tj	|rzt| td td W d S  ty } z	td|   d }~ww )NFzStarting JIT checkpointing...zSaving JIT checkpoint at step )trial-T)exist_okzcheckpoint-is-incomplete.txtwzCheckpoint started at step z and in progress...z2Created checkpoint progress sentinel marker file: zSentinel marker file removedz/Immediate JIT checkpoint completed successfullyzFailed to save JIT checkpoint: )r
   r   r   r	   stateglobal_step_get_output_dirr   ospathjoinmakedirsopenwrite_save_checkpointmodelexistsremove	Exceptionerror)r   current_step
output_dircheckpoint_foldercheckpoint_pathsentinel_filefer   r   r   execute_jit_checkpoint*   s0   



z(CheckpointManager.execute_jit_checkpointN)r   )	__name__
__module____qualname__intr   r   r   r   r8   r   r   r   r   r      s    r   c                   @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )JITCheckpointCallbackaN  
    Callback for Just-In-Time checkpointing on SIGTERM signals.

    When SIGTERM is received, the checkpoint manager sets `is_checkpoint_requested=True`.
    The callbacks detect this flag and set `control.should_training_stop=True`, which signals
    the Trainer's training loop to exit gracefully after saving the checkpoint.
    c                 C   s   d | _ d | _d S )N)r	   jit_managerr   r   r   r   r   X   s   
zJITCheckpointCallback.__init__c                 C   s6   || _ |jjrt|d| _| j  td d S d S )N)r	   zJIT checkpointing enabled)r	   argsenable_jit_checkpointr   r>   r   r   r   )r   r	   r   r   r   set_trainer\   s   
z!JITCheckpointCallback.set_trainerc                 K   *   | j r| j jrd|_| j   d S d S d S NTr>   r
   should_training_stopr8   r   r?   r"   controlkwargsr   r   r   on_pre_optimizer_stepc      z+JITCheckpointCallback.on_pre_optimizer_stepc                 K   rB   rC   rD   rF   r   r   r   on_step_beginh   rJ   z#JITCheckpointCallback.on_step_beginc                 K   0   | j r| j jrd|_d|_| j   d S d S d S NFTr>   r
   should_saverE   r8   rF   r   r   r   on_step_endm   
   z!JITCheckpointCallback.on_step_endc                 K   rL   rM   rN   rF   r   r   r   on_epoch_ends   rQ   z"JITCheckpointCallback.on_epoch_endc                 K   s:   | j r| j jd urttj| j j td d S d S d S )Nz;Restored original SIGTERM handler after training completion)r>   r   r   r   r   r   rF   r   r   r   on_train_endy   s   z"JITCheckpointCallback.on_train_endN)r9   r:   r;   __doc__r   rA   rI   rK   rP   rR   rS   r   r   r   r   r=   O   s    r=   )r%   r   r   trainer_callbackr   trainer_utilsr   utilsr   
get_loggerr9   r   r   r=   r   r   r   r   <module>   s    
B