o
    inR                  #   @   s<  d dl mZ d dlZd dlmZ ddlmZmZmZmZm	Z	m
Z
mZmZmZmZmZmZmZ ddgZG dd deZd	d
e de de de d	 e_							d)dee dee dee dee dedB dedB dedB dededB dedededededededdf"d dZd!d" Zdee dee dee dee dedB dedB dededededededededdfd#d$Zdee dee dee dee dedB dedB dededededededededdfd%d&Zdee dee dee dee dedB dedB dededededededededdfd'd(ZdS )*    )castN)Tensor   )_default_to_fused_or_foreach_device_dtype_check_for_fused_differentiable_doc_foreach_doc_get_scalar_dtype
_get_value_maximize_doc_params_doc
_to_scalar_use_grad_for_differentiable_view_as_real	OptimizerParamsTAdagradadagradc                       s   e Zd Z						ddddddedeeB d	ed
ededededB dedededB ddf fddZ fddZdddZ	dd Z
edddZ  ZS )r   {Gz?r   绽|=NF)maximizedifferentiablefusedparamslrlr_decayweight_decayinitial_accumulator_valueepsforeachr   r   r   returnc             
      sb  t |tr| dkrtdd|kstd| d|ks%td| d|ks0td| d|ks;td| d|ksFtd| ||||||||	|
d		}t || |
rj|	ratd
|rgtdd| _| jD ]A}|d D ]:}| j	| }|d rt
jdt|d d|jdnt
jdt d|d< t
|rt||n|}t
j||t
jd|d< qsqmd S )Nr   zTensor lr must be 1-elementg        zInvalid learning rate: zInvalid lr_decay value: zInvalid weight_decay value: z)Invalid initial_accumulator_value value: zInvalid epsilon value: )	r   r   r   r   r   r   r   r   r   z)`fused` does not support `differentiable`z0`fused` and `foreach` cannot be `True` together.Tr   r    is_fused)dtypedevicer$   step)memory_formatsum)
isinstancer   numel
ValueErrorsuper__init__RuntimeError"_need_device_dtype_check_for_fusedparam_groupsstatetorchzerosr	   r%   tensor
is_complexcomplex	full_likepreserve_format)selfr   r   r   r   r   r   r   r   r   r   defaultsgrouppr2   
init_value	__class__r!   U/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/torch/optim/adagrad.pyr.      sf   

zAdagrad.__init__c                    s   t  | d }| jD ]}|dd  |dd |dd |dd }qt| j }t|dko;t	|d d }|sS|D ]}tj
t|d t|dd	|d< q@d S d S )
Nr   r   Fr   r   r   r'   r"   r&   )r-   __setstate__r1   
setdefaultlistr2   valueslenr3   	is_tensorr5   floatr	   )r:   r2   r   r<   state_valuesstep_is_tensorsr?   r!   rA   rB   b   s$   

zAdagrad.__setstate__c                 C   s4   | j D ]}|d D ]}| j| }|d   q	qdS )z6Calls tensor.share_memory_() on the state sum tensors.r   r)   N)r1   r2   share_memory_)r:   r<   r=   r2   r!   r!   rA   share_memoryw   s   

zAdagrad.share_memoryc           
      C   s   d\}}|d D ]E}|j d urM|d r"t| ddr"t|dd d| _||j jO }|t|O }|| ||j  | j| }	||	d  ||	d	  q||fS )
N)FFr   r   r0   T)cuda_unsupportedFr)   r'   )	gradgetattrr   r0   	is_sparser3   r6   appendr2   )
r:   r<   params_with_gradgrads
state_sumsstate_stepshas_sparse_gradhas_complexr=   r2   r!   r!   rA   _init_group~   s&   



zAdagrad._init_groupc           
      C   s   d}|durt   | }W d   n1 sw   Y  | jD ]A}g }g }g }g }| |||||\}}	t|||||d |d |d |d ||d |d |d |	|d	 t| d
dt| ddd q |S )zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r   r   r   r   r   
grad_scale	found_inf)r   r   r   r   rW   r   r   r   rX   r   rZ   r[   )r3   enable_gradr1   rY   r   rP   )
r:   closurelossr<   rS   rT   rU   rV   rW   rX   r!   r!   rA   r'      s@   




zAdagrad.step)r   r   r   r   r   N)r    NN)__name__
__module____qualname__r   rH   r   boolr.   rB   rM   rY   r   r'   __classcell__r!   r!   r?   rA   r      sN    

F
a[  Implements Adagrad algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta)
                \text{ (objective)}, \: \lambda \text{ (weight decay)},                          \\
            &\hspace{12mm}    \tau \text{ (initial accumulator value)}, \: \eta\text{ (lr decay)}\\
            &\textbf{initialize} :  state\_sum_0 \leftarrow \tau                          \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \tilde{\gamma}    \leftarrow \gamma / (1 +(t-1) \eta)                  \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm}state\_sum_t  \leftarrow  state\_sum_{t-1} + g^2_t                      \\
            &\hspace{5mm}\theta_t \leftarrow
                \theta_{t-1}- \tilde{\gamma} \frac{g_t}{\sqrt{state\_sum_t}+\epsilon}            \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.
    z
    Args:
        a  
        lr (float, Tensor, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): initial value of the
            sum of squares of gradients (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
        z	
        a  
        fused (bool, optional): whether the fused implementation (CPU only) is used.
            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
            are supported. (default: None). Please note that the fused implementations does not
            support sparse or complex gradients.
    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html

    Fr   rT   rU   rV   r   rZ   r[   rW   r   r   rX   r   r   r   r   r   r    c                C   s   t dd |D std|du r|du rt| |	dd\}}|du r$d}|du r*d}|r5tj r5td|r@tj r@td|rJtj sJt}n|rTtj sTt}nt}|| ||||||||||	|
||d	 dS )
ztFunctional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.
    c                 s   s    | ]	}t |tjV  qd S r_   )r*   r3   r   ).0tr!   r!   rA   	<genexpr>  s    zadagrad.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNF)	use_fusedz6torch.jit.script not supported with foreach optimizersz4torch.jit.script not supported with fused optimizers
r   r   r   r   rW   r   r   rX   rZ   r[   )	allr/   r   r3   jitis_scripting_fused_adagrad_multi_tensor_adagrad_single_tensor_adagrad)r   rT   rU   rV   r   rZ   r[   rW   r   r   rX   r   r   r   r   r   _funcr!   r!   rA   r      sJ   

c                 C   s   |   }t|||S r_   )sizer3   sparse_coo_tensor)rO   grad_indicesrE   rr   r!   r!   rA   _make_sparse>  s   ru   c             	   C   s  |d us|d urt dtj st|}t| |||ddD ]\}}}}|d7 }t|}|s0|n| }|dkrE|jr>td|j	||d}|d|d |   }|jr|
 }| }| }|t|||d ||}|  |	}|jt|||| | d qt|}|rt|}t|}t|}|j||dd	 |r| |	 }n| |	}|j||| d	 |rt|}t|}qd S )
N,Expected grad_scale and found_inf to be NoneT)strictr   r   z;weight_decay option is not compatible with sparse gradientsalpha   value)AssertionErrorr3   rk   rl   r   zipr
   rQ   r/   addcoalesce_indices_valuesadd_ru   powsparse_masksqrt_r6   view_as_realaddcmul_sqrtaddcdiv_view_as_complex)r   rT   rU   rV   rZ   r[   r   r   r   r   rW   r   r   rX   paramrO   	state_sumstep_tr'   clrrt   grad_valuesstd
std_valuesr6   r!   r!   rA   ro   C  sT   








ro   c                   s  |rt d|d us|d urt dt| dkrd S t  t| |||g}| D ]\\}}}}}ttt |}ttt |}ttt |}ttt |}|
oYt	dd |D }|rot
|||| ||	d|||||d q+|rwt||| |r~t|}tj s|d jrtj|tjdd	d
dd nt|d |dkr|rtj|||d ntj|||d} fdd|D }tj|||dd t|}t||	 |dks|rt|| |}nt||}t||| q+d S )Nz#_foreach ops don't support autogradrv   r   c                 s   s    | ]}|j V  qd S r_   )rQ   )re   rO   r!   r!   rA   rg     s    
z(_multi_tensor_adagrad.<locals>.<genexpr>Tri   g      ?cpu)r%   rx   r   c                    s&   g | ]}  d t |d     qS )r   )r
   )re   r'   r   r   r!   rA   
<listcomp>  s    z)_multi_tensor_adagrad.<locals>.<listcomp>r{   )r}   rF   r   r   "_group_tensors_by_device_and_dtyperE   r   rD   r   anyro   r   r3   _foreach_negcompileris_compilingis_cpu_foreach_add_r5   _foreach_add_foreach_addcmul__foreach_sqrt_foreach_mul__foreach_mul_foreach_addcdiv_)r   rT   rU   rV   rZ   r[   r   r   r   r   rW   r   r   rX   grouped_tensorlistsdevice_params_device_grads_device_state_sums_device_state_steps_rp   device_paramsdevice_gradsdevice_state_sumsdevice_state_stepsdevice_has_sparse_grad	minus_clrr   	numeratorr!   r   rA   rn     s   


rn   c                C   s~  | sd S |
s|rt d|rt dt|}|d ur|j|ind }|d ur*|j|ind }t| |||g}| D ]\\}}\\}}}}}ttt |}ttt |}ttt |}ttt |}d\}}|d ur~|d ur~||vrz|j	|dd||< || }|d ur|d ur||vr|j	|dd||< || }t
|d t
j||||||||	|||d |d urt
||gt|  q9d S )Nz5`fused` does not support sparse grad or complex paramz<adagrad with fused=True does not support differentiable=True)NNT)non_blockingr   )r   r   r   r   r   rZ   r[   )r/   r   r%   r   r   itemsr   rD   r   tor3   r   _fused_adagrad__foreach_sub_rF   )r   rT   rU   rV   rZ   r[   r   r   r   r   rW   r   r   rX   grad_scale_dictfound_inf_dictgrouped_tensorsr%   rp   r   r   r   r   r   r   r   r   device_grad_scaledevice_found_infr!   r!   rA   rm     sp   
rm   )NNNFNFF)typingr   r3   r   	optimizerr   r   r   r   r	   r
   r   r   r   r   r   r   r   __all__r   __doc__rD   rc   rH   r   ru   ro   rn   rm   r!   r!   r!   rA   <module>   s6  < (
8

J	

D	

o	
