o
    0i                  	   @   s   d Z ddlmZ ddlmZmZmZ ddlZddlm	Z	m
Z
 er)e	 r)ddlmZ dejjfd	d
Zdedee defddZdedefddZe
dejjfddZe
defdejjded dee fddZdS )z,
Needed utilities for torchao FP8 training.
    )partial)TYPE_CHECKINGCallableOptionalN   )is_torchao_availabletorchao_required)Float8LinearConfigmodelc                 C   s@   d\}}|   D ]\}}t|tjjr|du r|}|}q||fS )z
    Finds the first and last linear layer names in a model.

    This is needed during FP8 to avoid issues with instability by keeping the first and last layers unquantized.

    Ref: https://x.com/xariusrke/status/1826669142604141052
    )NNN)named_modules
isinstancetorchnnLinear)r
   first_linearlast_linearnamemodule r   U/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/accelerate/utils/ao.pyfind_first_last_linear_layers    s   r   fqnlayers_to_filterreturnc                 C   s>   t | tjjr| jd dks| jd dkrdS ||v rdS dS )a  
    A function which will check if `module` is:
    - a `torch.nn.Linear` layer
    - has in_features and out_features divisible by 16
    - is not part of `layers_to_filter`

    Args:
        module (`torch.nn.Module`):
            The module to check.
        fqn (`str`):
            The fully qualified name of the layer.
        layers_to_filter (`List[str]`):
            The list of layers to filter.
       r   FT)r   r   r   r   in_featuresout_features)r   r   r   r   r   r   filter_linear_layers1   s   r   c                 C   s   t | \}}t| |||gdS )a  
    A filter function which will filter out all linear layers except the first and last.

    <Tip>

        For stability reasons, we skip the first and last linear layers Otherwise can lead to the model not training or
        converging properly

    </Tip>

    Args:
        module (`torch.nn.Module`):
            The module to check.
        fqn (`str`):
            The fully qualified name of the layer.
    r   )r   r   )r   r   r   r   r   r   r   #filter_first_and_last_linear_layersH   s   r   c                 C   s2   ddl m} |  D ]\}}t||r dS q
dS )Nr   )Float8LinearTF)torchao.float8.float8_linearr    r   r   )r
   r    r   r   r   r   r   has_ao_layers]   s   
r"   configr	   module_filter_funcc                 C   sB   ddl m} t| \}}|du rtt||gd}|| ||d dS )a
  
    Converts all `nn.Linear` layers in the model (except the first and last) to torchao's `Float8Linear` layer inplace.

    Args:
        model (`torch.nn.Module`):
            The model to convert.
        config (`torchao.float8.Float8LinearConfig`, *optional*):
            The configuration for the FP8 training. Recommended to utilize
            `torchao.float8.recipe_name_to_linear_config` to generate this. In general, the default config should be
            sufficient (what is passed when set to `None`).
        module_filter_func (`Callable`, *optional*, defaults to `filter_linear_layers`):
            Optional function that must take in a module and layer name, and returns a boolean indicating whether the
            module should be converted to FP8. Defaults to `filter_linear_layers`. See it for an example.

    Example:

    ```python
    from accelerate.utils.ao import convert_model_to_fp8_ao

    model = MyModel()
    model.to("cuda")
    convert_to_float8_training(model)

    model.train()
    ```
    r   )convert_to_float8_trainingNr   )module_filter_fnr#   )torchao.float8r%   r   r   r   )r
   r#   r$   r%   r   r   r   r   r   convert_model_to_fp8_aog   s
    r(   )__doc__	functoolsr   typingr   r   r   r   importsr   r   r!   r	   r   Moduler   strlistboolr   r   r"   r(   r   r   r   r   <module>   s.   	