o
    iQ                     @   sl  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ eeZeG dd deZG dd dej Z!	d(dej dej"dej"dej"dej"dB de#de#fddZ$G dd dej Z%G d d! d!ej Z&G d"d# d#eZ'G d$d% d%ej Z(G d&d' d'ej Z)dS ))zTPyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object    N)Callable)	dataclass)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONS)Unpack)ModelOutputTransformersKwargscan_return_tuplelogging)is_flash_attention_requested   )IdeficsVisionConfigc                   @   sj   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dS )IdeficsVisionModelOutputa  
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tupler    r    r    d/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/models/idefics/vision.pyr   )   s   
 r   c                       s\   e Zd Zdef fddZdejdededejfdd	Zddej	de
dejfddZ  ZS )IdeficsVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   )
persistent)super__init__r#   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandselfr#   	__class__r    r!   r.   H   s"   
"z IdeficsVisionEmbeddings.__init__
embeddingsheightwidthreturnc                 C   s  |j d d }| | j}|j d d }||kr||kr|S |dddf }|ddddf }|j d }	|| jj }
|| jj }|
d |d }
}t|}|dt|t||	}|	dddd}|j
tjk}|rvtd |tj}tjj||
| || fd	d
d}|r|tj}t|
|j d kst||j d krtdt|
t|f d|j d |j d f d|	dddddd|	}tj|d|fddS )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        r   Nr   r+   g?r   r)   zUpcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a slight overhead.bicubicF)scale_factormodealign_cornerszNumber of patches for images (z/) don't match the shape of position embedding ()dim)shaper<   r*   r#   r2   mathsqrtreshapeintpermutedtyper   bfloat16loggerwarning_oncetofloatr   
functionalinterpolate
ValueErrorviewcat	unsqueeze)rA   rD   rE   rF   r9   	pos_embedr:   class_pos_embedpatch_pos_embedr0   num_h_patchesnum_w_patchessqrt_num_positionsfp32_upcastingr    r    r!   interpolate_pos_encoding_   sH   	

$z0IdeficsVisionEmbeddings.interpolate_pos_encodingFpixel_valuesri   c              
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| jjj}| |j|d}|ddd}| j	
|dd}	tj|	|gdd	}
|r[|
| |
|| }
|
S |
| | j }
|
S )
NzInput image size (*z) doesn't match model (z8). You should try to set `interpolate_pos_encoding=True`)rV   r)   r   r+   rN   )rP   r1   r^   r8   weightrV   rZ   flatten	transposer5   r?   r   r`   ri   r<   r*   )rA   rj   ri   
batch_sizer7   rE   rF   target_dtypepatch_embedsclass_embedsrD   r    r    r!   forward   s(   
zIdeficsVisionEmbeddings.forwardF)r   r   r   r   r.   r   TensorrT   ri   r   boolrs   __classcell__r    r    rB   r!   r"   G   s    $1r"           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr+   rL   )rO   rV   )ptrainingr   r)   )r   matmulrn   r   r\   softmaxfloat32rZ   rV   r   r   
contiguous)
ry   rz   r{   r|   r}   r~   r   kwargsattn_weightsattn_outputr    r    r!   eager_attention_forward   s   
r   c                       sn   e Zd ZdZdef fddZ			ddejdejdB d	ejdB d
edB de	ejejdB f f
ddZ
  ZS )IdeficsVisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr#   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)r-   r.   r#   r/   r0   num_attention_heads	num_headshead_dimr^   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr@   rB   r    r!   r.      s$   

zIdeficsVisionAttention.__init__NFr   r}   causal_attention_maskoutput_attentionsrG   c              
   C   s"  |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
t| j	sX|durQ|durQ|| }n|durW|}n|du| _
t| j	jt}|| ||	|
|| j
| j| jstdn| jd\}}|||| }| |}|sd}||fS )z#Input shape: Batch x Time x Channelr   r)   Nrx   )r   r~   r   )rP   r   r   r   r_   r   r   rn   r   r#   r   r
   get_interface_attn_implementationr   r   r   r   rS   r   r   )rA   r   r}   r   r   ro   
seq_lengthr0   querieskeysvaluesattention_interfacer   r   r    r    r!   rs      s@   	







zIdeficsVisionAttention.forward)NNF)r   r   r   r   r   r.   r   ru   rv   r   rs   rw   r    r    rB   r!   r      s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )IdeficsVisionMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)r-   r.   r#   r   
hidden_actactivation_fnr   r   r/   intermediate_sizefc1fc2r@   rB   r    r!   r.     s
   
zIdeficsVisionMLP.__init__r   rG   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rA   r   r    r    r!   rs     s   


zIdeficsVisionMLP.forward)r   r   r   r.   r   ru   rs   rw   r    r    rB   r!   r     s    r   c                       sV   e Zd Zdef fddZ	ddejdejdedB d	ee	 d
e
ej f
ddZ  ZS )IdeficsVisionEncoderLayerr#   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S N)eps)r-   r.   r/   r0   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r@   rB   r    r!   r.     s   


z"IdeficsVisionEncoderLayer.__init__Fr   r}   r   Nr   rG   c                 K   sj   |}|  |}| jd|||d|\}}|| }|}| |}| |}|| }|f}|r3||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r}   r   Nr    )r   r   r   r   )rA   r   r}   r   r   residualr   outputsr    r    r!   rs   %  s$   




z!IdeficsVisionEncoderLayer.forwardrt   )r   r   r   r   r.   r   ru   rv   r   r   r   r   rs   rw   r    r    rB   r!   r     s    r   c                       sp   e Zd ZdZdef fddZe				ddejdB de	dB de	dB d	e	dB d
e
e deeB fddZ  ZS )IdeficsVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    r#   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r    )r   ).0_r#   r    r!   
<listcomp>[  s    z1IdeficsVisionEncoder.__init__.<locals>.<listcomp>F)	r-   r.   r#   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr@   rB   r   r!   r.   X  s   
 
zIdeficsVisionEncoder.__init__Nr}   r   output_hidden_statesreturn_dictr   rG   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ]#\}
}|r<||	f }||	|fd|i|}|d }	|rT||d f }q1|r\||	f }t|	||dS )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr    r   r   r   )r   r   r   )r#   r   r   use_return_dict	enumerater   r   )rA   inputs_embedsr}   r   r   r   r   encoder_statesall_attentionsr   idxencoder_layerlayer_outputsr    r    r!   rs   ^  s6    

zIdeficsVisionEncoder.forward)NNNN)r   r   r   r   r   r.   r   r   ru   rv   r   r   r   r   rs   rw   r    r    rB   r!   r   O  s*    r   c                       sj   e Zd Zdef fddZ					ddejdB dedB dedB d	edB d
edB dee	B fddZ
  ZS )IdeficsVisionTransformerr#   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )r-   r.   r#   r/   r"   rD   r   r   r   pre_layrnormr   encoderpost_layernorm)rA   r#   r0   rB   r    r!   r.     s   


z!IdeficsVisionTransformer.__init__NFrj   r   r   ri   r   rG   c           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| j||d}| |}| j||||d}|d }|dddddf }	| |	}	|s[||	f|dd  S t	||	|j
|jdS )z
        Returns:

        Nz You have to specify pixel_values)ri   )r   r   r   r   r   r   )r   pooler_outputr   r   )r#   r   r   r   r^   rD   r   r   r   r	   r   r   )
rA   rj   r   r   ri   r   r   encoder_outputsr   pooled_outputr    r    r!   rs     s2   

z IdeficsVisionTransformer.forward)NNNFN)r   r   r   r   r.   r   r   rv   r   r	   rs   rw   r    r    rB   r!   r     s(    r   )rx   )*r   rQ   collections.abcr   dataclassesr   r   r   activationsr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   processing_utilsr   utilsr   r   r   r   utils.genericr   configuration_ideficsr   
get_loggerr   rX   r   Moduler"   ru   r[   r   r   r   r   r   r   r    r    r    r!   <module>   sN   
k
J3Q