
    im                        d Z ddlZddlZddlmc mZ ddlmZ ddlmZ	 ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)  G d de%      Z* G d de$      Z+ G d de#      Z, G d de)      Z- G d de!      Z. G d de       Z/ G d dej`                        Z1 G d  d!e      Z2 G d" d#e'      Z3 G d$ d%e      Z4 G d& d'e(      Z5 G d( d)ej`                        Z6e G d* d+e             Z7 ed,-       G d. d/e7             Z8 ed0-       G d1 d2e7             Z9e G d3 d4e             Z:g d5Z;y)6z%Pytorch implementation of AIMv2 Model    N)nn   )initialization)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )	CLIPModelCLIPTextEmbeddings_get_vector_norm)LlamaMLPLlamaRMSNorm)SiglipConfigSiglipTextConfigSiglipVisionConfig)SiglipAttentionSiglipEncoderSiglipOutputc                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddededededededed	ed
ededededededef fdZ xZ	S )Aimv2VisionConfiga  
    This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
    AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2816):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or Not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the for initializing all weight matrices.
        use_head (`str`, *optional*, defaults to `True`):
            Whether to use Attention Pooling Head or Not.
        is_native (`str`, *optional*, defaults to `False`):
            Whether to use ckpt trained for image native resolution or not.
    Example:

    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = Aimv2VisionConfig()

    >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = Aimv2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_channels
image_size
patch_sizerms_norm_epsattention_dropoutqkv_biasmlp_bias
hidden_actinitializer_rangeuse_head	is_nativec                     t        |   d|||||||||
d	| || _        || _        |	| _        || _        |
| _        || _        || _        | `	y )N)	r   r   r    r!   r)   r"   r#   r$   r'    )
super__init__r+   r*   r&   r(   r'   r%   r,   layer_norm_eps)selfr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   kwargs	__class__s                    q/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/aimv2/modular_aimv2.pyr0   zAimv2VisionConfig.__init__b   sx    & 	 	
#// 3!%!!	
 	
 !!2!2  ("    )i   i         r         h㈵>        FFsilu{Gz?TF
__name__
__module____qualname____doc__intfloatboolstrr0   __classcell__r4   s   @r5   r   r   )   s    6t  !%!##$"#& #'!( (  (  	( 
 !(  (  (  (  (  !(  (  (  (  !(  (   !(  ( r6   r   c                   t     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 ddededededededed	ed
ededededef fdZ xZ	S )Aimv2TextConfiga
  
    This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
    AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Aimv2Model`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or Not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        eos_token_id (`int`, *optional*, defaults to 49407):
            The id of the end-of-sequence token in the vocabulary.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the for initializing all weight matrices.
    
vocab_sizer   r   r    r!   r%   r&   r'   r(   r)   eos_token_idmax_position_embeddingsr*   c                     t        |   d||||||
||d| || _        || _        |	| _        || _        || _        | `| `| `	| `
y )N)rL   r   r   r    r!   r)   rN   rM   r.   )r/   r0   r*   r&   r(   r'   r%   bos_token_idpad_token_idprojection_sizer1   )r2   rL   r   r   r    r!   r%   r&   r'   r(   r)   rM   rN   r*   r3   r4   s                  r5   r0   zAimv2TextConfig.__init__   sy    " 	 
	
!#// 3!$;%
	
 
	
 "3!2  ( r6   )i   i   i         r;   r<   FFr=   i  M   r>   r?   rI   s   @r5   rK   rK      s    'V  !%!##$"#& !')"&& &  &  	& 
 &  !&  &  !&  &  &  &  &  "%&   &  & r6   rK   c                   &     e Zd ZdZ	 d fd	Z xZS )Aimv2Configa@  
    [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
    instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import Aimv2Config, Aimv2Model

    >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2Config()

    >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
    >>> model = Aimv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
    >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

    >>> # Initializing a AIMv2Text and AIMv2Vision configuration
    >>> config_text = Aimv2TextConfig()
    >>> config_vision = Aimv2VisionConfig()

    >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
    ```c                 V    || _         || _        d| _        t        |   ||fi | | `y )Ng      Y@)projection_dimlogit_scale_init_valuemax_logit_scaler/   r0   initializer_factor)r2   text_configvision_configrY   rZ   r3   r4   s         r5   r0   zAimv2Config.__init__  s7     -&<#$m>v>#r6   )NNi   g/L
F@)r@   rA   rB   rC   r0   rH   rI   s   @r5   rW   rW      s    +\ `f$ $r6   rW   c                       e Zd Zy)Aimv2OutputNr@   rA   rB   r.   r6   r5   r`   r`         r6   r`   c                       e Zd Zy)Aimv2RMSNormNra   r.   r6   r5   rd   rd     rb   r6   rd   c                       e Zd Zy)Aimv2MLPNra   r.   r6   r5   rf   rf   !  rb   r6   rf   c                        e Zd Zdef fdZedddej                  fdej                  fd       Z	dej                  dej                  fd	Z
 xZS )
Aimv2VisionEmbeddingsconfigc                 B   t         |           || _        |j                  | _        t	        j
                  |j                  |j                  |j                  |j                        | _        t        |j                  |j                        | _        |j                  |j                  z  dz  }| j                  j                  s%t	        j                  ||j                        | _        | j!                  dt#        j$                  |      j'                  d      d       y )N)kernel_sizestrider   position_ids   F)
persistent)r/   r0   ri   r$   r   Conv2dr"   r   patch_embedrd   r%   rms_normr#   r,   	Embeddingposition_embeddingregister_buffertorcharangeexpand)r2   ri   num_patchesr4   s      r5   r0   zAimv2VisionEmbeddings.__init__&  s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-Vchir6      g     @cpureturnc                 :   t        j                  t        |      ||      }t        j                  t        |       ||      }t        j                  ||d      \  }}|dz  }t        j                  |||      |z  }	d||	z  z  }	|j	                         d   |	d d d f   z  }
|j	                         d   |	d d d f   z  }t        j
                  |
j                         |
j                         |j                         |j                         gd      d d d d d f   S )	Ndtypedevicexy)indexing   g      ?).Nro   dim)rx   ry   rD   meshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   r   grid_wgrid_hpos_dimomegaout_hout_ws               r5   "build_2d_sincos_position_embeddingz8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding4  s     c%jfEc&kvFFq.WE&AGK{E)* +eD!Gn< +eD!Gn<||UYY[%))+uyy{EIIKPVWXY]_`bcYcddr6   pixel_valuesc                    |j                         \  }}}}| j                  |      j                  d      j                  dd      }| j	                  |      }| j
                  j                  rY| j                  || j                  z  || j                  z  | j
                  j                  |j                  |j                        }n| j                  | j                        }||z   }|S )Nr   ro   )r   r   r   )sizers   r   	transposert   ri   r,   r   r$   r   r   r   rv   rm   )r2   r   _r   r   hidden_states	pos_embeds          r5   forwardzAimv2VisionEmbeddings.forwardE  s    *//11fe((6>>qAKKAqQm4;;  ??$//)(++11$++#)) @ I //0A0ABI%	1r6   )r@   rA   rB   r   r0   staticmethodrx   float32Tensorr   r   rH   rI   s   @r5   rh   rh   %  s]    j0 j !$'%u}}e	e e ELL U\\ r6   rh   c                       e Zd Zy)Aimv2TextEmbeddingsNra   r.   r6   r5   r   r   Y  rb   r6   r   c                        e Zd Z fdZ xZS )Aimv2Attentionc                    t         |   |       t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _	        y )Nbias)
r/   r0   r   Linearr   r'   k_projv_projq_projout_projr2   ri   r4   s     r5   r0   zAimv2Attention.__init__^  s     iiV__UiiV__UiiV__U		$..$..vWr6   )r@   rA   rB   r0   rH   rI   s   @r5   r   r   ]  s    X Xr6   r   c            	            e Zd Zdef fdZ	 d	dej                  dej                  dz  dee   dej                  fdZ	 xZ
S )
Aimv2EncoderLayerri   c                     t         |           t        |      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y N)r/   r0   r   	attentionrf   ffnrd   r   r%   	rms_norm1	rms_norm2r   s     r5   r0   zAimv2EncoderLayer.__init__g  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr6   Nr   attention_maskr3   r~   c                     | j                  |      } | j                  d||d|\  }}||z   }| j                  |      }| j                  |      }||z   }|S )N)r   r   r.   )r   r   r   r   )r2   r   r   r3   norm_hidden_statesattn_outputr   
mlp_outputs           r5   r   zAimv2EncoderLayer.forwardn  sl     "^^M:'r6HYgrkqrQ%3!^^M:XX01
%
2r6   r   )r@   rA   rB   r   r0   rx   r   r   r   r   rH   rI   s   @r5   r   r   f  sY    O0 O /3|| t+ +,	
 
r6   r   c                       e Zd Zy)Aimv2EncoderNra   r.   r6   r5   r   r     rb   r6   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Aimv2AttentionPoolingHeadri   c                 &   t         |           |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  t        j                  dd| j                              | _        t        j                  | j                  | j                  d      | _        y )Nr   ro   T)r/   r0   r   r!   	num_headsr   r   r'   r   r   	Parameterrx   zeros	cls_tokenoutput_projr   s     r5   r0   z"Aimv2AttentionPoolingHead.__init__  s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSr6   r   r~   c                    |j                   \  }}}| j                  j                  |dd      }| j                  |      j	                  ||| j
                  || j
                  z        }| j                  |      j	                  ||| j
                  || j
                  z        }|j	                  |d| j
                  || j
                  z        }|j                  dddd      }|j                  dddd      }|j                  dddd      }t        j                  |||      }	|	j                  dd      j	                  |d|      }	|	j                  d      }	| j                  |	      }
|
S )Nrp   ro   r   r   r   r   )shaper   rz   r   reshaper   r   permuteFscaled_dot_product_attentionr   meanr   )r2   r   
batch_sizeseq_len
hidden_dimr   keyvaluequeryr   outputs              r5   r   z!Aimv2AttentionPoolingHead.forward  sH   *7*=*='
GZNN))*b"=	kk-(00WdnnV`dhdrdrVrsM*22:wXbfjftftXtu!!*at~~A]^kk!Q1%aAq)aAq)44UCG!++Aq199*aT!&&1&-!!+.r6   )	r@   rA   rB   r   r0   rx   r   r   rH   rI   s   @r5   r   r     s-    	T0 	TU\\ ell r6   r   c                   v     e Zd ZU dZeed<   dZdZdZg dZ	dZ
dZdZ ej                          fd       Z xZS )Aimv2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    ri   aimv2)imageT)r   r   rh   r   c                 $   t         |   |       t        |d      rYt        |j                  t
        j                        r4t        j                  |j                  t        j                  d             y y t        |t              r7t        j                  |j                  d| j                  j                         y t        |t               rZt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             y t        |t.              rZt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             y y )Nlogit_scaleg$I$I,@r<   )r   stdrp   rn   )r/   _init_weightshasattr
isinstancer   r   r   init	constant_mathlogr   normal_r   ri   r*   rh   copy_rm   rx   ry   r   rz   r   )r2   moduler4   s     r5   r   z"Aimv2PreTrainedModel._init_weights  s   f%6=)&,,bll;v11488H3EF < 9:LL))9V9VW 56JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r6   )r@   rA   rB   rC   rW   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnrx   no_gradr   rH   rI   s   @r5   r   r     sY    
 !&*# NU]]_
i 
ir6   r   zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc                        e Zd ZU eed<   dZeedZdef fdZ	de
j                  fdZ ed      ed	ee   defd
              Z xZS )Aimv2VisionModelri   r   r   
attentionsc                 6   t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        |j                  | _        | j                  rt        |      | _        | j                          y r   )r/   r0   ri   rh   
embeddingsr   encoderrd   r   r%   rt   r+   r   head	post_initr   s     r5   r0   zAimv2VisionModel.__init__  sq     /7#F+$V%7%79L9LM==1&9DIr6   r~   c                 .    | j                   j                  S r   )r   rs   r2   s    r5   get_input_embeddingsz%Aimv2VisionModel.get_input_embeddings  s    ***r6   Ftie_last_hidden_statesr3   c                     | j                  |      } | j                  dd|i|}|j                  }| j                  |      }| j                  r| j                  |      nd}t        ||      S )a3  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```inputs_embedsNlast_hidden_statepooler_outputr.   )r   r   r  rt   r+   r   r	   )r2   r   r3   r   encoder_outputsr  r  s          r5   r   zAimv2VisionModel.forward  sz    : 5+74<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
r6   )r@   rA   rB   r   r   main_input_namer   r   _can_record_outputsr0   r   Moduler   r   r   r   r   r	   r   rH   rI   s   @r5   r   r     sv     $O*$
0 +bii + u5*
 +,*
 
$	*
  6*
r6   r   zJ
    The text model from AIMv2 without any head or projection on top.
    c            	            e Zd ZdZeedZdef fdZde	j                  fdZd Z ed	      e	 ddej                   d
z  dee   defd              Z xZS )Aimv2TextModel	input_idsr   ri   c                     t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        |j                  | _        | j                          y r   )r/   r0   ri   r   r   r   r   rd   r   r%   rt   rM   r   r   s     r5   r0   zAimv2TextModel.__init__"  sa     -f5#F+$V%7%79L9LM"//r6   r~   c                 .    | j                   j                  S r   r   token_embeddingr   s    r5   r   z#Aimv2TextModel.get_input_embeddings-  s    ...r6   c                 &    || j                   _        y r   r  )r2   r   s     r5   set_input_embeddingsz#Aimv2TextModel.set_input_embeddings0  s    */'r6   Fr   Nr   r3   c                    | j                  |      }|j                  \  }}}t        j                  |t        j                  |j
                        }|j                  d      j                  |d      }	|t        | j                  ||	||d       } | j                  d	||d|}
|
j                  }| j                  |      }|t        j                  |j                  d   |j
                        |j                  t        j                  |j
                        | j                  k(  j                         j!                  d      f   }t#        ||      S )
Nr   r   rp   )ri   input_embedsrm   r   cache_positionpast_key_values)r   r   )r   r   r   r.   )r   r   rx   ry   longr   	unsqueezerz   r   ri   r   r  rt   torD   rM   argmaxr	   )r2   r	  r   r3   r   r   r   r   r  rm   r  r  pooled_outputs                r5   r   zAimv2TextModel.forward3  sN    	2!.!4!4
GQgUZZH\H\]%//299*bI%/{{*)-- $N '$,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
r6   r   )r@   rA   rB   r  r   r   r  rK   r0   r   r  r   r  r   r   rx   r   r   r   r	   r   rH   rI   s   @r5   r  r    s     "O +$
	 	/bii /0 u5 /3'
 t+'
 +,	'

 
$'
  6'
r6   r  c                       e Zd ZdZdefdZee	 	 	 ddej                  dz  dej                  dz  dej                  dz  dee   d	ef
d
              Zy)
Aimv2ModelTri   c                    t        j                  | |       |j                  | _        |j                  j                  | _        |j                  j                  | _        t        j                  |j                        | _
        t        j                  |j                        | _        t        j                  | j
                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j"                  t%        j&                  | j(                  j*                              | _        t/        j0                  |j2                        | _        | j7                          y )NFr   )r
   r0   rY   r^   r   vision_embed_dimr]   text_embed_dimr   _from_configvision_modelr  
text_modelr   r   visual_projectiontext_projectionr   rx   tensorri   rZ   r   r   r   r[   max_log_logit_scaler   )r2   ri   s     r5   r0   zAimv2Model.__init__c  s     v.$33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C r6   Nr	  r   r   r3   r~   c                     | j                   dd|i|} | j                  d||d|}|j                  }| j                  |      }|j                  }| j	                  |      }|t        |      z  }|t        |      z  }| j                  j                  d| j                        j                         j                  |j                        }	|	|z  |j                         z  }
|
j                         }t        ||
||||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r   )r	  r   r<   )logits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr.   )r  r   r  r!  r"  r   r   clampr$  expr  r   tr`   )r2   r	  r   r   r3   vision_outputstext_outputsr)  r(  r   r'  r&  s               r5   r   zAimv2Model.forwardu  s'   B 6GT5F5F 6
%6
6

 4C4?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
r6   )NNN)r@   rA   rB   r   rW   r0   r   r   rx   
LongTensorFloatTensorr   r   r   r`   r   r.   r6   r5   r  r  _  s    { $  .215.2	?
##d*?
 ''$.?
 t+	?

 +,?
 
?
  ?
r6   r  )rW   r   rK   r   r  r   r  )<rC   r   rx   torch.nn.functionalr   
functionalr    r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   processing_utilsr   utilsr   r   r   utils.genericr   clip.modeling_clipr   r   r   llama.modeling_llamar   r   siglip.configuration_siglipr   r   r   siglip.modeling_siglipr   r   r   r   rK   rW   r`   rd   rf   r  rh   r   r   r   r   r   r   r   r  r  __all__r.   r6   r5   <module>rB     s   ,      & / 9 K - & 
 0 P P 9 \ \ Q Qa * a HP & P f6$, 6$r	, 		< 		x 	1BII 1h	, 	X_ X2 2	= 			 D i? i iD 
E
+ E

E
P 
B
) B

B
J V
 V
 V
rr6   