
from collections.abc import Callable

import torch
from torch import nn

from ... import initialization as init
from ...configuration_utils import PreTrainedConfig
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging
from ...utils.generic import check_model_inputs
from ..clip.modeling_clip import (
    CLIPMLP,
    CLIPAttention,
    CLIPEncoder,
    CLIPEncoderLayer,
    CLIPVisionEmbeddings,
    CLIPVisionModel,
    CLIPVisionTransformer,
)
from ..llama.modeling_llama import eager_attention_forward
from ..qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding, apply_rotary_pos_emb_vision


logger = logging.get_logger(__name__)


class MLCDVisionConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing an MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing an MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
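
    >>> # Sizing sketch (editorial addition, not upstream doc text): the default
    >>> # 336px images and 14px patches imply a 24x24 patch grid plus one class token.
    >>> side = configuration.image_size // configuration.patch_size
    >>> print(side, side**2 + 1)
    24 577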
    ```"""

    model_type = "mlcd_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1664,
        intermediate_size=8192,
        projection_dim=1024,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_channels=3,
        image_size=336,
        patch_size=14,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_groups = 1
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class MLCDMLP(CLIPMLP):
    pass


class MLCDRotaryEmbedding(VisionRotaryEmbedding):
    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        """
        # Position ids for each patch along the height and width of the grid
        hpos_ids = (
            torch.arange(num_patches_height, device=self.inv_freq.device)
            .unsqueeze(1)
            .expand(-1, num_patches_width)
        )
        wpos_ids = (
            torch.arange(num_patches_width, device=self.inv_freq.device)
            .unsqueeze(0)
            .expand(num_patches_height, -1)
        )
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)

        # Build the frequency table for the largest grid side, then gather one row per position
        max_grid_size = max(num_patches_height, num_patches_width)
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)

        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb


class MLCDVisionEmbeddings(CLIPVisionEmbeddings):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        # MLCD uses rotary embeddings instead of learned absolute position embeddings
        del self.position_embedding

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        return embeddings


class MLCDAttention(CLIPAttention):
    """Multi-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
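
    Shape sketch (editorial addition, assuming the default bigG geometry: 336px inputs,
    14px patches, hidden size 1664, 16 heads, head_dim 104): `MLCDRotaryEmbedding` returns
    a (576, 52) half-dimension frequency table; `MLCDVisionTransformer` prepends the learned
    class-token row and duplicates the last dim, so the cos/sin tables received here are
    (577, 104) and rotate queries/keys of shape (batch, 577, 16, 104) through
    `apply_rotary_pos_emb_vision`.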
    rf   c                 T    t         |   |       |j                  | _        d| _        y )NF)r!   r"   r'   	is_causalrj   s     r3   r"   zMLCDAttention.__init__   s%     $*$?$?!r4   Nhidden_statesposition_embeddingsattention_maskr1   rJ   c                    |j                   d d \  }}| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }	|d   j                  d      j                         }
|d   j                  d      j                         }t        |||
|      \  }}|j                  dddd      j                         }|j                  dddd      j                         }|	j                  dddd      j                         }	t        j                  | j                  j                  t               } || |||	|f| j"                  sdn| j$                  | j&                  | j(                  d|\  }}|j                  dddd      j                         }|j+                  ||d      }| j-                  |      }|j                  ddd      j                         }||fS )NrM   r   r7   r   r   r:   )dropoutscalingr~   )rn   q_projreshape	num_headshead_dimk_projv_projrT   floatr   permute
contiguousr	   get_interfacerf   _attn_implementationr   trainingr   scaler~   viewout_proj)r0   r   r   r   r1   ru   
seq_lengthquery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  r3   ra   zMLCDAttention.forward   s;    "/!4!4Sb!9
J {{=199:zSWSaSacgcpcp:qr[[/77ZQUQ_Q_aeanan8op
{{=199:zSWSaSacgcpcp:qr "!$..q1779!!$..q1779#>|ZY\^a#b j $++Aq!Q7BBD''1a3>>@
#++Aq!Q7BBD(?(M(MKK,,.E)
 %8
%
  $}}C$,,JJnn
%
 
%
!\ "))!Q15@@B!&&z:rBmmK0!))!Q2==?L((r4   rh   )r<   r=   r>   r?   r   r"   rQ   rc   tupler   r   ra   rB   rC   s   @r3   r|   r|      s    /  /3	,)||,) #5<<#=>,) t+	,)
 +,,) 
u||U\\D00	1,)r4   r|   c                        e Zd Zdef fdZ	 d
dej                  deej                  ej                  f   dej                  dz  dee	   deej                     f
d	Z xZS )MLCDEncoderLayerrf   c                 D    t         |   |       t        |      | _        y rh   )r!   r"   r|   	self_attnrj   s     r3   r"   zMLCDEncoderLayer.__init__   s     &v.r4   Nr   r   r   r1   rJ   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
        """
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class MLCDEncoder(CLIPEncoder):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__(config)

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of raw `pixel_values`, you can directly pass an embedded
                representation: the patch embeddings with the class token prepended, as produced by
                `MLCDVisionEmbeddings`, bypassing the model's internal patch embedding layer.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        """
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                position_embeddings,
                attention_mask,
                **kwargs,
            )
        return BaseModelOutput(last_hidden_state=hidden_states)


@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config: MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    accepts_loss_kwargs = False
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": MLCDEncoderLayer,
        "attentions": MLCDAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = module.embed_dim**-0.5 * factor
            init.normal_(module.q_proj.weight, std=in_proj_std)
            init.normal_(module.k_proj.weight, std=in_proj_std)
            init.normal_(module.v_proj.weight, std=in_proj_std)
            init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            init.normal_(module.fc1.weight, std=fc_std)
            init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            init.zeros_(module.bias)
        elif isinstance(module, MLCDRotaryEmbedding):
            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
            module.inv_freq.copy_(inv_freq)


class MLCDVisionTransformer(CLIPVisionTransformer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Grid-aware rotary tables: one row per patch plus a learned class-token row
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            **kwargs,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


class MLCDVisionModel(CLIPVisionModel):
    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        Example:

        ```python
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```"""
        return self.vision_model(pixel_values=pixel_values, **kwargs)


__all__ = ["MLCDVisionConfig", "MLCDVisionModel", "MLCDPreTrainedModel"]