
    i9                     0   d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'  e jP                  e)      Z*dejV                  dejV                  fdZ,dejV                  dejV                  fdZ-dejV                  dejV                  fdZ.e ed       G d de                    Z/e ed       G d d e                    Z0ee G d! d"e                    Z1 G d# d$ejd                        Z3 G d% d&ejd                        Z4	 dNd'ejd                  d(ejV                  d)ejV                  d*ejV                  d+ejV                  dz  d,e5d-e5d.ee   fd/Z6 G d0 d1ejd                        Z7 G d2 d3ejd                        Z8 G d4 d5e      Z9e G d6 d7e             Z: G d8 d9ejd                        Z; G d: d;ejd                        Z< ed<       G d= d>e:             Z= G d? d@ejd                        Z> edA       G dB dCe:             Z?e G dD dEe:             Z@e G dF dGe:             ZAe G dH dIe:             ZB edJ       G dK dLe:             ZCg dMZDy)OzPyTorch CLIP model.    )Callable)	dataclass)AnyN)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)check_model_inputs   )
CLIPConfigCLIPTextConfigCLIPVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr!   )r   s    p/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/clip/modeling_clip.pycontrastive_lossr(   /   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r(   t)r*   caption_loss
image_losss      r'   	clip_lossr/   3   s,    #J/L!*,,.1J:%,,r)   tensorc                     t        j                  | d      }t        j                  |dd      }t        j                  |d      }|S )z
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
       T)dimkeepdim      ?)r$   powsum)r0   square_tensor
sum_tensornormed_tensors       r'   _get_vector_normr<   9   s<    
 IIfa(M=b$?JIIj#.Mr)   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)CLIPVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r@   r$   FloatTensor__annotations__rA   rB   tuplerC    r)   r'   r?   r?   D   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r)   r?   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)CLIPTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsrA   .rB   rC   )rD   rE   rF   rG   rN   r$   rH   rI   rA   rB   rJ   rC   rK   r)   r'   rM   rM   V   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r)   rM   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)
CLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_textrN   r@   text_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))rT   rU   N)getattrto_tuple).0kselfs     r'   	<genexpr>z&CLIPOutput.to_tuple.<locals>.<genexpr>   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)rJ   keysr\   s   `r'   rY   zCLIPOutput.to_tuple   s#     
YY[
 
 	
r)   )rD   rE   rF   rG   rQ   r$   rH   rI   rR   rS   rN   r@   rT   r   rU   rJ   r   rY   rK   r)   r'   rP   rP   h   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r)   rP   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )CLIPVisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr2   r   position_idsr   r3   
persistent)super__init__rb   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr$   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr%   expandr\   rb   	__class__s     r'   rn   zCLIPVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr)   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr3   r6   r   r2   bicubicF)sizemodealign_cornersr4   )shaper|   weight	unsqueezer$   jit
is_tracingri   rr   r   reshapepermuter   r"   interpolateviewcat)r\   r   r   r   ry   r|   rz   class_pos_embedpatch_pos_embedr4   
new_height	new_widthsqrt_num_positionss                r'   interpolate_pos_encodingz-CLIPVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr)   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().)dtyper2   r   r3   r   )r   rq   
ValueErrorrx   r   r   toflatten	transposeru   r~   r$   r   r   r|   ri   )r\   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r'   forwardzCLIPVisionEmbeddings.forward   s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr)   F)rD   rE   rF   r   rn   r$   Tensorintr   rH   r   __classcell__r   s   @r'   ra   ra      se    q/ q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r)   ra   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
CLIPTextEmbeddingsrb   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nri   rj   Frk   )rm   rn   ro   r   r{   
vocab_sizetoken_embeddingmax_position_embeddingsr|   r}   r$   r%   r~   r\   rb   rp   r   s      r'   rn   zCLIPTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r)   N	input_idsri   inputs_embedsr   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nr3   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r|   r   r   ri   r   )r\   r   ri   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r'   r   zCLIPTextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r)   NNN)rD   rE   rF   r   rn   r$   
LongTensorrH   r   r   r   r   s   @r'   r   r      sj    

~ 

 .20426	##d* &&- ((4/	
 
r)   r   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr3   r   )r4   r   )ptrainingr   r2   )r$   matmulr   r   r"   softmaxfloat32r   r   r   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r'   eager_attention_forwardr     s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r)   c                        e Zd ZdZdeez  f fdZ	 d
dej                  dej                  dz  de	e
   deej                  ej                  dz  f   fd	Z xZS )CLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrb   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rm   rn   rb   ro   rp   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r'   rn   zCLIPAttention.__init__   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar)   NrB   r   r   r   c                    |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||d| j
                        j                  dd      }|j	                  ||d| j
                        j                  dd      }|	j	                  ||d| j
                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                  | j                  sdn| j                  d|\  }}|j                  ||d      j!                         }| j#                  |      }||fS )z#Input shape: Batch x Time x Channelr3   r   r2           )r   r   )r   r   r   r   r   r   r   r   get_interfacerb   _attn_implementationr   r   r   r   r   r   r   )r\   rB   r   r   r   r   rp   queriesr^   valuesattention_interfacer   r   s                r'   r   zCLIPAttention.forward4  sT    -:,?,?)
J	++m,{{=)]+,,z:r4==ISSTUWXYyyZT]]CMMaQRSZRGQQRSUVW(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
!\ "))*j"EPPRmmK0L((r)   N)rD   rE   rF   rG   r   r   rn   r$   r   r   r   rJ   r   r   r   s   @r'   r   r     st    GB/.@ B. /3$)||$) t+$) +,	$)
 
u||U\\D00	1$)r)   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CLIPMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )rm   rn   rb   r	   
hidden_actactivation_fnr   r   ro   intermediate_sizefc1fc2r   s     r'   rn   zCLIPMLP.__init__\  sd    #F$5$5699V//1I1IJ99V55v7I7IJr)   rB   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r\   rB   s     r'   r   zCLIPMLP.forwardc  s4    /**=9/r)   )rD   rE   rF   rn   r$   r   r   r   r   s   @r'   r   r   [  s$    KU\\ ell r)   r   c                        e Zd Zdeez  f fdZdej                  dej                  dee	   dej                  fdZ xZS )CLIPEncoderLayerrb   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y N)eps)rm   rn   ro   rp   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r'   rn   zCLIPEncoderLayer.__init__k  sl    ++&v.<<F<Q<QR6?<<F<Q<QRr)   rB   r   r   r   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rB   r   rK   )r   r   r   r   )r\   rB   r   r   residualr   s         r'   r   zCLIPEncoderLayer.forwards  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r)   )rD   rE   rF   r   r   rn   r$   r   r   r   rH   r   r   r   s   @r'   r   r   j  sV    S/.@ S||  +,	
 
		r)   r   c                   l    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZeedZ ej                          d        Zy)CLIPPreTrainedModelrb   clip)imagetextT)rB   rC   c                    | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t              r| j                   j                  }t	        j
                  |j                   d|j"                  dz  |z         t	        j
                  |j$                  j                  |j                   j&                  |z         t	        j
                  |j                  j                  |j                   j&                  |z         t	        j                  |j                  t        j                  |j(                        j                  d             nt        |t*              r| j                   j                  }|j"                  dz  d|j                   j,                  z  dz  z  |z  }|j"                  dz  |z  }t	        j
                  |j.                  j                  |       t	        j
                  |j0                  j                  |       t	        j
                  |j2                  j                  |       t	        j
                  |j4                  j                  |       nt        |t6              r| j                   j                  }|j                   j8                  dz  d|j                   j,                  z  dz  z  |z  }d|j                   j8                  z  dz  |z  }t	        j
                  |j:                  j                  |       t	        j
                  |j<                  j                  |       nt        |t>              rt	        j
                  |j@                  j                  |jB                  dz  | j                   j                  z         t	        j
                  |jD                  j                  |jF                  dz  | j                   j                  z         nGt        |tH              rZt	        j
                  |jD                  j                  | j                   j8                  dz  | j                   j                  z         nt        |tJ              rZt	        j
                  |j@                  j                  | j                   j8                  dz  | j                   j                  z         nst        |tL              rct	        j
                  |jN                  j                  | j                   jP                  j8                  dz  | j                   j                  z         t        |tR        jT                        r>t	        jV                  |jX                         t	        jZ                  |j                         t        |tR        j\                        r-|jX                   t	        jV                  |jX                         y	y	y	)
zInitialize the weightsr   g{Gz?)meanstdr3   rj   r   )r  r2   N)/rb   initializer_factor
isinstancer   initnormal_r   r   r|   copy_ri   r$   r%   r   r~   ra   ru   rp   rx   initializer_rangerz   r   num_hidden_layersr   r   r   r   r   ro   r   r   	CLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimCLIPVisionModelWithProjectionCLIPTextModelWithProjectionCLIPForImageClassification
classifiervision_configr   r   zeros_rh   ones_r   )r\   r   factorin_proj_stdout_proj_stdfc_stds         r'   _init_weightsz!CLIPPreTrainedModel._init_weights  su    //f01LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 45[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_.[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B([[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<	*LL&&--))4/$++2P2PP LL((//++T1DKK4R4RR  =>LL((//KK++T1DKK4R4RR  ;<LL&&--KK++T1DKK4R4RR  :;LL!!((KK--994?$++B`B``
 fbll+KK$JJv}}%fbii(V[[-DKK$ .E(r)   N)rD   rE   rF   r   rI   base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr$   no_gradr  rK   r)   r'   r   r     s[    (&*#N"&)#
 U]]_8% 8%r)   r   c                   `     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
fdZ xZS )
CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    rb   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
rm   rn   rb   r   
ModuleListranger  r   layersgradient_checkpointing)r\   rb   r   r   s      r'   rn   zCLIPEncoder.__init__  sO    mmuVMeMeGf$g!%5f%=$gh&+# %hs   A#Nr   r   r   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a7  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        )rA   )r,  r   )r\   r   r   r   rB   encoder_layers         r'   r   zCLIPEncoder.forward  sH    ( &![[ 	M) M	 +
 	
r)   r   )rD   rE   rF   rG   r   rn   r$   r   r   r   r   r   r   r   s   @r'   r'  r'    sK    ,z , /3
 t+
 +,	

 

r)   r'  c                        e Zd Zdef fdZe	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dee	   de
f
d	       Z xZS )CLIPTextTransformerrb   c                     t         |           || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        |j                  | _        y r   )rm   rn   rb   ro   r   r   r'  encoderr   r   r   final_layer_normeos_token_idr   s      r'   rn   zCLIPTextTransformer.__init__  sa    &&	,V4"6* "YF<Q<Q R #//r)   Nr   r   ri   r   r   c           	         |t        d      |j                         }|j                  d|d         }| j                  ||      }t	        | j
                  ||t        j                  |j                  d   |j                        d       }|j                  dd         | j                  d||dd	|}|j                  }| j                  |      }| j                  d
k(  rm|t        j                  |j                  d   |j                        |j                  t        j                   |j                        j#                  d      f   }	n|t        j                  |j                  d   |j                        |j                  t        j                   |j                        | j                  k(  j!                         j#                  d      f   }	t%        ||	|j&                  |j(                        S )NzYou have to specify input_idsr3   )r   ri   r   r    )rb   input_embedsr   cache_positionpast_key_valuesr   T)r   r   r   r2   r   )r   r!   r   rA   pooler_outputrB   rC   rK   )r   r   r   r   r
   rb   r$   r%   r   r!   popr3  rA   r4  r5  r   r   argmaxr   rB   rC   )
r\   r   r   ri   r   input_shaperB   encoder_outputsrA   pooled_outputs
             r'   r   zCLIPTextTransformer.forward  s    <==nn&NN2{27	),W+;;&) <<(;(;A(>}G[G[\ 
 	

;%+74<< ,
'),
 	,
 ,== 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M */')77&11	
 	
r)   r   )rD   rE   rF   r   rn   r   r$   r   r   r   r   r   r   r   s   @r'   r1  r1    s    	0~ 	0  *..2,0	=
<<$&=
 t+=
 llT)	=

 +,=
 
$=
 =
r)   r1  zI
    The text model from CLIP without any head or projection on top.
    c                        e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Z ed	
      e	 	 	 ddej                  dz  dej                  dz  dej                  dz  dee   def
d              Z xZS )CLIPTextModelrb   r  r   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rm   rn   r1  
text_model	post_initr   s     r'   rn   zCLIPTextModel.__init__^  s&     -f5r)   r   c                 B    | j                   j                  j                  S r   rE  r   r   r_   s    r'   get_input_embeddingsz"CLIPTextModel.get_input_embeddingsd      ))999r)   c                 :    || j                   j                  _        y r   rH  r\   r   s     r'   set_input_embeddingsz"CLIPTextModel.set_input_embeddingsg      5:""2r)   Ftie_last_hidden_statesNr   r   ri   r   c                 .     | j                   d|||d|S )a9  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   ri   rK   )rE  )r\   r   r   ri   r   s        r'   r   zCLIPTextModel.forwardj  s/    2 t 
)%
 	
 	
r)   r   )rD   rE   rF   r   rI   r  _no_split_modulesrn   r   ModulerI  rM  r   r   r$   r   r   r   r   r   r   r   s   @r'   rB  rB  S  s      -/AB~ :bii :; u5 *..2,0	
<<$&
 t+
 llT)	

 +,
 
$
  6
r)   rB  c            
       r     e Zd Zdef fdZe	 	 d	dej                  dz  dedz  de	e
   defd       Z xZS )
CLIPVisionTransformerrb   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rm   rn   rb   ro   ra   r   r   r   r   pre_layrnormr'  r3  post_layernormr   s      r'   rn   zCLIPVisionTransformer.__init__  sj    &&	.v6LL8M8MN"6* ll9&:O:OPr)   Nr   r   r   r   c                 $   |t        d      | j                  ||      }| j                  |      } | j                  dd|i|}|j                  }|d d dd d f   }| j                  |      }t        |||j                  |j                        S )Nz You have to specify pixel_values)r   r   r   r:  rK   )	r   r   rX  r3  rA   rY  r   rB   rC   )r\   r   r   r   rB   r?  rA   r@  s           r'   r   zCLIPVisionTransformer.forward  s     ?@@Ogh))-8+74<< ,
',
,

 ,==)!Q'2++M:)/')77&11	
 	
r)   r)  )rD   rE   rF   r   rn   r   r$   rH   boolr   r   r   r   r   r   s   @r'   rV  rV    si    Q/ Q  2605
''$.
 #'+
 +,	

 
$
 
r)   rV  zK
    The vision model from CLIP without any head or projection on top.
    c                        e Zd ZU eed<   dZdZdgZdef fdZde	j                  fdZ ed	      e	 	 ddej                  d
z  dedee   defd              Z xZS )CLIPVisionModelrb   r   r   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rm   rn   rV  vision_modelrF  r   s     r'   rn   zCLIPVisionModel.__init__  s'     1&9r)   r   c                 B    | j                   j                  j                  S r   r`  r   rx   r_   s    r'   rI  z$CLIPVisionModel.get_input_embeddings        ++;;;r)   FrO  Nr   r   c                 ,     | j                   d||d|S )a(  
        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   rK   )r`  )r\   r   r   r   s       r'   r   zCLIPVisionModel.forward  s.    > !t   
%%=
 
 	
r)   r)  )rD   rE   rF   r   rI   main_input_namer  rS  rn   r   rT  rI  r   r   r$   rH   r[  r   r   r   r   r   r   s   @r'   r]  r]    s     $O!+,/ <bii < u5 26).!
''$.!
 #'!
 +,	!

 
$!
  6!
r)   r]  c                       e Zd ZU eed<   g dZdef fdZee	 	 dde	j                  de	j                  dz  de	j                  dz  dee   d	eez  f
d
              Zee	 dde	j                   dedee   d	eez  fd              Zee	 	 	 	 	 	 dde	j&                  dz  de	j                   dz  de	j                  dz  de	j&                  dz  dedz  dedee   d	efd              Z xZS )r  rb   )r   r   ra   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        |j                  | _        t        j                  |      }|j                  | _        t         j                  |      }|j"                  | _        t%        j&                  | j                  | j                  d      | _        t%        j&                  | j                  | j                  d      | _        t%        j,                  t/        j0                  | j2                  j4                              | _        | j9                          y )NzKconfig.text_config is expected to be of type CLIPTextConfig but is of type .zOconfig.vision_config is expected to be of type CLIPVisionConfig but is of type Frh   )rm   rn   r  text_configr   	TypeErrortyper  r   projection_dimro   r  r  rB  _from_configrE  r]  r`  r   r   r  r  rs   r$   r0   rb   logit_scale_init_valuelogit_scalerF  )r\   rb   rk  r  rE  r`  r   s         r'   rn   zCLIPModel.__init__  sx    &,,n=++,-Q0 
 &..0@A--./q2 
 ((,,$33)55 - 9 9"//<
$//&33MB(55!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r)   Nr   r   ri   r   r   c                 x     | j                   d|||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   ri   return_dictrK   )rE  r;  r  )r\   r   r   ri   r   text_outputsr@  s          r'   get_text_featureszCLIPModel.get_text_features  sV    0 4C4?? 4
)%	4

 4
 %22%)%9%9-%H"r)   r   r   c                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)r   r   rs  rK   )r`  r;  r  )r\   r   r   r   vision_outputsr@  s         r'   get_image_featureszCLIPModel.get_image_features:  sU    6 6GT5F5F 6
%%=6
 	6
 '44'+'='=m'L$r)   return_lossc           	      L    | j                   d||d|} | j                  d|||d|}	|j                  }
| j                  |
      }
|	j                  }| j	                  |      }|
t        |
      z  }
|t        |      z  }t        j                  ||
j                         j                  |j                              }|| j                  j                         j                  |j                        z  }|j                         }d}|rt        |      }t        |||||
|	|      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```re  rR  N)rQ   rR   rS   rN   r@   rT   rU   rK   )r`  rE  r;  r  r  r<   r$   r   r,   r   r!   rq  expr/   rP   )r\   r   r   r   ri   ry  r   r   rw  rt  r@   rN   rS   rR   rQ   s                  r'   r   zCLIPModel.forward`  sI   L 6GT5F5F 6
%%=6
 6
 4C4?? 4
)%4
 	4
 &33--l;"00**;7 $&6|&DD!$4[$AA  ,,{LNN4D4G4GHZHZ4[\)D,<,<,@,@,B,E,EkFXFX,YY*,,._-D-+#%* .
 	
r)   NNr   )NNNNNF)rD   rE   rF   r   rI   rS  rn   r   r   r$   r   r   r   rJ   r   ru  rH   r[  rx  r   rP   r   r   r   s   @r'   r  r    s   Z!z !F  /3,0	 <<  t+  llT)	 
 +,  
+	+    D  */"''" #'" +,	"
 
+	+"  "H  .215.204#').M
##d*M
 ''$.M
 t+	M

 &&-M
 D[M
 #'M
 +,M
 
M
  M
r)   r  c                        e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Z ed	
      e	 	 	 ddej                  dz  dej                  dz  dej                  dz  dee   def
d              Z xZS )r  rb   rC  r   r   c                     t         |   |       t        j                  |      }|j                  | _        t        j                  |j                  |j                  d      | _	        | j                          y NFrj  )rm   rn   rB  ro  rE  r   r   ro   rn  r  rF  )r\   rb   rE  r   s      r'   rn   z$CLIPTextModelWithProjection.__init__  s[     "//7
$//!yy););V=R=RY^_ 	r)   r   c                 B    | j                   j                  j                  S r   rH  r_   s    r'   rI  z0CLIPTextModelWithProjection.get_input_embeddings  rJ  r)   c                 :    || j                   j                  _        y r   rH  rL  s     r'   rM  z0CLIPTextModelWithProjection.set_input_embeddings  rN  r)   FrO  Nr   r   ri   r   c                      | j                   d|||d|}|j                  }| j                  |      }t        ||j                        S )a@  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```rR  )rN   rA   rK   )rE  r;  r  rM   rA   )r\   r   r   ri   r   rt  r@  rN   s           r'   r   z#CLIPTextModelWithProjection.forward  sc    4 4C4?? 4
)%4
 	4
 %22**=9"#*<<
 	
r)   r   )rD   rE   rF   r   rI   r  rS  rn   r   rT  rI  rM  r   r   r$   r   r   r   rM   r   r   r   s   @r'   r  r    s     -/AB	~ 	:bii :; u5 *..2,0	$
<<$&$
 t+$
 llT)	$

 +,$
 
$
  6$
r)   r  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
 ed      e	 	 ddej                  d	z  d
edee   defd              Z xZS )r  rb   r   r^  c                     t         |   |       t        j                  |      }|j                  | _        t        j                  |j                  |j                  d      | _	        | j                          y r  )rm   rn   r]  ro  r`  r   r   ro   rn  r  rF  r\   rb   r`  r   s      r'   rn   z&CLIPVisionModelWithProjection.__init__  s\     &33F;(55!#6+=+=v?T?T[`!a 	r)   r   c                 B    | j                   j                  j                  S r   rb  r_   s    r'   rI  z2CLIPVisionModelWithProjection.get_input_embeddings  rc  r)   FrO  Nr   r   c                      | j                   d||d|}|j                  }| j                  |      }t        ||j                        S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
        >>> from transformers.image_utils import load_image

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```re  )r@   rA   rK   )r`  r;  r  r?   rA   )r\   r   r   r   rw  r@  r@   s          r'   r   z%CLIPVisionModelWithProjection.forward  sb    : 6GT5F5F 6
%%=6
 6

 '44--m<$%,>>
 	
r)   r)  )rD   rE   rF   r   rI   rf  r  rn   r   rT  rI  r   r   r$   rH   r[  r   r   r?   r   r   r   s   @r'   r  r    s    $O!	/ 	<bii < u5 26).&
''$.&
 #'&
 +,	&

 
&
  6&
r)   r  z
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                        e Zd ZdZdZdeddf fdZ ed      e	 	 dde	j                  dz  d	e	j                  dz  d
ee   defd              Z xZS )r  r   r^  rb   r   Nc                 ~   t         |   |       |j                  | _        t        j	                  |j
                        }|j                  | _        |j                  dkD  r4t        j                  |j
                  j                  |j                        nt        j                         | _        | j                          y )Nr   )rm   rn   
num_labelsr]  ro  r  r`  r   r   ro   Identityr  rF  r  s      r'   rn   z#CLIPForImageClassification.__init__<  s      ++&33F4H4HI(55 OUN_N_bcNcBIIf**668I8IJikititiv 	
 	r)   FrO  labelsr   c                     | j                   |fi |}|j                  }t        j                  |ddddddf   d      }| j	                  |      }d}|| j                  ||| j                        }t        ||      S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   )rQ   r   )r`  rA   r$   r  r  loss_functionrb   r   )r\   r   r  r   outputssequence_outputr   rQ   s           r'   r   z"CLIPForImageClassification.forwardK  s     /@d.?.?/
/

 "33**_QAX%>AF1%%ffdkkBD$
 	
r)   r|  )rD   rE   rF   rf  r  r   rn   r   r   r$   r   r   r   r   r   r   r   s   @r'   r  r  2  s     %O!z d  u5 -1&*
llT)
 t#
 +,	

 

  6
r)   r  )r  r   rB  r  r]  r  r  )r   )ErG   collections.abcr   dataclassesr   typingr   r$   r    r   r  activationsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   configuration_clipr   r   r   
get_loggerrD   loggerr   r(   r/   r<   r?   rM   rP   rT  ra   r   floatr   r   r   r   r   r'  r1  rB  rV  r]  r  r  r  r  __all__rK   r)   r'   <module>r     s    $ !    & ! / 9 b b F &  0 L L 
		H	%
`U\\ `ell `-%,, -5<< -U\\ ell  
	<K 	< 	< 
	<+ 	< 	<  
  
   
FP299 Pf% %^ %II%<<% 
% <<	%
 LL4'% % % '(%*;)BII ;)|bii 1 B G%/ G% G%T-
")) -
`J
")) J
Z 
0
' 0

0
f&
BII &
R 
2
) 2

2
j @
# @
 @
F =
"5 =
 =
@ ;
$7 ;
 ;
| 2
!4 2
2
jr)   