
    i                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)  e"jT                  e+      Z,de	jZ                  de	jZ                  fdZ.de	jZ                  de	jZ                  fdZ/ee  G d de                    Z0ee  G d de                    Z1ee  G d de                    Z2 G d de
jf                        Z4 G d  d!e
jf                        Z5	 dGd"e
jf                  d#e	jZ                  d$e	jZ                  d%e	jZ                  d&e	jZ                  dz  d'e6d(e6fd)Z7 G d* d+e
jf                        Z8 G d, d-e
jf                        Z9 G d. d/e      Z:e  G d0 d1e             Z; G d2 d3e
jf                        Z< G d4 d5e
jf                        Z= G d6 d7e;      Z> G d8 d9e
jf                        Z? G d: d;e;      Z@e  G d< d=e;             ZA G d> d?e
jf                        ZB G d@ dAe;      ZC e dBC       G dD dEe;             ZDg dFZEy)HzPyTorch CLIPSeg model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)is_flash_attention_requested   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr!   )r   s    v/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_lossr(   *   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r(   t)r*   caption_loss
image_losss      r'   clipseg_lossr/   /   s,    #J/L!*,,.1J:%,,r)   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)CLIPSegOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r7   r8   Ngetattrto_tuple.0kselfs     r'   	<genexpr>z)CLIPSegOutput.to_tuple.<locals>.<genexpr>U   s=      
  LLDGRYZ^`aRbRkRkRmm
   -0tuplekeysrA   s   `r'   r=   zCLIPSegOutput.to_tupleT   #     
YY[
 
 	
r)   )__name__
__module____qualname____doc__r2   r$   FloatTensor__annotations__r3   r4   r5   r6   r7   r   r8   rE   r   r=    r)   r'   r1   r1   5   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r)   r1   c                       e Zd ZU dZdZej                  dz  ed<   dZe	ej                     dz  ed<   dZ
e	ej                     dz  ed<   y)CLIPSegDecoderOutputz|
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    Nr   hidden_states
attentions)rI   rJ   rK   rL   r   r$   rM   rN   rR   rE   rS   rO   r)   r'   rQ   rQ   [   sR    
 (,FE$+59M5**+d2926Je''(4/6r)   rQ   c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZeed<   dZeed<   d	ee   fd
Zy)CLIPSegImageSegmentationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    Nr2   r   conditional_embeddingspooled_outputr8   decoder_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r8   rX   Nr;   r>   s     r'   rB   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>   s<      
  IIDGwW[]^O_OhOhOjj
rC   rD   rG   s   `r'   r=   z'CLIPSegImageSegmentationOutput.to_tuple   rH   r)   )rI   rJ   rK   rL   r2   r$   rM   rN   r   rV   rW   r8   r   rX   rQ   rE   r   r=   rO   r)   r'   rU   rU   h   s     &*D%

d
")'+FE$+7;E--4;.2M5$$t+26:3:+/N(/
%* 
r)   rU   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )CLIPSegVisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__r]   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr$   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr%   expandrA   r]   	__class__s     r'   rk   z CLIPSegVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr)   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nrg   g      ?r   rd   bicubicF)sizemodealign_cornersdim)shapery   weight	unsqueezer$   jit
is_tracingre   ro   r   reshapepermuter   r"   interpolateviewcat)rA   r~   r   r   rv   ry   rw   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r'   interpolate_pos_encodingz0CLIPSegVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr)   pixel_valuesc                     |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  |      }|j	                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }	|r|	| j                  |	||      z   }	|	S |	| j                  | j                        z   }	|	S )	NzInput image size (*z) doesn't match model ().rd   r   rg   r   )r   rn   
ValueErrorru   flatten	transposerr   r{   r$   r   r   ry   re   )
rA   r   r   
batch_size_r   r   patch_embedsclass_embedsr~   s
             r'   forwardzCLIPSegVisionEmbeddings.forward   s   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++L9#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr)   T)rI   rJ   rK   r   rk   r$   Tensorintr   rM   r   __classcell__r}   s   @r'   r\   r\      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Y^YeYe r)   r\   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
CLIPSegTextEmbeddingsr]   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nre   rf   Frh   )rj   rk   rl   r   rx   
vocab_sizetoken_embeddingmax_position_embeddingsry   rz   r$   r%   r{   rA   r]   rm   r}   s      r'   rk   zCLIPSegTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r)   N	input_idsre   inputs_embedsr   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nrg   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   ry   r   r   re   r   )rA   r   re   r   
seq_lengthmax_position_embeddingposition_embeddingsr~   s           r'   r   zCLIPSegTextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r)   )NNN)rI   rJ   rK   r   rk   r$   
LongTensorrM   r   r   r   r   s   @r'   r   r      sk    

0 

 .20426	##d* &&- ((4/	
 
r)   r   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nrg   r   )r   dtype)ptrainingr   rd   )r$   matmulr   r   r"   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r'   eager_attention_forwardr     s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r)   c                        e Zd ZdZdeez  f fdZ	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
ej                  ej                  dz  f   f
d
Z xZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr]   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rj   rk   r]   rl   rm   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr|   s     r'   rk   zCLIPSegAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar)   NrR   r   causal_attention_maskoutput_attentionsr   c           
         |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
t        | j                        s||||z   }n||}n	|du| _
        t        j                  | j                  j                  t              } || ||	|
|| j                  | j                  | j                   sdn| j"                        \  }}|j%                  |||      j'                         }| j)                  |      }|sd}||fS )z#Input shape: Batch x Time x Channelr   rd   N        )r   r   r   )r   r   r   r   r   r   r   r   r   r]   r   r   get_interface_attn_implementationr   r   r   r   r   r   r   )rA   rR   r   r   r   r   r   rm   queriesrF   valuesattention_interfacer   r   s                 r'   r   zCLIPSegAttention.forward3  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc ,DKK8).C.O!/2G!G&2!62$>DN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r)   )NNF)rI   rJ   rK   rL   r   r   rk   r$   r   boolrE   r   r   r   s   @r'   r   r     s    GB25FF B. /359).0)||0) t+0)  %||d2	0)
  $;0) 
u||U\\D00	10)r)   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
CLIPSegMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y N)rj   rk   r]   r	   
hidden_actactivation_fnr   r   rl   intermediate_sizefc1fc2r|   s     r'   rk   zCLIPSegMLP.__init__h  sd    #F$5$5699V//1I1IJ99V55v7I7IJr)   rR   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )rA   rR   s     r'   r   zCLIPSegMLP.forwardo  s4    /**=9/r)   )rI   rJ   rK   rk   r$   r   r   r   r   s   @r'   r   r   g  s$    KU\\ ell r)   r   c                        e Zd Zdef fdZ	 d
dej                  dej                  dej                  dedz  deej                     f
d	Z
 xZS )CLIPSegEncoderLayerr]   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y N)epsrj   rk   rl   rm   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r|   s     r'   rk   zCLIPSegEncoderLayer.__init__x  m    ++)&1<<F<Q<QRf%<<F<Q<QRr)   rR   r   r   r   Nr   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rR   r   r   r   )r   r   r   r   rA   rR   r   r   r   residualr   outputss           r'   r   zCLIPSegEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
#| !=0 ((7/ =0 "&Gr)   F)rI   rJ   rK   r   rk   r$   r   r   rE   rM   r   r   r   s   @r'   r   r   w  sf    S} S */&||& &  %||	&
  $;& 
u  	!&r)   r   c                   R    e Zd ZU eed<   dZdZdZ ej                         d        Z
y)CLIPSegPreTrainedModelr]   clip)imagetextTc                 
   | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t              r| j                   j                  }t	        j
                  |j                   d|j"                  dz  |z         t	        j
                  |j$                  j                  |j                   j&                  |z         t	        j
                  |j                  j                  |j                   j&                  |z         t	        j                  |j                  t        j                  |j(                        j                  d             nt        |t*              r| j                   j                  }|j"                  dz  d|j                   j,                  z  dz  z  |z  }|j"                  dz  |z  }t	        j
                  |j.                  j                  |       t	        j
                  |j0                  j                  |       t	        j
                  |j2                  j                  |       t	        j
                  |j4                  j                  |       nt        |t6              r| j                   j                  }|j                   j8                  dz  d|j                   j,                  z  dz  z  |z  }d|j                   j8                  z  dz  |z  }t	        j
                  |j:                  j                  |       t	        j
                  |j<                  j                  |       nt        |t>              rt	        j
                  |j@                  j                  |jB                  dz  | j                   j                  z         t	        j
                  |jD                  j                  |jF                  dz  | j                   j                  z         t        |tH        jJ                        r>t	        jL                  |jN                         t	        jP                  |j                         t        |tH        jR                        r-|jN                   t	        jL                  |jN                         y	y	y	)
zInitialize the weightsr   g{Gz?)meanstdrg   rf   r   )r	  rd   N)*r]   initializer_factor
isinstancer   initnormal_r   r   ry   copy_re   r$   r%   r   r{   r\   rr   rm   ru   initializer_rangerw   r   num_hidden_layersr   r   r   r   r   rl   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   r   zeros_rc   ones_r   )rA   r   factorin_proj_stdout_proj_stdfc_stds         r'   _init_weightsz$CLIPSegPreTrainedModel._init_weights  s    //f34LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 78[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_ 01[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B
+[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<-LL&&--))4/$++2P2PP LL((//++T1DKK4R4RR
 fbll+KK$JJv}}%fbii(V[[-DKK$ .E(r)   N)rI   rJ   rK   r   rN   base_model_prefixinput_modalitiessupports_gradient_checkpointingr$   no_gradr  rO   r)   r'   r  r    s4    (&*#U]]_)% )%r)   r  c                        e Zd ZdZdef fdZe	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  de	dz  d	e	dz  d
e
ez  fd       Z xZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    r]   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rj   rk   r]   r   
ModuleListranger  r   layersgradient_checkpointing)rA   r]   r   r}   s      r'   rk   zCLIPSegEncoder.__init__  sP    mm%PVPhPhJi$jQ%8%@$jk&+# %ks   A#Nr   r   r   output_hidden_statesreturn_dictr   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]*  \  }
}|r||	fz   } ||	|||      }|d   }	|s"||d   fz   }, |r||	fz   }t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrO   )r   r   r   )last_hidden_staterR   rS   )r]   r   r(  use_return_dict	enumerater&  r   )rA   r   r   r   r   r(  r)  encoder_statesall_attentionsrR   idxencoder_layerlayer_outputss                r'   r   zCLIPSegEncoder.forward  s    N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B)%"3	M *!,M !/=3C2E!E	F  +}.>>N+>Vd
 	
r)   NNNNN)rI   rJ   rK   rL   r   rk   r   r$   r   r   rE   r   r   r   r   s   @r'   r"  r"    s    ,} ,  /359)-,0#'C
 t+C
  %||d2	C

  $;C
 #TkC
 D[C
 
	 C
 C
r)   r"  c                        e Zd Zdef fdZe	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	edz  d
e	e
z  fd       Z xZS )CLIPSegTextTransformerr]   c                     t         |           || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        |j                  | _        y r   )rj   rk   r]   rl   r   r~   r"  encoderr   r   r   final_layer_normeos_token_idr   s      r'   rk   zCLIPSegTextTransformer.__init__5  sa    &&	/7%f- "YF<Q<Q R #//r)   Nr   r   re   r   r(  r)  r   c                 Z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |j                         }|j                  d|d         }| j                  ||      }t        ||j                  |j                        }	|t        ||j                        }| j                  |||	|||      }
|
d   }| j                  |      }| j                  dk(  rm|t        j                   |j"                  d   |j                        |j%                  t        j&                  |j                        j)                  d	      f   }n|t        j                   |j"                  d   |j                        |j%                  t        j&                  |j                        | j                  k(  j'                         j)                  d	      f   }|s
||f|
d
d  z   S t+        |||
j,                  |
j.                        S )NzYou have to specify input_idsrg   )r   re   r    )r   r   r   r   r(  r)  r   rd   )r   r!   r   r   r+  pooler_outputrR   rS   )r]   r   r(  r,  r   r   r   r~   r
   r   r!   r   r7  r8  r9  r$   r%   r   r   r   argmaxr   rR   rS   )rA   r   r   re   r   r(  r)  input_shaperR   r   encoder_outputsr+  rW   s                r'   r   zCLIPSegTextTransformer.forward@  s.    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]<==nn&NN2{27	),W !A,,]5I5I!
 %7H[H[\N,,')"7/!5# ' 
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M %}58KKK)/')77&11	
 	
r)   NNNNNN)rI   rJ   rK   r   rk   r   r$   r   r   rE   r   r   r   r   s   @r'   r5  r5  4  s    	00 	0  *..2,0)-,0#'K
<<$&K
 t+K
 llT)	K

  $;K
 #TkK
 D[K
 
+	+K
 K
r)   r5  c                       e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Ze	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  ded	z  ded	z  deez  fd       Z xZS )CLIPSegTextModelr]   )r  r   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rj   rk   r5  
text_model	post_initr|   s     r'   rk   zCLIPSegTextModel.__init__  s&     08r)   r   c                 B    | j                   j                  j                  S r   rD  r~   r   rG   s    r'   get_input_embeddingsz%CLIPSegTextModel.get_input_embeddings  s    ))999r)   c                 :    || j                   j                  _        y r   rG  )rA   r   s     r'   set_input_embeddingsz%CLIPSegTextModel.set_input_embeddings  s    5:""2r)   Nr   r   re   r   r(  r)  c                 0    | j                  ||||||      S )a;  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   re   r   r(  r)  )rD  )rA   r   r   re   r   r(  r)  r   s           r'   r   zCLIPSegTextModel.forward  s,    4 )%/!5#  
 	
r)   r@  )rI   rJ   rK   r   rN   r  _no_split_modulesrk   r   ModulerH  rJ  r   r$   r   r   rE   r   r   r   r   s   @r'   rB  rB    s     02GH0 :bii :;  *..2,0)-,0#' 
<<$& 
 t+ 
 llT)	 

  $; 
 #Tk 
 D[ 
 
+	+ 
  
r)   rB  c                        e Zd Zdef fdZe	 	 	 	 ddej                  dz  dedz  dedz  dedz  dedz  d	e	e
z  fd
       Z xZS )CLIPSegVisionTransformerr]   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rj   rk   r]   rl   r\   r~   r   r   r   pre_layrnormr"  r7  post_layernormr   s      r'   rk   z!CLIPSegVisionTransformer.__init__  sj    &&	1&9LL8M8MN%f- ll9&:O:OPr)   Nr   r   r(  r)  r   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )N)r   )r   r   r(  r)  r   r   r;  )r]   r   r(  r,  r~   rR  r7  rS  r   rR   rS   )
rA   r   r   r(  r)  r   rR   r?  r+  rW   s
             r'   r   z CLIPSegVisionTransformer.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r)   )NNNT)rI   rJ   rK   r   rk   r   r$   rM   r   rE   r   r   r   r   s   @r'   rP  rP    s    Q2 Q  *.,0#'04$
''$.$
  $;$
 #Tk	$

 D[$
 #'+$
 
+	+$
 $
r)   rP  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 	 	 	 ddej                  dz  dedz  d	edz  d
edz  dedz  deez  fd       Z xZS )CLIPSegVisionModelr]   r   )r  c                 d    t         |   |       t        |      | _        | j	                          y r   )rj   rk   rP  vision_modelrE  r|   s     r'   rk   zCLIPSegVisionModel.__init__  s'     4V<r)   r   c                 B    | j                   j                  j                  S r   )rX  r~   ru   rG   s    r'   rH  z'CLIPSegVisionModel.get_input_embeddings  s      ++;;;r)   Nr   r(  r   r)  c                 .    | j                  |||||      S )a+  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r(  r   r)  )rX  )rA   r   r   r(  r   r)  r   s          r'   r   zCLIPSegVisionModel.forward  s,    @   %/!5%=# ! 
 	
r)   )NNNTN)rI   rJ   rK   r   rN   main_input_namer  rk   r   rN  rH  r   r$   rM   r   rE   r   r   r   r   s   @r'   rV  rV    s    $O!2 <bii <  26)-,004#'%
''$.%
  $;%
 #Tk	%

 #'+%
 D[%
 
+	+%
 %
r)   rV  c                       e Zd ZU eed<   def fdZee	 	 ddej                  dej                  dz  dej                  dz  de
e   deez  f
d	              Zee	 dd
ej                  dede
e   deez  fd              Ze	 	 	 	 	 	 	 	 	 ddej$                  dz  d
ej                  dz  dej                  dz  dej$                  dz  dedz  dedz  dedz  dededz  deez  fd       Z xZS )r  r]   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  |_	        |j                  |_	        |j                  | _
        |j                  | _        |j                  | _        t        |      | _        t!        |      | _        t%        j&                  | j                  | j                  d      | _        t%        j&                  | j                  | j                  d      | _        t%        j,                  t/        j0                  | j2                  j4                              | _        | j9                          y )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)rc   )rj   rk   r  text_configr   	TypeErrortypevision_configr   r   projection_dimrl   r  r  r5  rD  rP  rX  r   r   r  r  rp   r$   tensorr]   logit_scale_init_valuelogit_scalerE  )rA   r]   r`  rc  r}   s       r'   rk   zCLIPSegModel.__init__4  ss    &,,.?@++,-Q0 
 &..0CD--./q2 
 ((,,+1+F+F(-3-H-H*$33)55 - 9 90=4]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r)   Nr   r   re   r   r   c                 x     | j                   d|||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   re   r)  rO   )rD  r<  r  )rA   r   r   re   r   text_outputsrW   s          r'   get_text_featureszCLIPSegModel.get_text_featuresX  sV    . 4C4?? 4
)%	4

 4
 %22%)%9%9-%H"r)   r   r   c                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)r   r   r)  rO   )rX  r<  r  )rA   r   r   r   vision_outputsrW   s         r'   get_image_featureszCLIPSegModel.get_image_features{  sU    6 6GT5F5F 6
%%=6
 	6
 '44'+'='=m'L$r)   return_lossr   r(  r)  c
           	         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||	      }| j                  ||||||	      }|d   }| j                  |      }|d   }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }t        j                  ||j                               |z  }|j                         }d}|rt        |      }|	s||||||f}||f|z   S |S t        |||||||	      S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr[  rL  r   rd   rg   T)r   r   keepdim)r2   r3   r4   r5   r6   r7   r8   )r]   r   r(  r,  rX  rD  r  r  normrg  expr$   r   r,   r/   r1   )rA   r   r   r   re   rn  r   r(  r   r)  r   rl  ri  r6   r5   rg  r4   r3   r2   outputs                       r'   r   zCLIPSegModel.forward  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 )%/!5# ' 
 &a(--l;"1o**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO &&**,,,{LNN4DES*,,.0D&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r)   )NNr   )	NNNNNNNTN)rI   rJ   rK   r   rN   rk   r   r   r$   r   r   r   rE   r   rj  rM   r   rm  r   r1   r   r   r   s   @r'   r  r  0  s   "} "H  /3,0	<< t+ llT)	
 +, 
+	+  B  *."''" #'" +,	"
 
+	+"  "H  .215.204#')-,0)-#'^
##d*^
 ''$.^
 t+	^

 &&-^
 D[^
  $;^
 #Tk^
 #'^
 D[^
 
	^
 ^
r)   r  c                        e Zd ZdZdef fdZ	 ddej                  dej                  dej                  dedz  d	e	ej                     f
d
Z xZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    r]   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   r   r|   s     r'   rk   zCLIPSegDecoderLayer.__init__
  r   r)   rR   r   r   r   Nr   c                     |}| j                  ||||      \  }}||z   }| j                  |      }|}| j                  |      }||z   }| j                  |      }|f}|r||fz  }|S r   )r   r   r   r   r   s           r'   r   zCLIPSegDecoderLayer.forward  s    " !&*nn')"7/	 '5 '
#| !=0((7 / =0((7 "&Gr)   r  )rI   rJ   rK   rL   r   rk   r$   r   r   rE   rM   r   r   r   s   @r'   ru  ru    sk    S} S */'||' '  %||	'
  $;' 
u  	!'r)   ru  c                        e Zd Zdef fdZ	 	 	 d
deej                     dej                  dedz  dedz  dedz  f
d	Z	 xZ
S )CLIPSegDecoderr]   c                 &   t         |   |       |j                  | _        t        j                  |j
                  |j                        | _        t        j                  |j
                  |j                        | _        |j                  r|j                  j                  dz  |j                  j                  dz  f}t        j                  t        j                  |j                  |j                  dd      t        j                         t        j                  |j                  |j                  dz  |d   |d         t        j                         t        j                  |j                  dz  d|d   |d               | _        nPt        j                  |j                  d|j                  j                  |j                  j                        | _        t#        |j$                        }t        j&                  t)        |      D cg c]6  }t        j                  |j                  j*                  |j                        8 c}      | _        t/        j0                  |j                        }|j                  |_        |j2                  |_        |j6                  |_        d	|_        t        j&                  t)        t#        |j$                              D cg c]  }t=        |       c}      | _        | jA                          y c c}w c c}w )
N   r   r   )ra   paddingrd   r   )ra   rb   )rb   relu)!rj   rk   conditional_layerr   r   rd  
reduce_dimfilm_mulfilm_add"use_complex_transposed_convolutionrc  ro   
Sequentialrs   ReLUConvTranspose2dtransposed_convolutionr&   extract_layersr$  r%  rl   reducescopydeepcopydecoder_num_attention_headsr   decoder_intermediate_sizer   r   ru  r&  rE  )rA   r]   transposed_kernelsdepthr   decoder_configr}   s         r'   rk   zCLIPSegDecoder.__init__=  s[    !'!9!9		&"7"79J9JK		&"7"79J9JK44"("6"6"A"AQ"FH\H\HgHgklHl!m*,--		&++V->->AWXY	""%%%%* 21 5-a0	 	""%%*A;Ma;PYklmYn+D' +-*<*<!!1f&:&:&E&EfNbNbNmNm+D' F))*}}UZ[`UabPQRYYv++779J9JKb
 v';';<%+%6%6"-3-O-O*+1+K+K($*!mmRWX[\b\q\qXrRs$tQ%8%H$tu c %us   ;L	LNrR   rV   r   r(  r)  c                 @   |rdnd }|rdnd }|d d d   }	d }
t        t        |	| j                  | j                              D ]  \  }\  }}}|
 ||      |
z   }
n ||      }
|| j                  k(  rJ| j                  |      |
j                  ddd      z  | j                  |      z   }
|
j                  ddd      }
 ||
d d |      }|d   }
|r||
fz  }|s||d   fz  } |
d d dd d d f   j                  ddd      }
t        t        j                  |
j                  d               }|j                  d   }|
j                  ||
j                  d   ||      }
| j                  |
      j                  d      }|st        d |||fD              S t!        |||      S )	NrO   rg   r   r   rd   )r   r   r   c              3   &   K   | ]	  }||  y wr   rO   )r?   vs     r'   rB   z)CLIPSegDecoder.forward.<locals>.<genexpr>  s     aqSTS`as   )r   rR   rS   )r-  zipr&  r  r~  r  r   r  r   mathsqrtr   r   r  squeezerE   rQ   )rA   rR   rV   r   r(  r)  r   all_hidden_statesr/  activationsrs  i
activationlayerreducer2  r   r   r   s                      r'   r   zCLIPSegDecoder.forwardi  s    #7BD0d#DbD).7KVZVbVb8c.d 	6*A*
E6!
+f4
+D***'=>PQSTVWAXX[_[h[h*\   1a0!t4[lM #1%F#!fY.! =#3"55-	60 12q!))!Q2499V\\!_-.+11!4
Za$E,,V4<<Q?aV->$Oaaa#+%
 	
r)   )NNT)rI   rJ   rK   r   rk   rE   r$   r   r   r   r   r   s   @r'   ry  ry  <  si    *} *` *.,0#'7
U\\*7
 !&7
  $;	7

 #Tk7
 D[7
r)   ry  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )custom_introc                       e Zd ZU eed<   def fdZ	 	 	 	 	 ddedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  f
d	Z	e
	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dededz  deez  fd       Z xZS )CLIPSegForImageSegmentationr]   c                     t         |   |       || _        t        |      | _        |j
                  | _        t        |      | _        | j                          y r   )	rj   rk   r]   r  r  r  ry  decoderrE  r|   s     r'   rk   z$CLIPSegForImageSegmentation.__init__  sI      (	$33%f- 	r)   Nr   r   r   re   conditional_pixel_valuesc                    |`t        |      |k7  rt        d      t        j                         5  | j                  j                  |||      j                  }d d d        |S |]t        |      |k7  rt        d      t        j                         5  | j                  j                  |      j                  }d d d        |S t        d      # 1 sw Y   S xY w# 1 sw Y   S xY w)Nz@Make sure to pass as many prompt texts as there are query images)r   re   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r&   r   r$   r   r  rj  r<  rm  )rA   r   r   r   re   r  rV   s          r'   get_conditional_embeddingsz6CLIPSegForImageSegmentation.get_conditional_embeddings  s      9~+ !cdd  )-)D)Dn< *E *- '  &% &1+,
: !dee n)-)E)EF^)_)m)m&n &%	 m   &%n &%s   )C&CCC$r   rV   labelsr   r(  r   r)  r   c                 "   ||n| j                   j                  }t        j                         5  | j                  j                  ||d|
|      }| j                  j                  |d         }|r|j                  n|d   }| j                  D cg c]
  }||dz       }}|r<t        |j                  |j                  |	r|j                  nd|j                        }n|	s|dd |dd z   n|}ddd       |$| j                  |j                  d   ||||	      }n[|j                  d   |j                  d   k7  rt        d
      |j                  d   | j                   j                   k7  rt        d      | j#                  |||	|      }|r|j$                  n|d   }d}|8|j'                  |j(                        }t+        j,                         } |||      }|s|||f}||f|z   S |S t/        ||||      S c c}w # 1 sw Y   xY w)a~  
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTr[  r   rd   r;  r   r   )r   r   r   re   r  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r(  r)  )r2   r   rV   rW   r8   rX   )r]   r,  r$   r   r  rX  r  rR   r  r   r+  r<  rS   r  r   r   rd  r  r   r   r!   r   BCEWithLogitsLossrU   )rA   r   r   r  rV   r   re   r  r   r(  r   r)  r   rl  rW   rR   r  r  decoder_outputsr   r2   loss_fnrs  s                          r'   r   z#CLIPSegForImageSegmentation.forward  sn   d &1%<k$++B]B] ]]_ 	!YY33)"3%))A' 4 N !II77q8IJM<GN88^\]M^M9=9L9LMA=Q/MKM !;&4&F&F"0">">BV.">">\`-88	" DXN2A&);;]k /	8 ")%)%D%D'--a0#-))A &E &" &++A.,2D2DQ2GG m  &++A.$++2L2LL 0  ,,"/!5# ' 
 ,7''OA<NYYv}}-F**,G66*D4m^UdeF)-)9TGf$EvE-#9' .*
 	
q N	 	s   A HG?AH?HHr3  )NNNNNNNNNTN)rI   rJ   rK   r   rN   rk   r   r$   r   r  r   rM   r   r   rE   r1   r   r   r   s   @r'   r  r    s    }  "&)-.2,08<&$J& <<$&& t+	&
 llT)& #(,,"5&:  /315=A;?.204*.)-,0)-#'
$$t+
 ''$.
 #("3"3d":	

 !& 1 1D 8
 t+
 &&-
   4'
  $;
 #Tk
 #'
 D[
 
	
 
r)   r  )r  r  rB  rV  r  )r   )FrL   r  r  collections.abcr   dataclassesr   typingr   r$   r    r   r  r  r	   modeling_attn_mask_utilsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   configuration_clipsegr   r   r   
get_loggerrI   loggerr   r(   r/   r1   rQ   rU   rN  r\   r   floatr   r   r   r   r  r"  r5  rB  rP  rV  r  ru  ry  r  __all__rO   r)   r'   <module>r     s      $ !    & ! d 9 K F & j j 9 X X 
		H	%
`U\\ `ell `
-U\\ -ell -  
K  
   
F 7; 7  7 
[ 
  
<Pbii Ph%BII %` %II%<<% 
% <<	%
 LL4'% % %.G)ryy G)V  /4 /d 0%_ 0% 0%hS
RYY S
lX
RYY X
v3
- 3
l1
ryy 1
h4
/ 4
n O
) O
 O
d6")) 6rd
+ d
N 
m
"8 m

m
`r)   