
    iiR                     |   d dl Zd dl mZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ  G d dej>                        Z  G d dej>                        Z!	 	 d9dej>                  dejD                  dejD                  dejD                  dejD                  dz  de#dz  de#dee   fdZ$ G d dej>                        Z% G d d ej>                        Z& G d! d"ej>                        Z'd:d#ejD                  d$e#d%e(d&ejD                  fd'Z) G d( d)ej>                        Z* G d* d+ej>                        Z+ G d, d-e      Z, G d. d/ej>                        Z-e G d0 d1e             Z.e G d2 d3e.             Z/ ed45       G d6 d7ee.             Z0g d8Z1y);    N)Callable)nn   )initialization)ACT2FN)BackboneMixin)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring
is_tracing)check_model_inputs   )PixioConfigc                   f     e Zd ZdZdef fdZddej                  dedej                  fdZ	 xZ
S )	PixioPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    configc                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfr   r   r   r    r!   r&   	__class__s          r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/pixio/modeling_pixio.pyr   zPixioPatchEmbeddings.__init__.   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hi    pixel_valuesinterpolate_pos_encodingreturnc                    |j                   \  }}}}|| j                  k7  rt        d| j                   d| d      |sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d		      | j	                  |      j                  d
      j                  dd
      }|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).   )shaper    
ValueErrorr   r(   flatten	transpose)r)   r-   r.   
batch_sizer    heightwidth
embeddingss           r+   forwardzPixioPatchEmbeddings.forward=   s    2>2D2D/
L&%4,,,!../yaI  (++u8J/J (% 9+,Adooa.@-AE  __\2::1=GG1M
r,   F)__name__
__module____qualname____doc__r   r   torchTensorboolr<   __classcell__r*   s   @r+   r   r   '   s;    j{ jELL D ]b]i]i r,   r   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z	d
ej                  dej                  fdZ
 xZS )PixioEmbeddingszB
    Construct the CLS tokens, position and patch embeddings.
    r   r/   Nc                 (   t         |           t        j                  t	        j
                  d|j                  |j                              | _        d | _	        t        |      | _        | j                  j                  }t        j                  t	        j
                  d||j                  z   |j                              | _        t        j                  |j                        | _        |j                  | _        |j"                  | _        || _        y )Nr   )r   r   r   	ParameterrB   randnn_cls_tokensr!   	cls_token
mask_tokenr   patch_embeddingsr&   position_embeddingsDropouthidden_dropout_probdropoutr   r   )r)   r   r&   r*   s      r+   r   zPixioEmbeddings.__init__S   s    ekk!V5H5H&J\J\&]^ 4V <++77#%<<A{VM`M`?`bhbtbt0u#v zz&"<"<="// ++r,   r;   r9   r:   c                 @   |j                   d   | j                  z
  }| j                  j                   d   | j                  z
  }t               s||k(  r||k(  r| j                  S | j                  ddd| j                  f   }| j                  dd| j                  df   }|j                   d   }|| j                  z  }	|| j                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }|j                  }t        j                  j                  |j                  t        j                        |	|
fdd	
      j                  |      }|j                  dddd      j                  dd|      }t        j                   ||fd      S )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r   r3   bicubicF)sizemodealign_cornersdtypedim)r4   rL   rP   r   r   intreshapepermuter[   r   
functionalinterpolatetorB   float32viewcat)r)   r;   r9   r:   r&   num_positionsclass_pos_embedpatch_pos_embedr]   
new_height	new_widthsqrt_num_positionstarget_dtypes                r+   r.   z(PixioEmbeddings.interpolate_pos_encoding`   s    !&&q)D,=,==0066q9D<M<MM|} <5+++2216I8I8I6I3IJ221d6G6G6I3IJr"t.
T__,	 !34)11!5GI[]`a)11!Q1=&,,--33u}}-i(	 4 

 "<"
  	 *11!Q1=BB1b#Nyy/?;CCr,   r-   c                 x   |j                   \  }}}}| j                  j                  j                  j                  }| j                  |j                  |            }| j                  j                  |dd      }t        j                  ||fd      }|| j                  |||      z   }| j                  |      }|S )NrZ   rU   r   r\   )r4   rO   r(   weightr[   rc   rM   expandrB   rf   r.   rS   )	r)   r-   r8   _r9   r:   rm   r;   
cls_tokenss	            r+   r<   zPixioEmbeddings.forward   s    '3'9'9$
Avu,,77>>DD**<???+NO
^^**:r2>
YY
J7Q?
$"?"?
FTY"ZZ
\\*-
r,   )r>   r?   r@   rA   r   r   rB   rC   r^   r.   r<   rE   rF   s   @r+   rH   rH   N   si    { t $D5<< $D $DUX $D]b]i]i $DLELL U\\ r,   rH   modulequerykeyvalueattention_maskscalingrS   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|#|d d d d d d d |j                  d   f   }||z   }t
        j                  j                  |d      }t
        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )	NrU         r3   r   r\   )ptrainingr   )rW   rB   matmulr7   r4   r   ra   softmaxrS   r~   
contiguous)
rs   rt   ru   rv   rw   rx   rS   ry   attn_weightsattn_outputs
             r+   eager_attention_forwardr      s     **R.D( <<s}}Q':;gEL!'1a399R=(@A#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r,   c                   z     e Zd Zdef fdZdej                  deej                  ej                  f   fdZ xZ	S )PixioSelfAttentionr   c                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r1   r{   Fbias)r   r   r!   num_attention_headshasattrr5   r   r^   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probrx   	is_causalr   Linearqkv_biasrt   ru   rv   r)   r   r*   s     r+   r   zPixioSelfAttention.__init__   sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r,   hidden_statesr/   c           
         |j                   d   }|d| j                  | j                  f} | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      }t        j                  | j                  j                  t              } || |||d | j                  | j                  | j                  sdn| j                         \  }}	|j#                         d d | j$                  fz   }
|j'                  |
      }||	fS )Nr   rU   r   r3           )r   rx   rS   r|   )r4   r   r   ru   re   r7   rv   rt   r   get_interfacer   _attn_implementationr   r   rx   r~   r   rW   r   r_   )r)   r   r8   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes              r+   r<   zPixioSelfAttention.forward   sF   "((+
D$<$<d>V>VV	0DHH]+00)<FFq!L	4djj/44i@JJ1aP4djj/44i@JJ1aP(?(M(MKK,,.E)
 *=nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EFo--r,   )
r>   r?   r@   r   r   rB   rC   tupler<   rE   rF   s   @r+   r   r      s:    ]{ ](.U\\ .eELL%,,<V6W .r,   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )PixioSelfOutputz
    The residual connection is defined in PixioLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	r   r   r   r   r!   denserQ   rR   rS   r   s     r+   r   zPixioSelfOutput.__init__   sB    YYv1163E3EF
zz&"<"<=r,   r   input_tensorr/   c                 J    | j                  |      }| j                  |      }|S r   )r   rS   )r)   r   r   s      r+   r<   zPixioSelfOutput.forward   s$    

=1]3r,   )
r>   r?   r@   rA   r   r   rB   rC   r<   rE   rF   s   @r+   r   r      s=    
>{ >
U\\  RWR^R^ r,   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )PixioAttentionr   c                 b    t         |           t        |      | _        t	        |      | _        y r   )r   r   r   	attentionr   outputr   s     r+   r   zPixioAttention.__init__   s&    +F3%f-r,   r   r/   c                 R    | j                  |      \  }}| j                  ||      }|S r   )r   r   )r)   r   self_attn_outputrq   r   s        r+   r<   zPixioAttention.forward   s,    "nn];!-}=r,   	r>   r?   r@   r   r   rB   rC   r<   rE   rF   s   @r+   r   r      s*    .{ .
U\\ ell r,   r   input	drop_probr~   r/   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   )r   )r[   device)r4   ndimrB   randr[   r   floor_div)r   r   r~   	keep_probr4   random_tensorr   s          r+   	drop_pathr     s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr,   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
PixioDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r/   c                 0    t         |           || _        y r   )r   r   r   )r)   r   r*   s     r+   r   zPixioDropPath.__init__  s    "r,   r   c                 D    t        || j                  | j                        S r   )r   r   r~   )r)   r   s     r+   r<   zPixioDropPath.forward  s    FFr,   c                      d| j                    S )Nzp=)r   r)   s    r+   
extra_reprzPixioDropPath.extra_repr  s    DNN#$$r,   r   )r>   r?   r@   rA   floatr   rB   rC   r<   strr   rE   rF   s   @r+   r   r     sG    b#%$, #$ #GU\\ Gell G%C %r,   r   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )PixioMLPr/   c                 ~   t         |           |j                  x}}t        |j                  |j                  z        }t        j                  ||d      | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  ||d      | _        y )NTr   )r   r   r!   r^   	mlp_ratior   r   fc1r"   
hidden_actr   r   
activationfc2)r)   r   in_featuresout_featureshidden_featuresr*   s        r+   r   zPixioMLP.__init__  s    %+%7%77lf0063C3CCD99[/Ef''-$V%6%67DO$//DO99_lFr,   hidden_statec                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r)   r   s     r+   r<   zPixioMLP.forward*  s2    xx-|4xx-r,   )r/   N)r>   r?   r@   r   rB   rC   r<   rE   rF   s   @r+   r   r     s$    	GELL U\\ r,   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )
PixioLayerr   r/   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |      | _        |j                  dkD  rt        |j                        nt        j                         | _        t        j                  |j                  |j
                        | _        t        |      | _        y )Nepsr   )r   r   r   	LayerNormr!   layer_norm_epsnorm1r   r   drop_path_rater   Identityr   norm2r   mlpr   s     r+   r   zPixioLayer.__init__2  s    \\&"4"4&:O:OP
'/AGAVAVY\A\v'<'<=bdbmbmbo\\&"4"4&:O:OP
F#r,   r   c                     | j                  |      }| j                  |      }| j                  |      |z   }| j                  |      }| j	                  |      }| j                  |      |z   }|S r   )r   r   r   r   r   )r)   r   hidden_states_normself_attention_outputlayer_outputs        r+   r<   zPixioLayer.forward<  sj    !ZZ6 $/A B'<=Mzz-0xx-~~l3mCr,   r   rF   s   @r+   r   r   1  s1    ${ $t $U\\ ell r,   r   c                   N     e Zd Zdef fdZddej                  dedefdZ	 xZ
S )PixioEncoderr   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   r   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingr)   r   rq   r*   s      r+   r   zPixioEncoder.__init__K  sN    ]]fF^F^@_#`1Jv$6#`a
&+# $as   A#r   output_hidden_statesr/   c                     |r|gnd }t        | j                        D ]!  \  }} ||      }|s|j                  |       # t        ||rt	        |            S d       S )N)last_hidden_stater   )	enumerater   appendr   r   )r)   r   r   all_hidden_statesilayer_modules         r+   r<   zPixioEncoder.forwardQ  so    /C]O(4 	8OA|(7M !((7	8
 +6G% 12
 	
MQ
 	
r,   r=   )r>   r?   r@   r   r   rB   rC   rD   r   r<   rE   rF   s   @r+   r   r   J  s.    ,{ ,

U\\ 

 

Zi 

r,   r   c                       e Zd ZU eed<   dZdZdZdZddgZ	dZ
dZdZdZeedZ ej$                         d	ej(                  ej*                  z  ej,                  z  fd
       Zy)PixioPreTrainedModelr   pixior-   )imageTrH   r   )r   
attentionsrs   c                 "   t        |t        j                  t        j                  f      rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyt        |t        j                        r?t	        j                  |j                         t	        j                  |j                         yt        |t              rt	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         |j                    t	        j                  |j                          yyy)zInitialize the weightsr   )meanstdN)r"   r   r   r'   inittrunc_normal_ro   r   initializer_ranger   zeros_r   ones_rH   rP   rM   rN   )r)   rs   s     r+   _init_weightsz"PixioPreTrainedModel._init_weightso  s     fryy"))45v}}3DKK<Y<YZ{{&FKK( '-KK$JJv}}%0v99IfIfgv//ct{{?\?\]  ,F--. - 1r,   N)r>   r?   r@   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsrB   no_gradr   r   r'   r   r   r,   r+   r   r   ^  s    $O!&*#*L9N"&#(
 U]]_/BII		$9BLL$H / /r,   r   c            	            e Zd Zdef fdZdefdZ ed      e	 	 dde	j                  dz  d	edz  defd
              Z xZS )
PixioModelr   c                     t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr   )r   r   r   rH   r;   r   encoderr   r   r!   r   	layernorm	post_initr   s     r+   r   zPixioModel.__init__  sW     )&1#F+f&8&8f>S>STr,   r/   c                 .    | j                   j                  S r   r;   rO   r   s    r+   get_input_embeddingszPixioModel.get_input_embeddings      ///r,   F)tie_last_hidden_statesNr-   r   c                 b   || j                   j                  }|t        d      | j                  |      }| j	                  ||      }|j
                  }| j                  |      }|d d d | j                  j                  d d f   j                  d      }t        |||j                        S )Nz You have to specify pixel_valuesr   r   r\   )r   pooler_outputr   )r   r   r5   r;   r  r   r  rL   r   r   r   )r)   r-   r   ry   embedding_outputencoder_outputssequence_outputpooled_outputs           r+   r<   zPixioModel.forward  s      '#';;#C#C ?@@??<8+/<<8H_s<+t);;..9'+IT__-I-I+I1(LMRRWXRY)-')77
 	
r,   )NN)r>   r?   r@   r   r   r   r  r   r   rB   rC   rD   r   r<   rE   rF   s   @r+   r  r    sn    	{ 	0&: 0 u5 -1,0
llT)
 #Tk

 
$
  6
r,   r  zN
    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
    )custom_introc            	       p     e Zd Z fdZdefdZee	 ddej                  de
dz  defd              Z xZS )	PixioBackbonec                 X   t         |   |       t        |j                  dz         D cg c]  }|j                   c}| _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        | j                          y c c}w )Nr   r   )r   r   r   r   r!   num_featuresrH   r;   r   r  r   r   r   r  r  r   s      r+   r   zPixioBackbone.__init__  s     9>v?W?WZ[?[9\]AV//])&1#F+f&8&8f>S>ST 	 ^s   B'r/   c                 .    | j                   j                  S r   r  r   s    r+   r  z"PixioBackbone.get_input_embeddings  r  r,   Nr-   r   c                    || j                   j                  }| j                  |      }| j                  |d      }|j                  }g }t        | j                  |      D ]  \  }}	|| j                  v s| j                   j                  r| j                  |	      }	| j                   j                  r|	dd| j                  j                  df   }	|j                  \  }
}}}| j                   j                  }|	j                  |
||z  ||z  d      }	|	j                  dddd      j!                         }	|j#                  |	        t%        t'        |      |r|	      S d	      S )
aw  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 1280, 16, 16]
        ```NTr  rU   r   r   r   r3   )feature_mapsr   )r   r   r;   r  r   zipstage_namesr   apply_layernormr  reshape_hidden_statesrL   r4   r   r_   r`   r   r   r
   r   )r)   r-   r   ry   r  r   r   r(  stager   r8   rq   r9   r:   r   s                  r+   r<   zPixioBackbone.forward  s\   >  '#';;#C#C ??<8"&,,/?VZ,"[,,#&t'7'7#G 
	2E<)));;..#'>>,#?L;;44#/4??3O3O3Q0Q#RL3?3E3E0J65!%!7!7J#/#7#7
FjDXZ_cmZmoq#rL#/#7#71a#C#N#N#PL##L1
	2 |,+?-
 	
EI
 	
r,   r   )r>   r?   r@   r   r   r  r   r   rB   rC   rD   r
   r<   rE   rF   s   @r+   r#  r#    sT    
0&: 0 NR4
!LL4
@Dt4
	4
  4
r,   r#  )r  r   r#  )Nr   )r   F)2collections.abcr#   r   rB   r    r   r   activationsr   backbone_utilsr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   configuration_pixior   Moduler   rH   rC   r   r   r   r   r   rD   r   r   r   r   r   r   r  r#  __all__r  r,   r+   <module>r;     s  *  $   & ! + 9 [ [ F & C C / ,$299 $NDbii DZ !%II%<<% 
% <<	%
 LL4'% T\% % '(%:/. /.dbii "	RYY 	U\\ e T V[VbVb %BII %ryy &+ 2
299 
( /? / /B (
% (
 (
V 
F
M#7 F

F
R Br,   