
    i@                        d Z ddlZddlmZ ddlmZ ddlmZmZmZ ddl	m
Z
mZmZ ddlmZ d	d
lmZ d	dlmZmZmZ d	dlmZmZmZ  ej2                  e      Z G d de      Z G d de      Z G d dej<                        Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d dej<                        Z$ G d de      Z%e
 G d d e%             Z& e
d!"       G d# d$e             Z'g d%Z(y)&zPyTorch Pixio model.    N)nn   )GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPooling)auto_docstring
is_tracinglogging)check_model_inputs   )Dinov2Config)Dinov2BackboneDinov2DropPath	Dinov2MLP)ViTAttentionViTPatchEmbeddingsViTPreTrainedModelc                   N     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )PixioConfiga  
    This is the configuration class to store the configuration of a [`PixioModel`]. It is used to instantiate a
    Pixio model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the ViT
    [facebook/pixio-huge](https://huggingface.co/facebook/pixio-huge) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        mlp_ratio (`int`, *optional*, defaults to 4):
            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
        n_cls_tokens (`int`, *optional*, defaults to 8):
            Number of class tokens in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 256):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys and values.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate per sample (when applied in the main path of residual layers).
        out_features (`list[str]`, *optional*):
            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
            same order as defined in the `stage_names` attribute.
        out_indices (`list[int]`, *optional*):
            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
            If unset and `out_features` is unset, will default to the last stage. Must be in the
            same order as defined in the `stage_names` attribute.
        apply_layernorm (`bool`, *optional*, defaults to `True`):
            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
            seq_len, hidden_size)`.

    Example:

    ```python
    >>> from transformers import PixioConfig, PixioModel

    >>> # Initializing a Pixio pixio-huge style configuration
    >>> configuration = PixioConfig()

    >>> # Initializing a model (with random weights) from the pixio-huge style configuration
    >>> model = PixioModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```pixioc                 ^    t         |   ||||||||	|
|||||||       || _        | `| `| `y )N)hidden_sizenum_hidden_layersnum_attention_heads	mlp_ratio
hidden_acthidden_dropout_probattention_probs_dropout_probinitializer_rangelayer_norm_eps
image_size
patch_sizenum_channelsqkv_biasdrop_path_rateapply_layernormreshape_hidden_states)super__init__n_cls_tokenslayerscale_valueuse_swiglu_ffnuse_mask_token)selfr   r   r   r   r+   r   r   r   r    r!   r"   r#   r$   r%   r&   out_featuresout_indicesr'   r(   kwargs	__class__s                        q/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/pixio/modular_pixio.pyr*   zPixioConfig.__init__o   sc    . 	#/ 3! 3)E/)!!%)+"7! 	 	
& )!    )i                gelu        r;   g{Gz?gư>   r7   r   Tr;   NNTT)__name__
__module____qualname____doc__
model_typer*   __classcell__r3   s   @r4   r   r   #   sV    GR J %(").  . r5   r   c                       e Zd Zy)PixioPatchEmbeddingsNr=   r>   r?    r5   r4   rE   rE          r5   rE   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z	d
ej                  dej                  fdZ
 xZS )PixioEmbeddingszB
    Construct the CLS tokens, position and patch embeddings.
    configreturnNc                 (   t         |           t        j                  t	        j
                  d|j                  |j                              | _        d | _	        t        |      | _        | j                  j                  }t        j                  t	        j
                  d||j                  z   |j                              | _        t        j                  |j                        | _        |j                  | _        |j"                  | _        || _        y )N   )r)   r*   r   	Parametertorchrandnr+   r   	cls_token
mask_tokenrE   patch_embeddingsnum_patchesposition_embeddingsDropoutr   dropoutr#   rK   )r/   rK   rU   r3   s      r4   r*   zPixioEmbeddings.__init__   s    ekk!V5H5H&J\J\&]^ 4V <++77#%<<A{VM`M`?`bhbtbt0u#v zz&"<"<="// ++r5   
embeddingsheightwidthc                 @   |j                   d   | j                  z
  }| j                  j                   d   | j                  z
  }t               s||k(  r||k(  r| j                  S | j                  ddd| j                  f   }| j                  dd| j                  df   }|j                   d   }|| j                  z  }	|| j                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }|j                  }t        j                  j                  |j                  t        j                        |	|
fdd	
      j                  |      }|j                  dddd      j                  dd|      }t        j                   ||fd      S )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        rN   Ng      ?r   r   r   bicubicF)sizemodealign_cornersdtypedim)shaper+   rV   r
   r#   intreshapepermuterc   r   
functionalinterpolatetorP   float32viewcat)r/   rY   rZ   r[   rU   num_positionsclass_pos_embedpatch_pos_embedre   
new_height	new_widthsqrt_num_positionstarget_dtypes                r4   interpolate_pos_encodingz(PixioEmbeddings.interpolate_pos_encoding   s    !&&q)D,=,==0066q9D<M<MM|} <5+++2216I8I8I6I3IJ221d6G6G6I3IJr"t.
T__,	 !34)11!5GI[]`a)11!Q1=&,,--33u}}-i(	 4 

 "<"
  	 *11!Q1=BB1b#Nyy/?;CCr5   pixel_valuesc                 x   |j                   \  }}}}| j                  j                  j                  j                  }| j                  |j                  |            }| j                  j                  |dd      }t        j                  ||fd      }|| j                  |||      z   }| j                  |      }|S )Nrb   r]   rN   rd   )rf   rT   
projectionweightrc   rl   rR   expandrP   ro   rw   rX   )	r/   rx   
batch_size_rZ   r[   rv   rY   
cls_tokenss	            r4   forwardzPixioEmbeddings.forward   s    '3'9'9$
Avu,,77>>DD**<???+NO
^^**:r2>
YY
J7Q?
$"?"?
FTY"ZZ
\\*-
r5   )r=   r>   r?   r@   r   r*   rP   Tensorrg   rw   r   rB   rC   s   @r4   rJ   rJ      si    { t $D5<< $D $DUX $D]b]i]i $DLELL U\\ r5   rJ   c                       e Zd Zy)PixioAttentionNrF   rG   r5   r4   r   r      rH   r5   r   c                       e Zd Zy)PixioDropPathNrF   rG   r5   r4   r   r      rH   r5   r   c                       e Zd Zy)PixioMLPNrF   rG   r5   r4   r   r      rH   r5   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )
PixioLayerrK   rL   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |      | _        |j                  dkD  rt        |j                        nt        j                         | _        t        j                  |j                  |j
                        | _        t        |      | _        y )Nepsr;   )r)   r*   r   	LayerNormr   r!   norm1r   	attentionr&   r   Identity	drop_pathnorm2r   mlpr/   rK   r3   s     r4   r*   zPixioLayer.__init__   s    \\&"4"4&:O:OP
'/AGAVAVY\A\v'<'<=bdbmbmbo\\&"4"4&:O:OP
F#r5   hidden_statesc                     | j                  |      }| j                  |      }| j                  |      |z   }| j                  |      }| j	                  |      }| j                  |      |z   }|S N)r   r   r   r   r   )r/   r   hidden_states_normself_attention_outputlayer_outputs        r4   r   zPixioLayer.forward  sj    !ZZ6 $/A B'<=Mzz-0xx-~~l3mCr5   )	r=   r>   r?   r   r*   rP   r   r   rB   rC   s   @r4   r   r      s1    ${ $t $U\\ ell r5   r   c                   N     e Zd Zdef fdZddej                  dedefdZ	 xZ
S )PixioEncoderrK   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r)   r*   rK   r   
ModuleListranger   r   layergradient_checkpointing)r/   rK   r~   r3   s      r4   r*   zPixioEncoder.__init__  sN    ]]fF^F^@_#`1Jv$6#`a
&+# $as   A#r   output_hidden_statesrL   c                     |r|gnd }t        | j                        D ]!  \  }} ||      }|s|j                  |       # t        ||rt	        |            S d       S )N)last_hidden_stater   )	enumerater   appendr   tuple)r/   r   r   all_hidden_statesilayer_modules         r4   r   zPixioEncoder.forward  so    /C]O(4 	8OA|(7M !((7	8
 +6G% 12
 	
MQ
 	
r5   )F)r=   r>   r?   r   r*   rP   r   boolr   r   rB   rC   s   @r4   r   r     s.    ,{ ,

U\\ 

 

Zi 

r5   r   c                       e Zd Zy)PixioPreTrainedModelNrF   rG   r5   r4   r   r   $  rH   r5   r   c            	            e Zd Zdef fdZdefdZ ed      e	 	 dde	j                  dz  d	edz  defd
              Z xZS )
PixioModelrK   c                     t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr   )r)   r*   rK   rJ   rY   r   encoderr   r   r   r!   	layernorm	post_initr   s     r4   r*   zPixioModel.__init__*  sW     )&1#F+f&8&8f>S>STr5   rL   c                 .    | j                   j                  S r   )rY   rT   )r/   s    r4   get_input_embeddingszPixioModel.get_input_embeddings5  s    ///r5   F)tie_last_hidden_statesNrx   r   c                 b   || j                   j                  }|t        d      | j                  |      }| j	                  ||      }|j
                  }| j                  |      }|d d d | j                  j                  d d f   j                  d      }t        |||j                        S )Nz You have to specify pixel_valuesr   rN   rd   )r   pooler_outputr   )rK   r   
ValueErrorrY   r   r   r   r+   meanr   r   )r/   rx   r   r2   embedding_outputencoder_outputssequence_outputpooled_outputs           r4   r   zPixioModel.forward8  s      '#';;#C#C ?@@??<8+/<<8H_s<+t);;..9'+IT__-I-I+I1(LMRRWXRY)-')77
 	
r5   )NN)r=   r>   r?   r   r*   rE   r   r   r	   rP   r   r   r   r   rB   rC   s   @r4   r   r   (  sn    	{ 	0&: 0 u5 -1,0
llT)
 #Tk

 
$
  6
r5   r   zN
    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
    )custom_introc            	       R    e Zd Zee	 ddej                  dedz  defd              Z	y)PixioBackboneNrx   r   rL   c                    || j                   j                  }| j                  |      }| j                  |d      }|j                  }g }t        | j                  |      D ]  \  }}	|| j                  v s| j                   j                  r| j                  |	      }	| j                   j                  r|	dd| j                  j                  df   }	|j                  \  }
}}}| j                   j                  }|	j                  |
||z  ||z  d      }	|	j                  dddd      j!                         }	|j#                  |	        t%        t'        |      |r|	      S d	      S )
aw  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 1280, 16, 16]
        ```NTr   r]   r   r   rN   r   )feature_mapsr   )rK   r   rY   r   r   zipstage_namesr0   r'   r   r(   r+   rf   r#   rh   ri   
contiguousr   r   r   )r/   rx   r   r2   r   outputr   r   stagehidden_stater}   r~   rZ   r[   r#   s                  r4   r   zPixioBackbone.forwardZ  s\   >  '#';;#C#C ??<8"&,,/?VZ,"[,,#&t'7'7#G 
	2E<)));;..#'>>,#?L;;44#/4??3O3O3Q0Q#RL3?3E3E0J65!%!7!7J#/#7#7
FjDXZ_cmZmoq#rL#/#7#71a#C#N#N#PL##L1
	2 |,+?-
 	
EI
 	
r5   r   )
r=   r>   r?   r   r	   rP   r   r   r   r   rG   r5   r4   r   r   T  sA     NR4
!LL4
@Dt4
	4
  4
r5   r   )r   r   r   r   ))r@   rP   r   modeling_layersr   modeling_outputsr   r   r   utilsr	   r
   r   utils.genericr   dinov2.configuration_dinov2r   dinov2.modeling_dinov2r   r   r   vit.modeling_vitr   r   r   
get_loggerr=   loggerr   rE   ModulerJ   r   r   r   r   r   r   r   r   __all__rG   r5   r4   <module>r      s      9 [ [ 8 8 / 6 
 T S 
		H	%z , z z	- 	Dbii DN	\ 		N 		y 	+ 2
299 
(	- 	 (
% (
 (
V 
7
N 7

7
t Qr5   