
    i$                        d Z ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ d
dlmZmZ d
dlmZmZmZ d
dlmZmZmZmZmZmZmZmZ  G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z(e G d  d!e             Z) ed"#       G d$ d%e             Z* G d& d'e      Z+g d(Z,y))zPyTorch SAM 2 model.    N   )initialization)PreTrainedConfig)PreTrainedModel)Unpack)auto_docstring)TransformersKwargscheck_model_inputs   )CONFIG_MAPPING
AutoConfig)
Sam2ConfigSam2MaskDecoderConfigSam2PromptEncoderConfig)Sam2AttentionSam2FeedForwardSam2LayerNorm	Sam2ModelSam2PreTrainedModelSam2TwoWayAttentionBlockSam2VisionEncoderOutputSam2VisionModelc                   L     e Zd ZdZdZdZdeiZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZ	S )EdgeTamVisionConfiga	  
    This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
    [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `timm/repvit_m1.dist_in1k`):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    vision_configedgetam_vision_modelbackbone_configc                    |g dn|}|ddgddgddggn|}|ddgn|}t        |t              r'|j                  dd      |d<   t        |d      di |}n|t	        j
                  d	dd
g dd      }|| _        || _        || _        || _	        || _
        || _        || _        || _        |	| _        |
| _        || _        || _        t%        | L  di | y )N)i     `   0         @   r   r   
model_typetimm_wrapperztimm/repvit_m1.dist_in1kT)r      r   r   )in_chansfeatures_onlyout_indices)
model_args )
isinstancedictgetr   r   from_pretrainedr   backbone_channel_listbackbone_feature_sizesfpn_hidden_sizefpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levelsnum_feature_levels
hidden_actlayer_norm_epsinitializer_rangesuper__init__)selfr   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   kwargs	__class__s                 u/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/edgetam/modular_edgetam.pyr=   zEdgeTamVisionConfig.__init__U   s     7L6S 2Yn2H2Pc3Z#sb"X.Vl 	 )<(Cq!fI\ot,,;,?,?n,]OL),_\-JK^o^O$(88*()DQ]^O
  / &;"&<#..$&#6 "4$,!2"6"    )NNNr"   r'   r'   r   Nr   gelugư>g{Gz?)
__name__
__module____qualname____doc__base_config_keyr%   r   sub_configsr=   __classcell__)r@   s   @rA   r   r   (   sQ    $L &O'J:K "# .# .#rB   r   c                       e Zd Zy)EdgeTamPromptEncoderConfigNrD   rE   rF   r,   rB   rA   rL   rL          rB   rL   c                       e Zd Zy)EdgeTamMaskDecoderConfigNrM   r,   rB   rA   rP   rP      rN   rB   rP   c                       e Zd Zy)EdgeTamConfigNrM   r,   rB   rA   rR   rR      rN   rB   rR   c                       e Zd Zy)EdgeTamLayerNormNrM   r,   rB   rA   rT   rT      rN   rB   rT   c                       e Zd Zy)EdgeTamVisionEncoderOutputNrM   r,   rB   rA   rV   rV      rN   rB   rV   c                       e Zd Zy)EdgeTamAttentionNrM   r,   rB   rA   rX   rX      rN   rB   rX   c                       e Zd Zy)EdgeTamTwoWayAttentionBlockNrM   r,   rB   rA   rZ   rZ      rN   rB   rZ   c                       e Zd Zy)EdgeTamFeedForwardNrM   r,   rB   rA   r\   r\      rN   rB   r\   c                   :    e Zd Z ej                         d        Zy)EdgeTamPreTrainedModelc                    t        j                  | |       t        |t              r-|j                   t        j                  |j                         y y t        |d      r,t        j                  |j                  |j                         y y )Npositional_embedding)std)r   _init_weightsr-   EdgeTamModelno_memory_embeddinginitzeros_hasattrnormal_r`   scale)r>   modules     rA   rb   z$EdgeTamPreTrainedModel._init_weights   sg    %%dF3fl+))5F667 6V34LL44&,,G 5rB   N)rD   rE   rF   torchno_gradrb   r,   rB   rA   r^   r^      s    U]]_H HrB   r^   zN
    The vision model from EdgeTAM without any head or projection on top.
    )custom_introc            	       f    e Zd ZeZdZi Zd Ze	 dde	j                  dz  dee   deez  fd       Zy)EdgeTamVisionModelpixel_valuesc                     t        d      Nz2Can't get input embeddings from timm wrapper modelNotImplementedErrorr>   s    rA   get_input_embeddingsz'EdgeTamVisionModel.get_input_embeddings       !"VWWrB   Nr?   returnc           	      ^   |t        d       | j                  |fi |}|j                  }|D cg c]  }|j                  dddd       }}| j	                  |      \  }}|| j
                   d  d d d   }|| j
                   d  d d d   }t        |d   |||j                        S c c}w )Nz You have to specify pixel_valuesr   r   r   r'   )last_hidden_statefpn_hidden_statesfpn_position_encodinghidden_states)
ValueErrorbackboner{   permuteneckr8   rV   r~   )r>   rp   r?   backbone_outputintermediate_hidden_stateshidden_stater|   r}   s           rA   forwardzEdgeTamVisionModel.forward   s     ?@@ ($--??%4%F%F"[u%v<l&:&:1aA&F%v"%v3799=W3X00-t/F/F.F.HI$B$O 5t7N7N6N6P QRVTVRV W)8</"7)77	
 	
 &ws   B*)N)rD   rE   rF   r   config_classmain_input_name_can_record_outputsrv   r
   rk   FloatTensorr   r	   tuplerV   r   r,   rB   rA   ro   ro      sh     'L$O X  26
''$.
 +,
 
+	+	
 
rB   ro   c                       e Zd Zg dZd Zy)rc   )z
^memory_.*z^mask_downsample.*zspatial_perceiver.*z^object_pointer_proj.*z0^temporal_positional_encoding_projection_layer.*no_memory_positional_encodingno_object_pointer%occlusion_spatial_embedding_parameterc                     t        d      rr   rs   ru   s    rA   rv   z!EdgeTamModel.get_input_embeddings   rw   rB   N)rD   rE   rF   "_keys_to_ignore_on_load_unexpectedrv   r,   rB   rA   rc   rc      s    	*&XrB   rc   )rc   ro   r^   rR   r   rL   rP   )-rG   rk    r   re   configuration_utilsr   modeling_utilsr   processing_utilsr   utilsr   utils.genericr	   r
   autor   r   sam2.configuration_sam2r   r   r   sam2.modeling_sam2r   r   r   r   r   r   r   r   r   rL   rP   rR   rT   rV   rX   rZ   r\   r^   ro   rc   __all__r,   rB   rA   <module>r      s
     & 3 - & D - ` `	 	 	[#* [#|	!8 		4 		J 		} 		!8 		} 		": 		 	 H0 H H 
"
 "

"
JX9 X rB   