
    i                    $   d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)  e!       rddl*m+Z+ ddl,m-Z-  e#j\                  e/      Z0 G d dejb                        Z2	 ddl3m4Z4 e4Z2e0jk                  d        G d dejb                        Z9 G d dejb                        Z: G d dejb                        Z; G d d e      Z< G d! d"ejb                        Z=e  G d# d$e             Z>e  G d% d&e>             Z? G d' d(ejb                        Z@ G d) d*ejb                        ZA G d+ d,ejb                        ZB G d- d.ejb                        ZC G d/ d0ejb                        ZD G d1 d2e      ZE e d34       G d5 d6e>             ZF e d74       G d8 d9e>e             ZGg d:ZHy# e6$ r Y /e7$ r e0jq                  d       Y Gw xY w);zPix2Struct modeling file    N)Union)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torchdynamo_compilinglogging)is_flash_attention_requested   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfig)	BlockMask)make_flex_block_causal_maskc                   &     e Zd Zd fd	Zd Z xZS )Pix2StructLayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      |/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr%   zPix2StructLayerNorm.__init__=   s1     	ll5::k#:; #    c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )N   T)keepdim)tor'   float32powmeanrsqrtr*   r)   dtypefloat16bfloat16)r+   hidden_statesvariances      r/   forwardzPix2StructLayerNorm.forwardE   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r0   )gư>__name__
__module____qualname__r%   r?   __classcell__r.   s   @r/   r"   r"   <   s    $+r0   r"   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )Pix2StructVisionEmbeddingsa-  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    configreturnNc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _
        t        j                  |j                        | _        y N)r$   r%   r   Linearpatch_embed_hidden_sizer,   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr+   rI   r.   s     r/   r%   z#Pix2StructVisionEmbeddings.__init__i   s}     "		&*H*H&J\J\ ]LL9K9KL!||FNNF<N<NOzz&"5"56r0   flattened_patchesc                 "   |d d d d df   j                         }|d d d d df   j                         }|d d d d dd f   }| j                  |      }| j                  |      }| j                  |      }||z   |z   }| j	                  |      }|S )Nr   r   r2   )longrO   rR   rS   rV   )r+   rX   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingss          r/   r?   z"Pix2StructVisionEmbeddings.forwardr   s     (1a0557'1a0557-aABh7**+<=
**;7--k:  .0>A
\\*-
r0   )
rA   rB   rC   __doc__r   r%   r'   Tensorr?   rD   rE   s   @r/   rH   rH   b   s7    7/ 7D 7 %,, r0   rH   c                   ,     e Zd Z fdZ	 	 	 ddZ xZS )Pix2StructVisionAttentionc                 |   t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        | j                  | j                  z  | _	        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        d| _        y NFbias)r$   r%   r,   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutrV   	inner_dimr   rM   querykeyvalueoutputgradient_checkpointingrW   s     r/   r%   z"Pix2StructVisionAttention.__init__   s    !--"(++11//(?(??YYt//eL
99T--t~~EJYYt//eL
ii0@0@uM&+#r0   c                 (    |j                   dd \  } fd} | j                  |            } | j                  |            } | j                  |            }	t	        j
                  ||j                  dd            }
|t	        j                  d j                  ||f|
j                  |
j                        } j                  r j                  rd|_        |j                         dk(  r*||ddddddf   j                  |j                        z   }nw|||j                  |j                        z   }nVt!               sLt	        j"                  |f|j                  |j                        }||j                  |j                        z   }d|z
  }|j%                  |dk(  t	        j&                  |
j                        j(                        }|
|z  }
t	        j*                  |
t	        j,                  t	        j&                  |
j                        j(                              }
t.        j0                  j3                  |
dt        j4                  	      j7                  |
      }t.        j0                  j9                  | j8                   j                  
      }t	        j
                  ||	      }|j                  dd      j;                         j=                  d j>                        } jA                  |      }|f|fz   }|r||fz   }|S )z&
        Self-attention block
        Nr2   c                     | j                         j                  dj                  j                        j	                  dd      S )
projectionr3   r   r2   )
contiguousviewrk   ri   	transpose)states
batch_sizer+   s    r/   to_projection_shapez>Pix2StructVisionAttention.forward.<locals>.to_projection_shape   s<    $$&++JDLL$JaJabllmnpqrrr0   r   r   devicer:   Tr3   )dimr:   ptraining)!shapern   ro   rp   r'   matmulrx   zerosrk   r}   r:   rr   r   requires_gradr~   r5   r   r(   masked_fillfinfominmaxtensorr   
functionalsoftmaxr6   type_asrV   rv   rw   rm   rq   )r+   r=   attention_maskposition_biasoutput_attentions
seq_lengthr{   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsrz   s   `              @r/   r?   z!Pix2StructVisionAttention.forward   s    "/!4!4Ra!8
J	s +4::m+DE )-)@A
*4::m+DE lJ,@,@A,FG !KKDLL*j9&--W]WcWcM **t}}.2+!!#q( -q$a?O0P0S0STaThTh0i i+ -0A0A-BVBV0W W-/!&,]5I5IQ^QdQd" !.0A0A-BVBV0W W-M,88!9KU[[Y_YeYeMfMjMjk&&65<<FLL0I0M0M#NO }},,V5==,QYYZ`a }},,\T\\TXTaTa,bll<> "++Aq1<<>CCJPRTXTbTbckk+..M#33/Gr0   )NNFr@   rE   s   @r/   rc   rc      s    ,$ Gr0   rc   c                   *     e Zd Zdef fdZd Z xZS )Pix2StructVisionMlprI   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y re   r$   r%   r   rM   r,   d_ffwi_0wi_1worT   rU   rV   r   dense_act_fnactrW   s     r/   r%   zPix2StructVisionMlp.__init__       IIf00&++EJ	IIf00&++EJ	))FKK););%Hzz&"5"56&--.r0   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rL   r   r   r   rV   
isinstancer   r)   r'   ra   r:   int8r5   r+   r=   hidden_geluhidden_linears       r/   r?   zPix2StructVisionMlp.forward       hhtyy78		-0#m3]3 tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r0   )rA   rB   rC   r   r%   r?   rD   rE   s   @r/   r   r      s    /5 /r0   r   c                        e Zd Zdeddf fdZ	 	 d	dej                  dej                  dz  dedeej                  ej                  f   eej                     z  fdZ	 xZ
S )
Pix2StructVisionLayerrI   rJ   Nc                 *   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r-   )r$   r%   chunk_size_feed_forwardseq_len_dimrc   	attentionr   mlpr"   r,   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrW   s     r/   r%   zPix2StructVisionLayer.__init__   ss    '-'E'E$26:&v."5f6H6HfNcNc"d(;F<N<NTZTiTi(j%r0   r=   r   r   c                     |}| j                  |      }| j                  |||      }|d   }|dd  }||z   }| j                  |      }| j                  |      |z   }|f|z   }|S )N)r   r   r   r   )r   r   r   r   )	r+   r=   r   r   residualself_attention_outputsattention_outputr   layer_outputs	            r/   r?   zPix2StructVisionLayer.forward  s     ! 55mD!%)/ "0 "

 2!4(, )83 ..}=xx-=/G+r0   NF)rA   rB   rC   r   r%   r'   ra   booltupler?   rD   rE   s   @r/   r   r      sz    k/ kD k /3"'	|| t+  	
 
u||U\\)	*U5<<-@	@r0   r   c                        e Zd Zdeddf fdZ	 	 	 	 ddej                  dej                  dz  deded	edee	z  fd
Z
 xZS )Pix2StructVisionEncoderrI   rJ   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r$   r%   rI   r   
ModuleListrangenum_hidden_layersr   layerrr   )r+   rI   _r.   s      r/   r%   z Pix2StructVisionEncoder.__init__(  sP    ]]5QWQiQiKj#ka$9&$A#kl
&+# $ls   A#r=   r   r   output_hidden_statesreturn_dictc                     |rdnd }|rdnd }t        | j                        D ](  \  }}	|r||fz   } |	|||      }
|
d   }|s ||
d   fz   }* |r||fz   }|st        d |||fD              S t        |||      S )N r   r   c              3   &   K   | ]	  }||  y wrL   r   .0vs     r/   	<genexpr>z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>H  s     mq_`_lms   last_hidden_stater=   
attentions)	enumerater   r   r   )r+   r=   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_outputss              r/   r?   zPix2StructVisionEncoder.forward.  s     #7BD$5b4(4 		POA|#$58H$H!(HYZM)!,M &9]1=M<O&O#		P   1]4D Dm]4EGZ$[mmm++*
 	
r0   )NFFT)rA   rB   rC   r   r%   r'   ra   r   r   r   r?   rD   rE   s   @r/   r   r   '  sv    ,5 ,$ , /3"'%* 
||
 t+
  	

 #
 
 
	 
r0   r   c                   d    e Zd ZU eed<   dZdZed        Z e	j                         d        Zd Zy)Pix2StructPreTrainedModelrI   )imagetextFc                 v    t        j                  t              }t        j                  t              }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r'   r   r   r   )r+   r   
input_maskdummy_inputss       r/   r   z&Pix2StructPreTrainedModel.dummy_inputsW  s6    LL.	\\*-
!*"&0

 r0   c                    | j                   j                  }t        |t              r$t	        j
                  |j                  |dz         yt        |t              r8t        | j                   t              r | j                   j                  j                  n| j                   j                  }t        | j                   t              r | j                   j                  j                  n| j                   j                  }t	        j                  |j                  j                  d||dz  z         t        |j                  d      r?|j                  j                  )t	        j                   |j                  j                         t	        j                  |j"                  j                  d||dz  z         t        |j"                  d      r?|j"                  j                  )t	        j                   |j"                  j                         t	        j                  |j$                  j                  d||dz  z         t        |j$                  d      rA|j$                  j                  *t	        j                   |j$                  j                         yyyt        |t&              rt        | j                   t              r | j                   j                  j                  n| j                   j                  }t        | j                   t              r | j                   j                  j(                  n| j                   j                  }t        | j                   t              r | j                   j                  j*                  n| j                   j*                  }t	        j                  |j,                  j                  d|||z  dz  z         t	        j                  |j.                  j                  d||dz  z         t	        j                  |j0                  j                  d||dz  z         t	        j                  |j2                  j                  d|||z  dz  z         |j4                  r3t	        j                  |j6                  j                  d||dz  z         yyt        |t8        j:                        rt        | j                   t              r | j                   j                  j                  n| j                   j                  }t	        j                  |j                  d||dz  z         |j<                  Et?        |j                  dd      s-t	        j                   |j                  |j<                            yyyt        |t@              rt        | j                   t              r | j                   j                  j                  n| j                   j                  }t	        j                  |jB                  j                  d||dz  z         yt        |t8        jD                  t8        jF                  f      rct	        jH                  |j                  d| j                   jJ                         |j                   t	        j                   |j                         yyt        |t              r-|j                   t	        jL                  |j                         yyt        |t8        j:                        rt	        j                  |j                  d| j                   jJ                         |j<                  Et?        |j                  dd      s-t	        j                   |j                  |j<                            yyyy)	zInitialize the weights      ?        g      )r8   stdrg   N_is_hf_initializedF)'rI   initializer_factorr   r"   init	constant_r)    Pix2StructTextDenseGatedActDenser   text_configr,   r   normal_r   hasattrrg   zeros_r   r   Pix2StructTextAttentionrh   	num_headsrn   ro   rp   rq   has_relative_attention_biasrelative_attention_biasr   rP   padding_idxgetattrPix2StructTextModellm_headrM   Conv2dtrunc_normal_initializer_rangeones_)r+   modulefactorr,   r   ri   rk   s          r/   _init_weightsz'Pix2StructPreTrainedModel._init_weightsb  sE    //f12NN6==&3,7 @A dkk+;< ''33[[,, 
 4>dkkK[3\4;;**//bfbmbmbrbrDLL++#6kVZEZ;[\v{{F+0@0@0LFKK,,-LL++#6kVZEZ;[\v{{F+0@0@0LFKK,,-LL))DT>9RSvyy&)fiinn.HFIINN+ /I) 78 dkk+;< ''33[[,,  1;4;;HX0Y'',,_c_j_j_v_v 
 dkk+;< ''11[[**  LL,,3F{UgGglpFp<qrLL**&KQUDU:VWLL,,3FkSWFW<XYLL--CVRdHdimGm=no11V;;BBRX]hmq\qRrs 2- dkk+;< ''33[[,,  LLSfQU@U6VW!!-gfmmMach6iFMM&*<*<=> 7j- 34 dkk+;< ''33[[,,  LL..SfY]H]>^_BII 67v}}3DKK<Y<YZ{{&FKK( ' 34}}(

6==) )-LLSdkk6S6ST!!-gfmmMach6iFMM&*<*<=> 7j- .r0   c                 8   | j                   j                  }| j                   j                  }|t        d      |j	                  |j
                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |dk(  |       |S )Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information..r3   r   ).r   z1self.model.config.pad_token_id has to be defined.)rI   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r+   r   r  r  shifted_input_idss        r/   _shift_rightz&Pix2StructPreTrainedModel._shift_right  s    !%!C!C{{//!)< 
 &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r0   N)rA   rB   rC   r   __annotations__input_modalities_can_compile_fullgraphpropertyr   r'   no_gradr   r	  r   r0   r/   r   r   P  sJ    ("  U]]_I? I?X!r0   r   c                        e Zd ZU eed<   dZdZdZdgZdef fdZ	d Z
e	 	 	 	 	 ddej                  dz  d	ej                  dz  d
edz  dedz  dedz  deez  fd       Z xZS )Pix2StructVisionModelrI   rX   )r   Tr   c                     t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        | j                          y Nr   )r$   r%   rI   rH   r]   r   encoderr"   r,   r   	layernorm	post_initrW   s     r/   r%   zPix2StructVisionModel.__init__  sU     4V<.v6,V-?-?VEZEZ[ 	r0   c                 .    | j                   j                  S rL   )r]   rO   r+   s    r/   get_input_embeddingsz*Pix2StructVisionModel.get_input_embeddings  s    ///r0   Nr   r   r   r   rJ   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |#|j                  d      dk7  j                         }| j                  |      }| j                  |||||      }|d   }	| j                  |	      }	|s|	f}
|
|dd z   S t        |	|j                  |j                        S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        Nz%You have to specify flattened_patchesr3   r~   r   )r   r   r   r   r   r   )rI   r   r   use_return_dictr  sumfloatr]   r  r  r   r=   r   )r+   rX   r   r   r   r   kwargsembedding_outputencoder_outputssequence_outputhead_outputss              r/   r?   zPix2StructVisionModel.forward  s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$DEE!/333;q@GGIN??+<=,,)/!5# ' 
 *!,..9+-L/!""555-)77&11
 	
r0   )NNNNN)rA   rB   rC   r   r
  main_input_namer  supports_gradient_checkpointing_no_split_modulesr%   r  r   r'   ra   r   r   r   r?   rD   rE   s   @r/   r  r    s    "")O!&*#01
5 
0  26.2)-,0#'H
 <<$.H
 t+H
  $;	H

 #TkH
 D[H
 
+	+H
 H
r0   r  c                   *     e Zd Zdef fdZd Z xZS )r   rI   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y re   r   rW   s     r/   r%   z)Pix2StructTextDenseGatedActDense.__init__*  r   r0   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rL   r   r   s       r/   r?   z(Pix2StructTextDenseGatedActDense.forward2  r   r0   rA   rB   rC   r   r%   r?   rD   rE   s   @r/   r   r   )  s    /3 /r0   r   c                   *     e Zd Zdef fdZd Z xZS )Pix2StructTextLayerFFrI   c                     t         |           t        |      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y r  )r$   r%   r   DenseReluDenser"   r,   layer_norm_epsilon
layer_normr   rT   rU   rV   rW   s     r/   r%   zPix2StructTextLayerFF.__init__G  sK    >vF-f.@.@fF_F_`zz&"5"56r0   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S rL   )r/  r-  rV   )r+   r=   forwarded_statess      r/   r?   zPix2StructTextLayerFF.forwardO  s=    ??=9../?@%5E(FFr0   r)  rE   s   @r/   r+  r+  F  s    73 7r0   r+  c                   b     e Zd Zddededz  f fdZed	d       Zd
dZ	 	 	 	 	 	 	 	 ddZ	 xZ
S )r   NrI   	layer_idxc                    t         |           || _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _	        |j                  | _        | j                  | j                  z  | _        || _        |-t        j                  d| j                   j"                   d       t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        | j                  r/t%        j0                  | j                  | j                        | _        d| _        y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Frf   )r$   r%   r   relative_attention_num_bucketsrelative_attention_max_distancer,   rh   ri   r   rk   rU   rV   rm   r3  loggerwarning_oncer.   rA   r   rM   rn   ro   rp   rq   rP   r   rr   r+   rI   r   r3  r.   s       r/   r%   z Pix2StructTextAttention.__init__W  sg   +F(.4.S.S+/5/U/U,!--"(++''**(?(??"*4>>+B+B*C D, , YYt//1A1AN
99T--t/?/?eLYYt//1A1AN
ii 0 0$2B2BO+++-<<8[8[]a]i]i+jD(&+#r0   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r2   r   )r5   r'   rZ   absr   
zeros_likelogr  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r/   _relative_position_bucketz1Pix2StructTextAttention._relative_position_buckets  s(   . AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r0   c                    | | j                   j                  j                  }|.t        j                  |t        j
                  |      dddf   }n|dddf   j                  |      }t        j                  |t        j
                  |      dddf   }||z
  }| j                  |d| j                  | j                        }| j                  |      }	|	j                  g d      j                  d      }	|	S )z%Compute binned relative position biasN)r:   r}   F)rB  rC  rD  )r2   r   r   r   )r   r)   r}   r'   arangerZ   r5   rI  r5  r6  permute	unsqueeze)
r+   query_length
key_lengthr}   cache_positioncontext_positionmemory_positionrA  relative_position_bucketvaluess
             r/   compute_biasz$Pix2StructTextAttention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+.>>#'#A#A;;==	 $B $
  --.FG	*44Q7r0   c
                    |j                   dd \  }
}|du}| j                  |      }|j                  |
d| j                  | j                        j                  dd      }|Qt        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |
d| j                  | j                        j                  dd      }|j                  |
d| j                  | j                        j                  dd      }|D|s|	nd}	|j%                  ||| j                  d|	i      \  }}|rd|j                  | j                  <   t'        j(                  ||j                  dd            }||j                   d   }||n|	d   dz   }| j*                  sZt'        j,                  d| j                  ||f|j.                  |j0                  	      }| j2                  rE| j4                  r9d|_        n1| j9                  |||j.                  |	
      }|dddd| dddf   }|#|ddddddd|j                   d   f   }||z   }|}||z  }t:        j<                  j?                  |jA                         d      jC                  |      }t:        j<                  jE                  || jD                  | j4                        }t'        j(                  ||      }|j                  dd      jG                         }|j                  |
d| jH                        }| jK                  |      }||f}|r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr2   r3   r   rP  Tr   r|   )r}   rP  r  r   )&r   rn   rw   rk   ri   rx   r   r
   
is_updatedgetr3  cross_attention_cacheself_attention_cachelayerskeysrT  ro   rp   updater'   r   r   r   r}   r:   rr   r   r   rU  r   r   r   r  r   rV   rv   rm   rq   )r+   r=   maskkey_value_statesr   past_key_valuesrN  	use_cacher   rP  rz   r   is_cross_attentionr   rX  curr_past_key_valuescurrent_statesr   r   r   rO  real_seq_lengthcausal_maskr   r   r   r   s                              r/   r?   zPix2StructTextAttention.forward  s   " "/!4!4Ra!8
J .T9zz-0#((RtG^G^_iijkmno &:oGZ+[(3377GJ!'6'L'L$'6'K'K$#2 -?)]/j-44T^^DIIJ/66t~~FMML.1J::n5L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*7It+?+F+Fdnn?OQ_>`,(
L &AEO..t~~> lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>!&&z2t~~Fkk+../Gr0   FN)T       )NN)NNNNNFFN)rA   rB   rC   r   intr%   staticmethodrI  rU  r?   rD   rE   s   @r/   r   r   V  sX    ,3 ,cfimcm ,8 -  - `0 ar0   r   c                   @     e Zd Zddedz  f fdZ	 	 	 	 	 	 ddZ xZS ) Pix2StructTextLayerSelfAttentionNr3  c                     t         |           t        |||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )Nr   r3  r   r$   r%   r   r   r"   r,   r.  r/  r   rT   rU   rV   r9  s       r/   r%   z)Pix2StructTextLayerSelfAttention.__init__   sU    00KW`
 .f.@.@fF_F_`zz&"5"56r0   c           	          | j                  |      }| j                  |||||||      }	|| j                  |	d         z   }|f|	dd  z   }
|
S )N)r_  r   ra  rb  r   rP  r   r   r/  r   rV   )r+   r=   r   r   ra  rb  r   rP  normed_hidden_statesr   r   s              r/   r?   z(Pix2StructTextLayerSelfAttention.forward(  sq      $}=>> '+/) * 
 &5Ea5H(II "%5ab%99r0   rh  )NNNFFNrA   rB   rC   rk  r%   r?   rD   rE   s   @r/   rn  rn    s-    7SSWZ 7 r0   rn  c                   B     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 ddZ xZS )!Pix2StructTextLayerCrossAttentionNr3  c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFrp  r   rq  )r+   rI   r3  r.   s      r/   r%   z*Pix2StructTextLayerCrossAttention.__init__C  sP    0UZfop-f.@.@fF_F_`zz&"5"56r0   c
                     | j                  |      }
| j                  |
||||||||		      }|| j                  |d         z   }|f|dd  z   }|S )N)r_  r`  r   ra  rb  rN  r   rP  r   r   rs  )r+   r=   r`  r   r   ra  rb  rN  r   rP  rt  r   r   r   s                 r/   r?   z)Pix2StructTextLayerCrossAttention.forwardI  sv      $}=>> -'+%/) * 

 %t||4DQ4G'HH/$4QR$88r0   rL   )NNNFNFNru  rE   s   @r/   rw  rw  B  s/    7#* 7 r0   rw  c                   H     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 	 	 	 ddZ xZS )Pix2StructTextBlockNr3  c                     t         |           t        |||      | _        t	        ||      | _        t        |      | _        y )Nrp  )r3  )r$   r%   rn  self_attentionrw  encoder_decoder_attentionr+  r   r9  s       r/   r%   zPix2StructTextBlock.__init__g  sH    >(C
 *K*
&
 )0r0   c           
         | j                  ||||||	|      }|d   }|dd  }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }|d u}|r| j                  ||||||d   dz   ||	      }|d   }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }||dd  z   }| j                  |      }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }|f}||z   S )N)r   r   ra  rb  r   rP  r   r   i  )r   r   r3   )r`  r   r   ra  rN  rb  r   )r}  r:   r'   r;   isinfanyr   r   clampr~  r   )r+   r=   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasra  rb  r   r   rP  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                     r/   r?   zPix2StructTextBlock.forwardw  s    "&!4!4)'+/) "5 "
 /q12126 %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM2$>&*&D&D!65; /+B/!3#"3 'E 	'# 4A6M ""emm3M8R8V8V8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O / %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM "***r0   rh  )
NNNNNNFFTNru  rE   s   @r/   r{  r{  f  s9    1SSWZ 1& "#&*?+r0   r{  z3
    The standalone text decoder of Pix2Struct
    )custom_introc                   j    e Zd ZU eed<   dZdgZddiZdZ fdZ	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 d!d
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  ded	z  ded	z  ded	z  ded	z  dej                  d	z  ded	z  dej                  d	z  deej                  df   ez  fd       Z	 d"deej*                  df   dej*                  dej*                  dedef
dZedej*                  dededej2                  dej*                  defd        Z xZS )#r   rI   )r   r{  zlm_head.weightzembed_tokens.weightTc                 V   t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        |t        |dk(        |       c}      | _        t        |j
                  |j                        | _        t        j                   |j"                        | _        t        j&                  |j
                  |j                  d      | _        | j+                          d| _        y c c}w )Nr   rp  r   Frf   )r$   r%   r   rP   
vocab_sizer,   embed_tokensr   r   
num_layersr{  r   r   r"   r.  final_layer_normrT   rU   rV   rM   r   r  rr   )r+   rI   r   r.   s      r/   r%   zPix2StructTextModel.__init__  s     LL):):F<N<NO]] v001 $FQRSV`ab

 !4F4F4FFLeLe fzz&"5"56yy!3!3V5F5FUS 	&+#s   &!D&c                     || _         y rL   )r  r+   new_embeddingss     r/   set_input_embeddingsz(Pix2StructTextModel.set_input_embeddings  s
    *r0   Nr   r   r  r  inputs_embedsra  rb  r   r   labelsr   rP  rJ   .c                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }| j
                  r%| j                  r|rt        j                  d       d}||t        d      |&|j                         }|j                  d|d         }n!||j                         dd }nt        d      |$| j                  J d       | j                  |      }|\  }}|rc|a| j                   j                  r5t        t        | j                         t        | j                               }nt        | j                         }d	}||d	   }n||j!                         }|%t#        j$                  |||z   |j&                  
      }|9||j!                         |z   n|}t#        j(                  |||j&                  
      }| j                   j*                  r2| j-                  |||t/        |t              r|j0                  n||      }nX|ddddddf   }|j3                  |j4                        }d|z
  t#        j6                  |j4                        j8                  z  }|M|j                         \  }}}||f}|!t#        j(                  ||j&                  
      }| j;                  |      }nd}|	rdnd}|rdnd}|rdnd}d}d}| j=                  |      }t?        | j@                        D ]L  \  }} |	r||fz   } | ||||||||||
      }!|!d	   }|!d   }|	|!|rdnd   }|s8||!d   fz   }|D||!d   fz   }N | jC                  |      }| j=                  |      }| jE                  |      }"|	r||fz   }d}#|
|
j3                  |"j&                        }
tG        jH                  dd      }$ |$|"jK                         j                  d|"j                  d            |
jK                         j                  d            }#|stM        d |#|"||||fD              S tO        |#|"||||      S )aU  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
            Training](./t5#training).

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
        NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer3   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddings)rI   r   r}   )r:   r   r   )ra  rb  r   rP  r   r   r2      r  r8   )ignore_index	reductionc              3   $   K   | ]  }|| 
 y wrL   r   r   s     r/   r   z.Pix2StructTextModel.forward.<locals>.<genexpr>  s       = s   )losslogitsra  r=   r   cross_attentions)(rI   rb  r   r   r  rr   r   r7  warningr  sizerw   r  is_encoder_decoderr
   r	   get_seq_lengthr'   rK  r}   r(   
is_decoder_update_causal_maskr   r[  r5   r:   r   r   invert_attention_maskrV   r   r   r  r   r   CrossEntropyLossrv   r   r   )%r+   r   r   r  r  r  ra  rb  r   r   r  r   rP  r  input_shaperz   r   past_key_values_lengthmask_seq_lengthrg  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   all_attentionsall_cross_attentionsr   r  r=   r   r   r   r  r  loss_fcts%                                        r/   r?   zPix2StructTextModel.forward  s   V "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]&&4==YNNl I ]%>stt"#..*K!r;r?;I&',,.s3Kdee $$0p2pp0 --i8M!,
J0{{--"5 4l$++6V# #/dkk"B!"%%3A%6"(%4%C%C%E"!"\\&(>(KTaThThN ! BQA\..0:=bl  #ZZ
OML`L`aN;;!!22o/BC  44$!K )D$)9:K%..}/B/B.CK,M<O<O0P0T0TTK !,=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+"6BD0d&7rd(,%]3(4 	VOA|#$58H$H!(%/- /#"3-M *!,M
 *!,M$00=CTaZ[0\- !/=3C2E!E(4+?=QRCSBU+U(;	V> --m<]3m,   1]4D DYYv}}-F**OHF--/44RRI6K\K\K^KcKcdfKghD  #%"(   1++%1
 	
r0   r   input_tensorc           	         t        | j                        r||dk(  j                         r|S y | j                  j                  dk(  r't	        |t
        j                        rt        |      }|S ||j                         nd}||j                  nd}| j                  j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t	        |t
        j                        r|j                  d   n||	z   dz   }
| j!                  ||	|
|||j                  d   	      }| j                  j                  dk(  rQ|O|j"                  j$                  d
v r7|s5t        j&                  |      j(                  }t        j*                  ||      }|S )Nr   flex_attentionr   Fsdpa)r  r  is_trainingr   r3   )sequence_lengthtarget_lengthr:   rP  rz   )cudaxpunpu)r   rI   r  _attn_implementationr   r'   ra   r    r  is_compileabler   _ignore_causal_mask_sdpar   r:   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr}   typer   r   _unmask_unattended)r+   r   r  rP  ra  r   past_seen_tokensusing_compilable_cacher:   r  r  rg  	min_dtypes                r/   r  z'Pix2StructTextModel._update_causal_mask  s    (4)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr0   r  r  r:   rz   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuer:   r}   r   )diagonalr  r3   r   )r~   r'   r   r   fullr}   triurK  reshapeexpandr  r   r5   r   )r   r  r  r:   rP  rz   r  rg  r  mask_lengthpadding_masks              r/   r  zIPix2StructTextModel._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r0   )NNNNNNNNNNNN)F)rA   rB   rC   r   r
  r  r%  _tied_weights_keysr$  r%   r  r   r'   
LongTensorFloatTensorr   r   r   r   r?   r   ra   r  rl  rk  r:   r  rD   rE   s   @r/   r   r     s&    !  ./*,AB&*#,&+  .237:>;?15(,!%)-,0*.#'26G
##d*G
 ))D0G
  %0047	G

 !& 1 1D 8G
 ''$.G
 G
 $;G
  $;G
 #TkG
   4'G
 D[G
 ((4/G
 
u  #%	&)J	JG
 G
` #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r0   r   zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c                       e Zd ZU eed<   dZdef fdZd Zd Zde	j                  fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d	z  d
ej                  d	z  dej                   d	z  dej"                  d	z  deeej                        d	z  ded	z  dej                   d	z  dej(                  d	z  ded	z  ded	z  ded	z  ded	z  dej                   d	z  deej                     ez  fd       Z xZS )"Pix2StructForConditionalGenerationrI   rX   c                     t         |   |       t        |j                        | _        t        |j                        | _        |j                  | _        | j                          y rL   )
r$   r%   r  vision_configr  r   r   decoderis_vqar  rW   s     r/   r%   z+Pix2StructForConditionalGeneration.__init__,  sK     ,V-A-AB*6+=+=>mm 	r0   c                 6    | j                   j                         S rL   )r  r  r  s    r/   r  z7Pix2StructForConditionalGeneration.get_input_embeddings7  s    ||0022r0   c                 :    | j                   j                  |       y rL   )r  r  r  s     r/   r  z7Pix2StructForConditionalGeneration.set_input_embeddings:  s    )).9r0   rJ   c                 6    | j                   j                         S rL   )r  get_output_embeddingsr  s    r/   r  z8Pix2StructForConditionalGeneration.get_output_embeddings=  s    ||1133r0   c                 :    | j                   j                  |       y rL   )r  set_output_embeddingsr  s     r/   r  z8Pix2StructForConditionalGeneration.set_output_embeddings@  s    **>:r0   Nr   r   r   r   ra  r  decoder_inputs_embedsrb  r   r   r   rP  c                    |	|	n| j                   j                  j                  }	||n| j                   j                  }|| j	                  |||
||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|U|S|Q| j                  |      }||n2|j                  | j                   j                        j                         }d|dddf<   | j                  |||||||	|
||||      }|s||z   S t        |j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  |j"                  |j$                  	      S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```

        Training:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```N)rX   r   r   r   r   r   r   r2   r   )r   r   r  ra  r  r  rb  r   r   r  r   rP  )	r  r  ra  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)rI   r   rb  r  r  r   r   lenr	  ner  r  r  r   r  r  ra  r=   r   r  r   )r+   rX   r   r   r   r   ra  r  r  rb  r   r   r   rP  r  r=   decoder_outputss                    r/   r?   z*Pix2StructForConditionalGeneration.forwardC  s   P "+!6IDKK<S<S<]<]	%0%<k$++B]B] ""ll"3-"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 *5 '&))$++*B*BCIIK # ,-"1a4( ,,'1/+"/#1/!5#) ' 
 "_44 %%"))+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r0   )NNNNNNNNNNNNN)rA   rB   rC   r   r
  r#  r%   r  r  r   Moduler  r  r   r'   r  r  
BoolTensorr   r   ra   r   r   r?   rD   rE   s   @r/   r  r  #  s    )O	/ 	3:4ryy 4;  7;3759:>BF(,*.59!%)-,0#'26d
 ,,t3d
 ))D0d
 !++d2	d

 !& 0 04 7d
 uU%6%6784?d
 d
   4'd
  %||d2d
 $;d
  $;d
 #Tkd
 D[d
 ((4/d
  
u  	!$6	6!d
 d
r0   r  )r   r  r  r   )Ir`   r>  typingr   r'   r    r   r   activationsr   cache_utilsr   r	   r
   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   utils.genericr   configuration_pix2structr   r   r   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr    
get_loggerrA   r7  r  r"   apex.normalizationrF   infoImportError	Exceptionr  rH   rc   r   r   r   r   r  r   r+  r   rn  rw  r{  r   r  __all__r   r0   r/   <module>r     s        & ! C C ) > 9  .  : d d  !;J 
		H	%+")) +2
a/&
KKij! !HW		 Wv")) :&6 &R&
bii &
R q! q! q!h _
5 _
 _
Fryy :BII  Ebii ERryy F!		 !HP+4 P+f 
b3 b
bJ 
@
)BO @

@
Fa,  	 a
NN_`as   .G. .H6HH